In [4]:
import pandas as pd

# Labels for the five CSV fields: four flower measurements plus the class label.
names = [
    'sepal-length', 'sepal-width',
    'petal-length', 'petal-width',
    'class',
]
# Column names are supplied explicitly through `names`.
dataset = pd.read_csv("iris.data", names=names)
# Print the first five rows as a quick check that the file parsed as expected.
print(dataset.head(n=5))

   sepal-length  sepal-width  petal-length  petal-width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [5]:
# Show the (rows, columns) dimensions of the loaded frame.
dataset.shape

(150, 5)

In [6]:
# Count how many rows carry each value of the `class` column.
dataset.groupby(by='class').size()

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

In [7]:
# Split the Iris data into a training set and a held-out test set.
from sklearn.model_selection import train_test_split

# `dataset.values` is an object-dtype array here because the frame mixes the
# numeric measurement columns with the string `class` column.
array = dataset.values
# Cast the four measurement columns to float so the estimators receive a
# numeric matrix rather than an array of generic Python objects.
X = array[:, 0:4].astype(float)
# The last column holds the class label; it is left as strings.
Y = array[:, 4]
t_size = 0.20  # fraction of the rows held out for testing
seed = 7       # fixed random_state so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=t_size, random_state=seed
)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Fit a k-nearest-neighbours classifier with default settings on the training
# split, then predict labels for the held-out split.
knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions = knn.predict(X_test)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, predictions))

0.9
[[ 7  0  0]
 [ 0 11  1]
 [ 0  2  9]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-versicolor       0.85      0.92      0.88        12
 Iris-virginica       0.90      0.82      0.86        11

       accuracy                           0.90        30
      macro avg       0.92      0.91      0.91        30
   weighted avg       0.90      0.90      0.90        30



In [9]:
# Try every neighbour count from 1 to 9 on the same train/test split and
# print the test accuracy obtained with each one.
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    predictions = knn.predict(X_test)
    print(accuracy_score(Y_test, predictions))

0.9
0.9333333333333333
0.9
0.9333333333333333
0.9
0.8666666666666667
0.8666666666666667
0.9
0.9


In [10]:
# Repeat the default-KNN evaluation over nine different random train/test
# partitions to see how much the accuracy depends on the split itself.
for split_seed in range(1, 10):
    # Loop-local names are used on purpose: reusing X_train / X_test /
    # Y_train / Y_test here would silently replace the split created with
    # `seed`, so re-running an earlier cell would score a different partition.
    X_tr, X_te, Y_tr, Y_te = train_test_split(
        X, Y, test_size=t_size, random_state=split_seed
    )
    knn = KNeighborsClassifier()
    knn.fit(X_tr, Y_tr)
    predictions = knn.predict(X_te)
    print(accuracy_score(Y_te, predictions))



1.0
1.0
0.9666666666666667
0.9666666666666667
0.9333333333333333
0.9666666666666667
0.9
0.9
1.0


In [11]:
# Load the occupancy training file. From here on `dataset` refers to this
# frame rather than the Iris data read earlier.
dataset = pd.read_csv("datatraining.txt")
dataset.head(n=5)

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [12]:
# Features are every column except the target (`Occupancy`) and the timestamp
# (`date`); the target is the `Occupancy` column itself.
# NOTE(review): the displayed head lists an 'Unnamed: 0' field. If that is a
# real column rather than the frame's index, it is kept as a feature - confirm.
X_train = dataset.drop(columns=['Occupancy', 'date'])
Y_train = dataset['Occupancy']

# Fit a default k-nearest-neighbours model; the fitted estimator is the cell's
# last expression so its repr is displayed.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# Evaluate on the held-out file "datatest.txt". X_train / Y_train come from
# the cell that prepared the occupancy training data.
dataset = pd.read_csv("datatest.txt")

# Same feature selection as for training: drop the target and the timestamp.
X_test = dataset.drop(columns=['Occupancy', 'date'])
Y_test = dataset['Occupancy']

# Baseline: classifier with default settings.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_test)
print(accuracy_score(Y_test, predictions))

# Accuracy for each neighbour count from 1 to 9.
# NOTE(review): k is compared on the same file used for the final score, so
# the best value here is tuned to this test data.
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test)
    print(k,accuracy_score(Y_test, predictions))

0.9425891181988743
1 0.9365853658536586
2 0.9230769230769231
3 0.9350844277673546
4 0.9275797373358349
5 0.9425891181988743
6 0.9324577861163227
7 0.9609756097560975
8 0.9549718574108818
9 0.9617260787992495


In [18]:
# Evaluate on the held-out file "datatest2.txt". X_train / Y_train come from
# the cell that prepared the occupancy training data.
dataset = pd.read_csv("datatest2.txt")

# Same feature selection as for training: drop the target and the timestamp.
X_test = dataset.drop(columns=['Occupancy', 'date'])
Y_test = dataset['Occupancy']

# Baseline: classifier with default settings.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_test)
print(accuracy_score(Y_test, predictions))

# Accuracy for each neighbour count from 1 to 9.
# NOTE(review): k is compared on the same file used for the final score, so
# the best value here is tuned to this test data.
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_test)
    print(k,accuracy_score(Y_test, predictions))

0.9621616078753076
1 0.9502666119770303
2 0.9525225594749795
3 0.9579573420836751
4 0.9539581624282198
5 0.9621616078753076
6 0.9628794093519278
7 0.9649302707136997
8 0.9658531583264971
9 0.9656480721903199
