# KNN Classifier

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
#loading in the datasets
df_training = pd.read_csv('../data/archive/train.csv')
df_testing = pd.read_csv('../data/archive/test.csv')

# PCA - Principal Component Analysis

In [20]:
#Start my importing the necessary library
from sklearn.decomposition import PCA

In [21]:
pca = PCA(n_components = 3)

In [22]:
#Now we have to extract only the independent variables 
x_train_pca = df_training.iloc[:,:-2]
x_test_pca = df_testing.iloc[:,:-2]

In [23]:
#Now let's fit our features to our pca model on both training and test sets
train_components = pca.fit_transform(x_train_pca)
test_components = pca.fit_transform(x_test_pca)

In [24]:
#Let's look at our pca features for training set 
pca_train_df = pd.DataFrame(data = train_components, columns = ['principal component 1', 'principal component 2',
                                                    'principal component 3'] )
pca_train_df.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-5.52028,-0.290278,-1.529928
1,-5.53535,-0.08253,-1.924804
2,-5.474988,0.287387,-2.144641
3,-5.677232,0.897031,-2.01822
4,-5.748749,1.162952,-2.139533


In [25]:
#Now for the test set
pca_test_df = pd.DataFrame(data = test_components, columns = ['principal component 1', 'principal component 2',
                                                    'principal component 3'] )
pca_test_df.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-2.754984,-1.387993,0.1294
1,-4.399115,-1.256751,-0.480589
2,-5.066335,-0.616208,-1.428895
3,-5.186594,-0.900992,-1.470128
4,-5.080981,-1.593048,-1.103905


In [26]:
#Before using these features to train our model, we should turn our categorical 'activity' variable to numerical 
df_training['Activity'] = np.where((df_training['Activity'] == 'LAYING') , 1, df_training['Activity'])
df_training['Activity'] = np.where((df_training['Activity'] == 'STANDING') , 2, df_training['Activity'])
df_training['Activity'] = np.where((df_training['Activity'] == 'SITTING') , 3, df_training['Activity'])
df_training['Activity'] = np.where((df_training['Activity'] == 'WALKING') , 4, df_training['Activity'])
df_training['Activity'] = np.where((df_training['Activity'] == 'WALKING_UPSTAIRS') , 5, df_training['Activity'])
df_training['Activity'] = np.where((df_training['Activity'] == 'WALKING_DOWNSTAIRS') , 6, df_training['Activity'])

In [27]:
#Now if we look at our acitivity column we have changed values to numbers
df_training['Activity'].head()

0    2
1    2
2    2
3    2
4    2
Name: Activity, dtype: object

In [28]:
#Now let's do the same for testing
df_testing['Activity'] = np.where((df_testing['Activity'] == 'LAYING') , 1, df_testing['Activity'])
df_testing['Activity'] = np.where((df_testing['Activity'] == 'STANDING') , 2, df_testing['Activity'])
df_testing['Activity'] = np.where((df_testing['Activity'] == 'SITTING') , 3, df_testing['Activity'])
df_testing['Activity'] = np.where((df_testing['Activity'] == 'WALKING') , 4, df_testing['Activity'])
df_testing['Activity'] = np.where((df_testing['Activity'] == 'WALKING_UPSTAIRS') , 5, df_testing['Activity'])
df_testing['Activity'] = np.where((df_testing['Activity'] == 'WALKING_DOWNSTAIRS') , 6, df_testing['Activity'])

In [29]:
df_testing['Activity'].head()

0    2
1    2
2    2
3    2
4    2
Name: Activity, dtype: object

In [30]:
df_training['Activity'] = pd.to_numeric(df_training['Activity'])
df_testing['Activity'] = pd.to_numeric(df_testing['Activity'])

In [31]:
#Now let's get our x and y features
x_train = pca_train_df.iloc[:]
y_train = df_training.iloc[:,-1]

x_test = pca_test_df.iloc[:]
y_test = df_testing.iloc[:,-1]

In [32]:
#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
roc_auc_best_params = []
f1_best_params = []


#Now that we have our new features, we can use them to train our KNN model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

#Making a grid of values we want our grid search to test to find the best parameters
grid_values ={'n_neighbors': list(range(1,105,4))}

knn = KNeighborsClassifier()

knn_classifier = GridSearchCV(estimator = knn, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

knn_model = knn_classifier.fit(x_train, y_train)


accuracy_best_params.append(knn_model.cv_results_['params'][ np.argmin(knn_model.cv_results_['rank_test_accuracy'])])
roc_auc_best_params.append(knn_model.cv_results_['params'][ np.argmin(knn_model.cv_results_['rank_test_roc_auc_ovr'])])
f1_best_params.append(knn_model.cv_results_['params'][ np.argmin(knn_model.cv_results_['rank_test_f1_micro']) ])

In [33]:
accuracy_best_params

[{'n_neighbors': 69}]

In [34]:
roc_auc_best_params

[{'n_neighbors': 101}]

In [35]:
f1_best_params

[{'n_neighbors': 69}]

In [40]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
accuracy_test_score = []

knn_clf = KNeighborsClassifier(n_neighbors = 69)
model = knn_clf.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy_test_score.append(accuracy_score(y_test, y_pred))

In [41]:
accuracy_test_score

[0.6192738378011537]