# Binary classification of student exam performance using a K-nearest-neighbors classifier

In [56]:
## Import libraries

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

## Getting data 

In [31]:
# Load the two pre-encoded versions of the data set:
#   df1 <- 'OnehotAll.csv'  (every categorical feature one-hot encoded)
#   df2 <- 'OnehotOrd.csv'  (parental education as a single ordinal column)
# NOTE: both files carry a leftover index column that pandas names
# "Unnamed: 0"; it is never selected as a feature further down.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('OnehotAll.csv', 'OnehotOrd.csv'))

In [32]:
# Preview the first rows of df1, the variant where every feature is one-hot encoded.
df1.head()

Unnamed: 0,parent_associate's degree,parent_bachelor's degree,parent_high school,parent_master's degree,parent_some college,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,0,1,0,0,0,0,1,0,0,1,1,0,1
1,0,0,0,0,1,0,0,1,0,1,1,1,1
2,0,0,0,1,0,0,1,0,0,1,1,0,1
3,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,1,0,1


In [33]:
# Preview the first rows of df2, the variant with parental education as one ordinal column.
df2.head()

Unnamed: 0,parent edu,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,4,0,1,0,0,1,1,0,1
1,2,0,0,1,0,1,1,1,1
2,5,0,1,0,0,1,1,0,1
3,3,1,0,0,0,0,0,0,0
4,2,0,0,1,0,0,1,0,1


## Model training and evaluation

### 1 - No parameter tuning

### 1.1 - All one-hot encoded

In [34]:
# Feature columns of the fully one-hot encoded frame, grouped by origin.
features = [
    # parental education (one indicator per level)
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    # race / ethnicity group indicators
    "race A",
    "race B",
    "race C",
    "race D",
    # remaining binary attributes
    "gender",
    "standard lunch",
    "completed course",
]
X_1 = df1[features]

# Binary target: 1 if the student scored above average, else 0.
y_1 = df1['above avg score']

In [41]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1,
    y_1,
    test_size=0.3,
    random_state=34,
)

In [59]:
# Cross-validation splitters for section 1.1: plain K-Fold and its stratified
# counterpart, both with 10 shuffled folds and the same seed.
kf_1 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf_1 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [37]:
# K-nearest-neighbours classifier left at the library's default hyperparameters
# (section 1 performs no tuning).
clf_1 = KNeighborsClassifier()

In [42]:
# Hold-out evaluation: fit on the training part, predict the held-out test part,
# then print the per-class report and the confusion matrix.
y_1_pred = clf_1.fit(X_1_train, y_1_train).predict(X_1_test)

print('report:\n ', metrics.classification_report(y_true=y_1_test, y_pred=y_1_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y_1_test, y_pred=y_1_pred))

report:
                precision    recall  f1-score   support

           0       0.50      0.62      0.55       130
           1       0.64      0.52      0.58       170

    accuracy                           0.57       300
   macro avg       0.57      0.57      0.57       300
weighted avg       0.58      0.57      0.57       300

confusion matrix 
 [[81 49]
 [81 89]]


In [60]:
# Cross-validated evaluation on the full feature matrix.
# Every metric is computed once per fold (macro-averaged over the two classes
# where applicable) and the per-fold values are then averaged, giving one
# figure for the setup as a whole.
# The splitters come from the cell that defines kf_1 / skf_1; per the author's
# note, that cell was edited by hand to try both 5 and 10 folds.
scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result) in display order.
summary_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv_11 = cross_validate(clf_1, X_1, y_1, cv=kf_1, scoring=scoring)
print('K-Fold:')
for label, key in summary_rows:
    print(label, cv_11[key].mean())

cv_12 = cross_validate(clf_1, X_1, y_1, cv=skf_1, scoring=scoring)
print('\nStratified K-Fold:')
for label, key in summary_rows:
    print(label, cv_12[key].mean())

K-Fold:
Avg accuracy:  0.595
Avg macroavg-f1-score:  0.5925506869777641
Avg precision_macroavg score:  0.5939821734920671
Avg recall_macroavg score:  0.5933896418769297

Stratified K-Fold:
Avg accuracy:  0.6129999999999999
Avg macroavg-f1-score:  0.61133309545854
Avg precision_macroavg score:  0.6126758576372613
Avg recall_macroavg score:  0.6121933511924981


### 1.2 - Parental education ordinal encoded, all other features one-hot encoded

In [43]:
# Min-max scale the ordinal "parent edu" code to the 0-1 range
# (presumably so it is comparable with the 0/1 indicator columns when the
# classifier computes distances).
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split and outside the cross-validation folds.
scaler = MinMaxScaler()
parent_edu_scaled = scaler.fit_transform(
    df2["parent edu"].to_numpy().reshape(-1, 1)
).ravel()  # back to 1-D so it is assigned as an ordinary column

# df3 is df2 with "parent edu" replaced by its scaled version (appended as the
# last column). df2 itself is never modified, so there is no temporary column
# to remove again and re-running this cell always starts from the same df2.
df3 = df2.drop(columns="parent edu").assign(**{"parent edu sc": parent_edu_scaled})

In [44]:
# Feature columns of the scaled frame df3: the scaled ordinal education code
# plus the binary indicator columns.
feature = [
    "parent edu sc",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X_2 = df3[feature]

# Binary target: 1 if the student scored above average, else 0.
y_2 = df3['above avg score']

In [51]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2,
    y_2,
    test_size=0.3,
    random_state=34,
)

In [62]:
# Cross-validation splitters for section 1.2: plain K-Fold and its stratified
# counterpart, both with 10 shuffled folds and the same seed.
kf_2 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf_2 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [53]:
# K-nearest-neighbours classifier left at the library's default hyperparameters
# (section 1 performs no tuning).
clf_2 = KNeighborsClassifier()

In [54]:
# Hold-out evaluation: fit on the training part, predict the held-out test part,
# then print the per-class report and the confusion matrix.
y_2_pred = clf_2.fit(X_2_train, y_2_train).predict(X_2_test)

print('report:\n ', metrics.classification_report(y_true=y_2_test, y_pred=y_2_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y_2_test, y_pred=y_2_pred))

report:
                precision    recall  f1-score   support

           0       0.54      0.58      0.56       130
           1       0.66      0.61      0.63       170

    accuracy                           0.60       300
   macro avg       0.60      0.60      0.60       300
weighted avg       0.60      0.60      0.60       300

confusion matrix 
 [[ 76  54]
 [ 66 104]]


In [63]:
# Cross-validated evaluation on the full feature matrix.
# Every metric is computed once per fold (macro-averaged over the two classes
# where applicable) and the per-fold values are then averaged, giving one
# figure for the setup as a whole.
# The splitters come from the cell that defines kf_2 / skf_2; per the author's
# note, that cell was edited by hand to try both 5 and 10 folds.
scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result) in display order.
summary_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv_21 = cross_validate(clf_2, X_2, y_2, cv=kf_2, scoring=scoring)
print('K-Fold:')
for label, key in summary_rows:
    print(label, cv_21[key].mean())

cv_22 = cross_validate(clf_2, X_2, y_2, cv=skf_2, scoring=scoring)
print('\nStratified K-Fold:')
for label, key in summary_rows:
    print(label, cv_22[key].mean())

K-Fold:
Avg accuracy:  0.608
Avg macroavg-f1-score:  0.6037882853136475
Avg precision_macroavg score:  0.6091754552650603
Avg recall_macroavg score:  0.6057903009357926

Stratified K-Fold:
Avg accuracy:  0.609
Avg macroavg-f1-score:  0.6059635949828476
Avg precision_macroavg score:  0.6088064333288224
Avg recall_macroavg score:  0.6072653539923211


### 2 - Manual parameter tuning

### 2.1 - All one-hot encoded

In [64]:
# Feature columns of the fully one-hot encoded frame, grouped by origin.
fcols1 = [
    # parental education (one indicator per level)
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    # race / ethnicity group indicators
    "race A",
    "race B",
    "race C",
    "race D",
    # remaining binary attributes
    "gender",
    "standard lunch",
    "completed course",
]
X1 = df1[fcols1]

# Binary target: 1 if the student scored above average, else 0.
y1 = df1['above avg score']

In [219]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=34,
)

In [230]:
# Cross-validation splitters for section 2.1: plain K-Fold and its stratified
# counterpart, both with 10 shuffled folds and the same seed.
kf1 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf1 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [221]:
# Instantiate the manually tuned classifier: 8 neighbours, uniform vote weights.
clf1 = KNeighborsClassifier(
    n_neighbors=8,
    weights='uniform',
)

In [222]:
# Hold-out evaluation: fit on the training part, predict the held-out test part,
# then print the per-class report and the confusion matrix.
y1_pred = clf1.fit(X1_train, y1_train).predict(X1_test)

print('report:\n ', metrics.classification_report(y_true=y1_test, y_pred=y1_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y1_test, y_pred=y1_pred))

report:
                precision    recall  f1-score   support

           0       0.52      0.65      0.57       130
           1       0.66      0.54      0.59       170

    accuracy                           0.58       300
   macro avg       0.59      0.59      0.58       300
weighted avg       0.60      0.58      0.58       300

confusion matrix 
 [[84 46]
 [79 91]]


In [231]:
# Cross-validated evaluation on the full feature matrix.
# Every metric is computed once per fold (macro-averaged over the two classes
# where applicable) and the per-fold values are then averaged, giving one
# figure for the setup as a whole.
# The splitters come from the cell that defines kf1 / skf1; per the author's
# note, that cell was edited by hand to try both 5 and 10 folds.
scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result) in display order.
summary_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv11 = cross_validate(clf1, X1, y1, cv=kf1, scoring=scoring)
print('K-Fold:')
for label, key in summary_rows:
    print(label, cv11[key].mean())

cv12 = cross_validate(clf1, X1, y1, cv=skf1, scoring=scoring)
print('\nStratified K-Fold:')
for label, key in summary_rows:
    print(label, cv12[key].mean())

K-Fold:
Avg accuracy:  0.588
Avg macroavg-f1-score:  0.5846299043570052
Avg precision_macroavg score:  0.5959782099674574
Avg recall_macroavg score:  0.5930815871547415

Stratified K-Fold:
Avg accuracy:  0.604
Avg macroavg-f1-score:  0.6003439077284117
Avg precision_macroavg score:  0.6164921399178913
Avg recall_macroavg score:  0.6100053140022028


### 2.2 - Parent edu Ordinal encoded

In [24]:
# Features and target are taken from df3, the frame that holds the min-max
# scaled parental-education column.
fcols2 = [
    "parent edu sc",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X2 = df3[fcols2]
y2 = df3['above avg score']

In [227]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=34,
)

In [233]:
# Cross-validation splitters for section 2.2: plain K-Fold and its stratified
# counterpart, both with 10 shuffled folds and the same seed.
kf2 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf2 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [223]:
# Instantiate the manually tuned classifier: 8 neighbours, uniform vote weights.
clf2 = KNeighborsClassifier(
    n_neighbors=8,
    weights='uniform',
)

In [228]:
# Hold-out evaluation: fit on the training part, predict the held-out test part,
# then print the per-class report and the confusion matrix.
y2_pred = clf2.fit(X2_train, y2_train).predict(X2_test)

print('report:\n ', metrics.classification_report(y_true=y2_test, y_pred=y2_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y2_test, y_pred=y2_pred))

report:
                precision    recall  f1-score   support

           0       0.53      0.68      0.60       130
           1       0.69      0.53      0.60       170

    accuracy                           0.60       300
   macro avg       0.61      0.61      0.60       300
weighted avg       0.62      0.60      0.60       300

confusion matrix 
 [[89 41]
 [80 90]]


In [234]:
# Cross-validated evaluation on the full feature matrix.
# Every metric is computed once per fold (macro-averaged over the two classes
# where applicable) and the per-fold values are then averaged, giving one
# figure for the setup as a whole.
# The splitters come from the cell that defines kf2 / skf2; per the author's
# note, that cell was edited by hand to try both 5 and 10 folds.
scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result) in display order.
summary_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv21 = cross_validate(clf2, X2, y2, cv=kf2, scoring=scoring)
print('K-Fold:')
for label, key in summary_rows:
    print(label, cv21[key].mean())

cv22 = cross_validate(clf2, X2, y2, cv=skf2, scoring=scoring)
print('\nStratified K-Fold:')
for label, key in summary_rows:
    print(label, cv22[key].mean())

K-Fold:
Avg accuracy:  0.621
Avg macroavg-f1-score:  0.6177140151657192
Avg precision_macroavg score:  0.6268235597676555
Avg recall_macroavg score:  0.6238301965346734

Stratified K-Fold:
Avg accuracy:  0.617
Avg macroavg-f1-score:  0.6152242244199504
Avg precision_macroavg score:  0.6238201601444636
Avg recall_macroavg score:  0.6208821951332488
