# Students exam performance binary classification using Support vector classifier

In [53]:
## Import libraries; note that not all are necessarily used in the final version
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler

## Get data

In [31]:
# Load the two pre-encoded variants of the data set (paths are relative to the working directory).
# df1: categorical features one-hot encoded; df2: parental education ordinal encoded instead.
# NOTE(review): both files also contain an 'Unnamed: 0' column (visible in the head() outputs),
# presumably a row index written out when the CSVs were saved - confirm. It is not part of any
# feature list used in this notebook section.
df1 = pd.read_csv('OnehotAll.csv')
df2 = pd.read_csv('OnehotOrd.csv')

In [32]:
df1.head() # Preview of the variant that uses one-hot encoding only

Unnamed: 0,parent_associate's degree,parent_bachelor's degree,parent_high school,parent_master's degree,parent_some college,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,0,1,0,0,0,0,1,0,0,1,1,0,1
1,0,0,0,0,1,0,0,1,0,1,1,1,1
2,0,0,0,1,0,0,1,0,0,1,1,0,1
3,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,1,0,1


In [33]:
df2.head() # Preview of the variant where parental education ('parent edu') is ordinal encoded

Unnamed: 0,parent edu,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,4,0,1,0,0,1,1,0,1
1,2,0,0,1,0,1,1,1,1
2,5,0,1,0,0,1,1,0,1
3,3,1,0,0,0,0,0,0,0
4,2,0,0,1,0,0,1,0,1


## Model training and evaluation

Note that random_state values are kept consistent throughout the notebook to ensure reproducibility of results.
For train/test splits and K-fold CV, the configurations shown in this final version of the notebook are simply the last ones used; they were varied during experimentation.

## 1 - Default parameters

### 1.1 - One hot encoded data

In [34]:
# Input columns of the model: the one-hot encoded parent education and race groups
# plus the binary gender / lunch / course flags.
features = [
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X_1 = df1.loc[:, features]

# Target label of the model, i.e. the column to predict.
y_1 = df1.loc[:, 'above avg score']

In [41]:
# Hold-out split. 80/20, 90/10 and 70/30 variations were tried; 70/30 is the one left in place.
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1,
    y_1,
    test_size=0.3,
    random_state=34,
)

In [56]:
# Plain and stratified K-fold splitters share one configuration.
# Both 5 and 10 folds were tried for each; 10 is the value left in place.
fold_settings = dict(n_splits=10, shuffle=True, random_state=13)
kf_1 = KFold(**fold_settings)
skf_1 = StratifiedKFold(**fold_settings)

In [37]:
# Get our model: a support vector classifier constructed without arguments,
# i.e. the "default parameters" baseline that section 2 compares its manual tuning against.
clf_1 = SVC()

In [42]:
# Fit on the training part of the hold-out split, then predict the held-out test part.
y_1_pred = clf_1.fit(X_1_train, y_1_train).predict(X_1_test)

# Per-class precision/recall/F1 plus the raw confusion matrix for the test predictions.
print('report:\n ', metrics.classification_report(y_true=y_1_test, y_pred=y_1_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y_1_test, y_pred=y_1_pred))

report:
                precision    recall  f1-score   support

           0       0.55      0.63      0.59       130
           1       0.68      0.61      0.65       170

    accuracy                           0.62       300
   macro avg       0.62      0.62      0.62       300
weighted avg       0.63      0.62      0.62       300

confusion matrix 
 [[ 82  48]
 [ 66 104]]


In [57]:
# Cross-validated evaluation with explicitly chosen metrics. Macro-averaged scores are
# requested, in line with the other metrics reported in this notebook.
# Every printed value is the mean over all folds, giving one performance figure for the
# setup as a whole.
# The splitters defined in the cell above were switched between 5 and 10 folds for both
# cross-validators while experimenting.

macro_scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

cv_11 = cross_validate(clf_1, X_1, y_1, cv=kf_1, scoring=macro_scoring)
cv_12 = cross_validate(clf_1, X_1, y_1, cv=skf_1, scoring=macro_scoring)

for cv_heading, cv_scores in (('K-Fold:', cv_11), ('\nStratified K-Fold:', cv_12)):
    print(cv_heading)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.6209999999999999
Avg macroavg-f1-score:  0.6161883508242451
Avg precision_macroavg score:  0.6203126937373554
Avg recall_macroavg score:  0.6168357739515994

Stratified K-Fold:
Avg accuracy:  0.625
Avg macroavg-f1-score:  0.6203517259904681
Avg precision_macroavg score:  0.6251011579881747
Avg recall_macroavg score:  0.6219852365953329


### 1.2 - Parental education ordinal

In [43]:
# Min-max scale the ordinal 'parent edu' codes to values between 0 and 1.
#
# The scaled column is written to a new frame (df3) instead of being added to df2 and
# dropped again afterwards: df2 is never modified, so the cell gives the same result no
# matter how often, or after which interrupted run, it is executed.
# df3 has the same columns as df2 except that 'parent edu' is replaced by a trailing
# 'parent edu sc' column.
#
# NOTE(review): the scaler is fitted on the complete data set, before any train/test or
# CV split. That is only harmless if every split contains the lowest and highest
# education level - confirm, or move the scaling into a Pipeline.

scaler = MinMaxScaler()

# MinMaxScaler expects a 2-D input and returns a 2-D (n, 1) array; flatten it so a plain
# 1-D column is stored.
parent_edu_scaled = scaler.fit_transform(df2["parent edu"].values.reshape(-1, 1)).ravel()

df3 = (
    df2
    .assign(**{"parent edu sc": parent_edu_scaled})
    .drop(columns="parent edu")
)

In [44]:
# Input columns of the model: the scaled ordinal parent education, the one-hot encoded
# race groups and the binary gender / lunch / course flags.
feature = [
    "parent edu sc",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X_2 = df3.loc[:, feature]

# Target label to predict.
y_2 = df3.loc[:, 'above avg score']

In [51]:
# Hold-out split with the same test share and seed as used for the one-hot encoded data.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2,
    y_2,
    test_size=0.3,
    random_state=34,
)

In [59]:
# Plain and stratified K-fold splitters built from one shared configuration.
fold_settings = dict(n_splits=10, shuffle=True, random_state=13)
kf_2 = KFold(**fold_settings)
skf_2 = StratifiedKFold(**fold_settings)

In [47]:
# Get our model: a fresh support vector classifier constructed without arguments
# (default-parameter baseline for the ordinal-encoded data).
clf_2 = SVC()

In [52]:
# Fit on the training part of the hold-out split, then predict the held-out test part.
y_2_pred = clf_2.fit(X_2_train, y_2_train).predict(X_2_test)

# Per-class precision/recall/F1 plus the raw confusion matrix for the test predictions.
print('report:\n ', metrics.classification_report(y_true=y_2_test, y_pred=y_2_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y_2_test, y_pred=y_2_pred))

report:
                precision    recall  f1-score   support

           0       0.57      0.61      0.59       130
           1       0.69      0.65      0.67       170

    accuracy                           0.63       300
   macro avg       0.63      0.63      0.63       300
weighted avg       0.64      0.63      0.63       300

confusion matrix 
 [[ 79  51]
 [ 59 111]]


In [60]:
# Cross-validated evaluation with explicitly chosen metrics. Macro-averaged scores are
# requested, in line with the other metrics reported in this notebook.
# Every printed value is the mean over all folds, giving one performance figure for the
# setup as a whole.
# The splitters defined in the cell above were switched between 5 and 10 folds for both
# cross-validators while experimenting.

macro_scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

cv_21 = cross_validate(clf_2, X_2, y_2, cv=kf_2, scoring=macro_scoring)
cv_22 = cross_validate(clf_2, X_2, y_2, cv=skf_2, scoring=macro_scoring)

for cv_heading, cv_scores in (('K-Fold:', cv_21), ('\nStratified K-Fold:', cv_22)):
    print(cv_heading)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.64
Avg macroavg-f1-score:  0.6334030623743363
Avg precision_macroavg score:  0.6421082444919797
Avg recall_macroavg score:  0.6343521201144245

Stratified K-Fold:
Avg accuracy:  0.646
Avg macroavg-f1-score:  0.638808222199652
Avg precision_macroavg score:  0.6476707190093353
Avg recall_macroavg score:  0.6409273062512224


## 2 - Manual parameter tuning

### 2.1 - One-hot encoded

In [61]:
# Input columns for the manually tuned model on the one-hot encoded data.
fcols1 = [
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X1 = df1.loc[:, fcols1]

# Target label to predict.
y1 = df1.loc[:, 'above avg score']

In [230]:
# Hold-out split: 30 % of the rows are kept for testing, with a fixed seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=34,
)

In [238]:
# Plain and stratified K-fold splitters built from one shared configuration.
fold_settings = dict(n_splits=10, shuffle=True, random_state=13)
kf1 = KFold(**fold_settings)
skf1 = StratifiedKFold(**fold_settings)

In [228]:
# Get model: RBF-kernel SVC with hand-picked C and gamma.
# These are the values left in place after the manual tuning this section is about.
clf1 = SVC(C=0.49, gamma=0.01, kernel='rbf')

In [231]:
# Fit on the training part of the hold-out split, then predict the held-out test part.
y1_pred = clf1.fit(X1_train, y1_train).predict(X1_test)

# Per-class precision/recall/F1 plus the raw confusion matrix for the test predictions.
print('report:\n ', metrics.classification_report(y_true=y1_test, y_pred=y1_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y1_test, y_pred=y1_pred))

report:
                precision    recall  f1-score   support

           0       0.62      0.55      0.59       130
           1       0.68      0.74      0.71       170

    accuracy                           0.66       300
   macro avg       0.65      0.65      0.65       300
weighted avg       0.66      0.66      0.66       300

confusion matrix 
 [[ 72  58]
 [ 44 126]]


In [239]:
# Cross-validated evaluation with explicitly chosen metrics. Macro-averaged scores are
# requested, in line with the other metrics reported in this notebook.
# Every printed value is the mean over all folds, giving one performance figure for the
# setup as a whole.
# The splitters defined in the cell above were switched between 5 and 10 folds for both
# cross-validators while experimenting.

macro_scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

cv11 = cross_validate(clf1, X1, y1, cv=kf1, scoring=macro_scoring)
cv12 = cross_validate(clf1, X1, y1, cv=skf1, scoring=macro_scoring)

for cv_heading, cv_scores in (('K-Fold:', cv11), ('\nStratified K-Fold:', cv12)):
    print(cv_heading)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.6369999999999999
Avg macroavg-f1-score:  0.619616200195957
Avg precision_macroavg score:  0.6476887989000403
Avg recall_macroavg score:  0.6276940139642594

Stratified K-Fold:
Avg accuracy:  0.648
Avg macroavg-f1-score:  0.6311085664694037
Avg precision_macroavg score:  0.6608253407692851
Avg recall_macroavg score:  0.6387922804146209


### 2.2 - Ordinal encoded parental edu

In [24]:
# Reuses df3 from section 1.2 (the frame holding the min-max scaled ordinal encoding),
# so that section's scaling cell must have been run first.

# Input columns for the manually tuned model on the ordinal-encoded data.
fcols2 = [
    "parent edu sc",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X2 = df3.loc[:, fcols2]

# Target label to predict.
y2 = df3.loc[:, 'above avg score']

In [235]:
# Hold-out split: 30 % of the rows are kept for testing, with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=34,
)

In [241]:
# Plain and stratified K-fold splitters built from one shared configuration.
fold_settings = dict(n_splits=10, shuffle=True, random_state=13)
kf2 = KFold(**fold_settings)
skf2 = StratifiedKFold(**fold_settings)

In [219]:
# Get model: RBF-kernel SVC with hand-picked C and gamma.
# The values match the ones given to clf1 in section 2.1.
clf2 = SVC(C=0.49, gamma=0.01, kernel='rbf')

In [236]:
# Fit on the training part of the hold-out split, then predict the held-out test part.
y2_pred = clf2.fit(X2_train, y2_train).predict(X2_test)

# Per-class precision/recall/F1 plus the raw confusion matrix for the test predictions.
print('report:\n ', metrics.classification_report(y_true=y2_test, y_pred=y2_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_true=y2_test, y_pred=y2_pred))

report:
                precision    recall  f1-score   support

           0       0.59      0.53      0.56       130
           1       0.67      0.72      0.69       170

    accuracy                           0.64       300
   macro avg       0.63      0.62      0.62       300
weighted avg       0.63      0.64      0.63       300

confusion matrix 
 [[ 69  61]
 [ 48 122]]


In [242]:
# Cross-validated evaluation with explicitly chosen metrics. Macro-averaged scores are
# requested, in line with the other metrics reported in this notebook.
# Every printed value is the mean over all folds, giving one performance figure for the
# setup as a whole.
# The splitters defined in the cell above were switched between 5 and 10 folds for both
# cross-validators while experimenting.

macro_scoring = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

cv21 = cross_validate(clf2, X2, y2, cv=kf2, scoring=macro_scoring)
cv22 = cross_validate(clf2, X2, y2, cv=skf2, scoring=macro_scoring)

for cv_heading, cv_scores in (('K-Fold:', cv21), ('\nStratified K-Fold:', cv22)):
    print(cv_heading)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.6419999999999999
Avg macroavg-f1-score:  0.6265758405647279
Avg precision_macroavg score:  0.6545490952887029
Avg recall_macroavg score:  0.6326994553381518

Stratified K-Fold:
Avg accuracy:  0.6460000000000001
Avg macroavg-f1-score:  0.6312834808191321
Avg precision_macroavg score:  0.6569676006519808
Avg recall_macroavg score:  0.6373368163851404
