# Students exam performance binary classification using Support vector classifier

In [1]:
## Import libraries
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler

## Get data

In [2]:
df1 = pd.read_csv('OnehotAll.csv')
df2 = pd.read_csv('OnehotOrd.csv')

In [3]:
df1.head() #Only one hot encoding

Unnamed: 0,parent_associate's degree,parent_bachelor's degree,parent_high school,parent_master's degree,parent_some college,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,0,1,0,0,0,0,1,0,0,1,1,0,1
1,0,0,0,0,1,0,0,1,0,1,1,1,1
2,0,0,0,1,0,0,1,0,0,1,1,0,1
3,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,1,0,1


In [4]:
df2.head() #Parent edu is ordinal encoded

Unnamed: 0,parent edu,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,4,0,1,0,0,1,1,0,1
1,2,0,0,1,0,1,1,1,1
2,5,0,1,0,0,1,1,0,1
3,3,1,0,0,0,0,0,0,0
4,2,0,0,1,0,0,1,0,1


## Model training and evaluation

## 1 - Default parameters

### 1.1 - One hot encoded data

In [5]:
#Define feature variable
features = ["parent_associate's degree", "parent_bachelor's degree", "parent_high school", "parent_master's degree", "parent_some college", "race A", "race B", "race C", "race D", "gender", "standard lunch", "completed course"]
X_1 = df1[features]

# Target score:
y_1 = df1['above avg score']

In [6]:
# Train test split:
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y_1, test_size = 0.2, random_state = 34)

In [7]:
# Define K-Fold and Stratified K-Fold CV:
kf_1 = KFold(n_splits=5, shuffle=True, random_state=13)
skf_1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

In [8]:
# Get our model:
clf_1 = SVC()

In [9]:
# Training and evaluation with train-test split:
clf_1.fit(X_1_train, y_1_train)
y_1_pred = clf_1.predict(X_1_test)

print('report:\n ', metrics.classification_report(y_1_test, y_1_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_1_test, y_1_pred))

report:
                precision    recall  f1-score   support

           0       0.56      0.57      0.56        87
           1       0.66      0.65      0.65       113

    accuracy                           0.61       200
   macro avg       0.61      0.61      0.61       200
weighted avg       0.62      0.61      0.62       200

confusion matrix 
 [[50 37]
 [40 73]]


In [10]:
# Training and evaluation using Cross-Validation approach:
cv_11 = cross_val_score(clf_1, X_1, y_1, cv=kf_1)
print('Accuracies: ', cv_11)
print('K-Fold CV mean accuracy: ', cv_11.mean())


cv_12 = cross_val_score(clf_1, X_1, y_1, cv=skf_1)
print('Accuracies: ', cv_12)
print('Stratified K-Fold CV mean accuracy: ', cv_12.mean())

#TODO: Need to find a way to find precision-recall and f1 score from cross_val_score.

Accuracies:  [0.62  0.655 0.67  0.58  0.58 ]
K-Fold CV mean accuracy:  0.621
Accuracies:  [0.64  0.65  0.59  0.625 0.615]
Stratified K-Fold CV mean accuracy:  0.624


### 1.2 - Parental education ordinal

In [11]:
# Minmaxscaling parent edu ordinal encoding to values between 0-1:

scaler = MinMaxScaler()
df2["parent edu sc"] = scaler.fit_transform(df2["parent edu"].values.reshape(-1,1))

df3 = df2.drop("parent edu", axis=1)
df2 = df2.drop("parent edu sc", axis=1)

In [12]:
# Define feature columns:
feature = ["parent edu sc", "race A", "race B", "race C", "race D", "gender", "standard lunch", "completed course"]
X_2 = df3[feature]

# Target score:
y_2 = df3['above avg score']

In [13]:
# Train test split:
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2, y_2, test_size = 0.2, random_state = 34)

In [14]:
# Define K-Fold and Stratified K-Fold CV:
kf_2 = KFold(n_splits=5, shuffle=True, random_state=13)
skf_2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

In [15]:
# Get our model:
clf_2 = SVC()

In [16]:
# Training and evaluating model with train-test split:
clf_2.fit(X_2_train, y_2_train)
y_2_pred = clf_2.predict(X_2_test)

print('report:\n ', metrics.classification_report(y_2_test, y_2_pred))
print('confusion matrix \n', metrics.confusion_matrix(y_2_test, y_2_pred))

report:
                precision    recall  f1-score   support

           0       0.57      0.62      0.60        87
           1       0.69      0.65      0.67       113

    accuracy                           0.64       200
   macro avg       0.63      0.63      0.63       200
weighted avg       0.64      0.64      0.64       200

confusion matrix 
 [[54 33]
 [40 73]]


In [17]:
# Training and evaluation using Cross-Validation approach:
cv_21 = cross_val_score(clf_2, X_2, y_2, cv=kf_2)
print('Accuracies: ', cv_21)
print('K-Fold CV mean accuracy: ', cv_21.mean())


cv_22 = cross_val_score(clf_2, X_2, y_2, cv=skf_2)
print('Accuracies: ', cv_22)
print('Stratified K-Fold CV mean accuracy: ', cv_22.mean())


#TODO: Need to find a way to find precision-recall and f1 score from cross_val_score.

Accuracies:  [0.635 0.67  0.685 0.59  0.595]
K-Fold CV mean accuracy:  0.635
Accuracies:  [0.63  0.655 0.62  0.64  0.63 ]
Stratified K-Fold CV mean accuracy:  0.635


## 2 - Manual parameter tuning

### 2.1 - One-hot encoded

In [18]:
#Define feature variable
fcols1 = ["parent_associate's degree", "parent_bachelor's degree", "parent_high school", "parent_master's degree", "parent_some college", "race A", "race B", "race C", "race D", "gender", "standard lunch", "completed course"]
X1 = df1[fcols1]

# Target score:
y1 = df1['above avg score']

In [19]:
# Train test split:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.2, random_state = 34)

In [20]:
# Define K-Fold and Stratified K-Fold CV:
kf1 = KFold(n_splits=5, shuffle=True, random_state=13)
skf1 = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

In [21]:
# Get model
clf1 = SVC(gamma=0.01, C=500, kernel='rbf')

In [22]:
# Training and evaluation with train-test split:
clf1.fit(X1_train, y1_train)
y1_pred = clf1.predict(X1_test)

print('report:\n ', metrics.classification_report(y1_test, y1_pred))
print('confusion matrix \n', metrics.confusion_matrix(y1_test, y1_pred))

report:
                precision    recall  f1-score   support

           0       0.55      0.61      0.58        87
           1       0.67      0.62      0.65       113

    accuracy                           0.61       200
   macro avg       0.61      0.61      0.61       200
weighted avg       0.62      0.61      0.62       200

confusion matrix 
 [[53 34]
 [43 70]]


In [23]:
# Training and evaluation using Cross-Validation approach:
cv11 = cross_val_score(clf1, X1, y1, cv=kf1)
print('Accuracies: ', cv11)
print('K-Fold CV mean accuracy: ', cv11.mean())


cv12 = cross_val_score(clf1, X1, y1, cv=skf1)
print('Accuracies: ', cv12)
print('Stratified K-Fold CV mean accuracy: ', cv12.mean())


#TODO: Need to find a way to find precision-recall and f1 score from cross_val_score.

Accuracies:  [0.64  0.665 0.68  0.585 0.58 ]
K-Fold CV mean accuracy:  0.6300000000000001
Accuracies:  [0.61  0.66  0.625 0.63  0.63 ]
Stratified K-Fold CV mean accuracy:  0.631


### 2.2 - Ordinal encoded parental edu

In [24]:
# use df3 from before, with scaled ordinal encoding

# Define feature columns:
fcols2= ["parent edu sc", "race A", "race B", "race C", "race D", "gender", "standard lunch", "completed course"]
X2 = df3[fcols2]

# Target score:
y2 = df3['above avg score']

In [25]:
# Train test split:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.2, random_state = 34)

In [26]:
# Define K-Fold and Stratified K-Fold CV:
kf2 = KFold(n_splits=5, shuffle=True, random_state=13)
skf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

In [27]:
# Get model
clf2 = SVC(gamma=0.01, C=500, kernel='rbf')

In [28]:
# Training and evaluating model with train-test split:
clf2.fit(X2_train, y2_train)
y2_pred = clf2.predict(X2_test)

print('report:\n ', metrics.classification_report(y2_test, y2_pred))
print('confusion matrix \n', metrics.confusion_matrix(y2_test, y2_pred))

report:
                precision    recall  f1-score   support

           0       0.59      0.59      0.59        87
           1       0.68      0.68      0.68       113

    accuracy                           0.64       200
   macro avg       0.63      0.63      0.63       200
weighted avg       0.64      0.64      0.64       200

confusion matrix 
 [[51 36]
 [36 77]]


In [29]:
# Training and evaluation using Cross-Validation approach:
cv21 = cross_val_score(clf2, X2, y2, cv=kf2)
print('Accuracies: ', cv21)
print('K-Fold CV mean accuracy: ', cv21.mean())


cv22 = cross_val_score(clf2, X2, y2, cv=skf2)
print('Accuracies: ', cv22)
print('Stratified K-Fold CV mean accuracy: ', cv22.mean())


#TODO: Need to find a way to find precision-recall and f1 score from cross_val_score.

Accuracies:  [0.635 0.665 0.665 0.6   0.59 ]
K-Fold CV mean accuracy:  0.631
Accuracies:  [0.625 0.64  0.645 0.66  0.64 ]
Stratified K-Fold CV mean accuracy:  0.6420000000000001
