# Student exam performance: binary classification using a decision tree classifier

In [1]:
# Import libraries; not all are necessarily used in the final version

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, cross_validate
from sklearn import metrics
from sklearn import tree



## Getting the data

In [2]:
# Load the two pre-encoded variants of the data set from the working directory.
# df1: every categorical feature is one-hot encoded.
# df2: parental education is one ordinal column, the rest is one-hot encoded.
# NOTE(review): the previews below show an 'Unnamed: 0' column in both frames,
# presumably a row index written out with the CSV; it is never used as a feature.
df1 = pd.read_csv('OnehotAll.csv')
df2 = pd.read_csv('OnehotOrd.csv')

In [3]:
df1.head() # Preview of df1: parental education and race are spread over one-hot columns

Unnamed: 0,parent_associate's degree,parent_bachelor's degree,parent_high school,parent_master's degree,parent_some college,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,0,1,0,0,0,0,1,0,0,1,1,0,1
1,0,0,0,0,1,0,0,1,0,1,1,1,1
2,0,0,0,1,0,0,1,0,0,1,1,0,1
3,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,1,0,1


In [4]:
df2.head() # Preview of df2: parental education is a single ordinal column ('parent edu')

Unnamed: 0,parent edu,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,4,0,1,0,0,1,1,0,1
1,2,0,0,1,0,1,1,1,1
2,5,0,1,0,0,1,1,0,1
3,3,1,0,0,0,0,0,0,0
4,2,0,0,1,0,0,1,0,1


## Model training and evaluation

Note that random_state values are kept consistent throughout the notebook to ensure reproducibility of results. For the train/test splits and K-fold CV, the configurations shown in this final version of the notebook are simply the last ones used; they were varied during experimentation.

### 1 - No parameter tuning

### 1.1 - All one-hot encoded

In [5]:
# Feature columns of the fully one-hot encoded frame (df1).
features = [
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]

# Feature matrix and binary target label.
X_1 = df1[features]
y_1 = df1['above avg score']

In [6]:
# Hold out 30% of the rows for testing, with a fixed seed for the shuffle.
# (80/20 and 90/10 splits were also used while experimenting.)
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1,
    y_1,
    test_size=0.3,
    random_state=34,
)

In [37]:
# Cross-validation splitters passed to cross_validate further down: plain
# K-Fold and Stratified K-Fold, both shuffled with the same fixed seed.
# 10 folds are configured here; 5 folds were also used while experimenting.
kf_1 = KFold(n_splits=10, shuffle=True, random_state=13)
skf_1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [8]:
# Instantiate the classifier. Only random_state is set (no hyperparameters are
# tuned in this section); the fixed seed keeps results reproducible.
clf_1 = DecisionTreeClassifier(random_state = 43)

In [9]:
# Fit the classifier on the training split and predict the held-out test split.
clf_1.fit(X_1_train, y_1_train)
y_1_pred = clf_1.predict(X_1_test)

# Print each heading on its own line via sep='\n'. Previously the heading
# string and print's default separator put stray spaces in front of the first
# line of the report / matrix, so it was misaligned with the lines below it.
print('report:', metrics.classification_report(y_1_test, y_1_pred), sep='\n')
print('confusion matrix', metrics.confusion_matrix(y_1_test, y_1_pred), sep='\n')

report:
                precision    recall  f1-score   support

           0       0.50      0.66      0.57       130
           1       0.66      0.50      0.57       170

    accuracy                           0.57       300
   macro avg       0.58      0.58      0.57       300
weighted avg       0.59      0.57      0.57       300

confusion matrix 
 [[86 44]
 [85 85]]


In [38]:
# Cross-validated evaluation on the full data set. For every scorer the mean
# over all folds is reported, so each figure describes the setup as a whole
# rather than a single fitted model. Macro-averaged scores are used, as for
# the other metrics. The splitters come from the cell above, where both
# 5 and 10 folds were tried.
cv_scorers = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result), in display order.
cv_report_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv_11 = cross_validate(clf_1, X_1, y_1, cv=kf_1, scoring=cv_scorers)
print('K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv_11[score_key].mean())

cv_12 = cross_validate(clf_1, X_1, y_1, cv=skf_1, scoring=cv_scorers)
print('\nStratified K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv_12[score_key].mean())

K-Fold:
Avg accuracy:  0.5820000000000001
Avg macroavg-f1-score:  0.5805284441507442
Avg precision_macroavg score:  0.5849139816399406
Avg recall_macroavg score:  0.5843623947627199

Stratified K-Fold:
Avg accuracy:  0.579
Avg macroavg-f1-score:  0.5778129758632377
Avg precision_macroavg score:  0.5813909081831675
Avg recall_macroavg score:  0.5804454690732792


### 1.2 - All one-hot encoded except parental education, which is ordinal encoded

In [11]:
# Feature columns of df2, where parental education is one ordinal column.
feature = [
    "parent edu",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]

# Feature matrix and binary target label.
X_2 = df2[feature]
y_2 = df2['above avg score']

In [12]:
# Hold out 30% of the rows for testing, with a fixed seed for the shuffle.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2,
    y_2,
    test_size=0.3,
    random_state=34,
)

In [40]:
# Cross-validation splitters passed to cross_validate further down: plain
# K-Fold and Stratified K-Fold, both with 10 shuffled folds and the same seed.
kf_2 = KFold(n_splits=10, shuffle=True, random_state=13)
skf_2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [14]:
# Instantiate the classifier. Only random_state is set (no hyperparameters are
# tuned in this section); the fixed seed keeps results reproducible.
clf_2 = DecisionTreeClassifier(random_state = 43)

In [15]:
# Fit the classifier on the training split and predict the held-out test split.
clf_2.fit(X_2_train, y_2_train)
y_2_pred = clf_2.predict(X_2_test)

# Print each heading on its own line via sep='\n'. Previously the heading
# string and print's default separator put stray spaces in front of the first
# line of the report / matrix, so it was misaligned with the lines below it.
print('report:', metrics.classification_report(y_2_test, y_2_pred), sep='\n')
print('confusion matrix', metrics.confusion_matrix(y_2_test, y_2_pred), sep='\n')

report:
                precision    recall  f1-score   support

           0       0.50      0.67      0.57       130
           1       0.66      0.48      0.56       170

    accuracy                           0.56       300
   macro avg       0.58      0.58      0.56       300
weighted avg       0.59      0.56      0.56       300

confusion matrix 
 [[87 43]
 [88 82]]


In [41]:
# Cross-validated evaluation on the full data set. For every scorer the mean
# over all folds is reported, so each figure describes the setup as a whole
# rather than a single fitted model. Macro-averaged scores are used, as for
# the other metrics. The splitters come from the cell above, where both
# 5 and 10 folds were tried.
cv_scorers = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result), in display order.
cv_report_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv_21 = cross_validate(clf_2, X_2, y_2, cv=kf_2, scoring=cv_scorers)
print('K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv_21[score_key].mean())

cv_22 = cross_validate(clf_2, X_2, y_2, cv=skf_2, scoring=cv_scorers)
print('\nStratified K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv_22[score_key].mean())

K-Fold:
Avg accuracy:  0.577
Avg macroavg-f1-score:  0.5756938178185627
Avg precision_macroavg score:  0.5800949610551488
Avg recall_macroavg score:  0.5796290820464487

Stratified K-Fold:
Avg accuracy:  0.579
Avg macroavg-f1-score:  0.5778666851323463
Avg precision_macroavg score:  0.5815240121561822
Avg recall_macroavg score:  0.5805659026340981


### 2 - Manual parameter tuning

### 2.1 - All are one-hot encoded

In [42]:
# Feature columns of the fully one-hot encoded frame (df1).
fcols1 = [
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]

# Feature matrix and binary target variable.
X1 = df1[fcols1]
y1 = df1['above avg score']

In [138]:
# Hold out 30% of the rows for testing, with a fixed seed for the shuffle.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=34,
)

In [156]:
# Cross-validation splitters passed to cross_validate further down: plain
# K-Fold and Stratified K-Fold, both with 10 shuffled folds and the same seed.
kf1 = KFold(n_splits=10, shuffle=True, random_state=13)
skf1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [148]:
# Instantiate the manually tuned tree: depth is capped at 3 and the number of
# leaf nodes at 6; criterion and splitter are spelled out explicitly.
clf1 = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=3,
    max_leaf_nodes=6,
    random_state=43,
)

In [149]:
# Fit the tuned classifier on the training split and predict the held-out test split.
clf1.fit(X1_train, y1_train)
y1_pred = clf1.predict(X1_test)

# Print each heading on its own line via sep='\n'. Previously the heading
# string and print's default separator put stray spaces in front of the first
# line of the report / matrix, so it was misaligned with the lines below it.
print('report:', metrics.classification_report(y1_test, y1_pred), sep='\n')
print('confusion matrix', metrics.confusion_matrix(y1_test, y1_pred), sep='\n')

# Optional: uncomment to draw the fitted tree.
# tree.plot_tree(clf1)

report:
                precision    recall  f1-score   support

           0       0.58      0.49      0.53       130
           1       0.65      0.73      0.69       170

    accuracy                           0.63       300
   macro avg       0.62      0.61      0.61       300
weighted avg       0.62      0.63      0.62       300

confusion matrix 
 [[ 64  66]
 [ 46 124]]


In [157]:
# Cross-validated evaluation on the full data set. For every scorer the mean
# over all folds is reported, so each figure describes the setup as a whole
# rather than a single fitted model. Macro-averaged scores are used, as for
# the other metrics. The splitters come from the cell above, where both
# 5 and 10 folds were tried.
cv_scorers = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result), in display order.
cv_report_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv11 = cross_validate(clf1, X1, y1, cv=kf1, scoring=cv_scorers)
print('K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv11[score_key].mean())

cv12 = cross_validate(clf1, X1, y1, cv=skf1, scoring=cv_scorers)
print('\nStratified K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv12[score_key].mean())

K-Fold:
Avg accuracy:  0.635
Avg macroavg-f1-score:  0.6070168541188498
Avg precision_macroavg score:  0.6588180554605744
Avg recall_macroavg score:  0.6222194291142402

Stratified K-Fold:
Avg accuracy:  0.629
Avg macroavg-f1-score:  0.6002194301792938
Avg precision_macroavg score:  0.6533168392216788
Avg recall_macroavg score:  0.6171271063006311


### 2.2 - All one-hot encoded except parental education

In [23]:
# Feature columns of df2, where parental education is one ordinal column.
fcols2 = [
    "parent edu",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]

# Feature matrix and binary target variable.
X2 = df2[fcols2]
y2 = df2['above avg score']

In [162]:
# Hold out 30% of the rows for testing, with a fixed seed for the shuffle.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=34,
)

In [165]:
# Cross-validation splitters passed to cross_validate further down: plain
# K-Fold and Stratified K-Fold, both with 10 shuffled folds and the same seed.
kf2 = KFold(n_splits=10, shuffle=True, random_state=13)
skf2 = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [26]:
# Instantiate the manually tuned tree: depth is capped at 3 and the number of
# leaf nodes at 6; criterion and splitter are spelled out explicitly.
clf2 = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=3,
    max_leaf_nodes=6,
    random_state=43,
)

In [163]:
# Fit the tuned classifier on the training split and predict the held-out test split.
clf2.fit(X2_train, y2_train)
y2_pred = clf2.predict(X2_test)

# Print each heading on its own line via sep='\n'. Previously the heading
# string and print's default separator put stray spaces in front of the first
# line of the report / matrix, so it was misaligned with the lines below it.
print('report:', metrics.classification_report(y2_test, y2_pred), sep='\n')
print('confusion matrix', metrics.confusion_matrix(y2_test, y2_pred), sep='\n')

# Optional: uncomment to draw the fitted tree. This refers to clf2, the model
# trained in this cell; the earlier commented-out line pointed at clf1.
# tree.plot_tree(clf2)

report:
                precision    recall  f1-score   support

           0       0.62      0.40      0.49       130
           1       0.64      0.81      0.72       170

    accuracy                           0.63       300
   macro avg       0.63      0.61      0.60       300
weighted avg       0.63      0.63      0.62       300

confusion matrix 
 [[ 52  78]
 [ 32 138]]


In [166]:
# Cross-validated evaluation on the full data set. For every scorer the mean
# over all folds is reported, so each figure describes the setup as a whole
# rather than a single fitted model. Macro-averaged scores are used, as for
# the other metrics. The splitters come from the cell above, where both
# 5 and 10 folds were tried.
cv_scorers = ('accuracy', 'precision_macro', 'recall_macro', 'f1_macro')

# (printed label, key in the cross_validate result), in display order.
cv_report_rows = (
    ('Avg accuracy: ', 'test_accuracy'),
    ('Avg macroavg-f1-score: ', 'test_f1_macro'),
    ('Avg precision_macroavg score: ', 'test_precision_macro'),
    ('Avg recall_macroavg score: ', 'test_recall_macro'),
)

cv21 = cross_validate(clf2, X2, y2, cv=kf2, scoring=cv_scorers)
print('K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv21[score_key].mean())

cv22 = cross_validate(clf2, X2, y2, cv=skf2, scoring=cv_scorers)
print('\nStratified K-Fold:')
for row_label, score_key in cv_report_rows:
    print(row_label, cv22[score_key].mean())

K-Fold:
Avg accuracy:  0.6359999999999999
Avg macroavg-f1-score:  0.6101953272869526
Avg precision_macroavg score:  0.6622679833105682
Avg recall_macroavg score:  0.6235876117898005

Stratified K-Fold:
Avg accuracy:  0.624
Avg macroavg-f1-score:  0.5992952712506692
Avg precision_macroavg score:  0.6435527339939341
Avg recall_macroavg score:  0.6129604396339643
