# Students exam performance binary classification using Random forest classifier

In [1]:
## Import libraries, not all are necessarily used in the final version.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate, GridSearchCV
from sklearn import tree

## Get data

In [2]:
# Load the two encodings of the dataset into dataframes:
# df1 <- 'OnehotAll.csv', df2 <- 'OnehotOrd.csv'.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('OnehotAll.csv', 'OnehotOrd.csv'))

In [3]:
# Preview the first rows of df1 (read from 'OnehotAll.csv'); in this variant parental
# education appears as separate 0/1 indicator columns.
# NOTE(review): the 'Unnamed: 0' column looks like a row index saved into the CSV - confirm.
df1.head() # All one hot encoded

Unnamed: 0,parent_associate's degree,parent_bachelor's degree,parent_high school,parent_master's degree,parent_some college,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,0,1,0,0,0,0,1,0,0,1,1,0,1
1,0,0,0,0,1,0,0,1,0,1,1,1,1
2,0,0,0,1,0,0,1,0,0,1,1,0,1
3,1,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,1,0,1


In [4]:
# Preview the first rows of df2 (read from 'OnehotOrd.csv'); in this variant parental
# education is a single integer-coded 'parent edu' column.
# NOTE(review): the 'Unnamed: 0' column looks like a row index saved into the CSV - confirm.
df2.head() #Parent edu ordinal encoded

Unnamed: 0,parent edu,race A,race B,race C,race D,gender,standard lunch,completed course,above avg score
0,4,0,1,0,0,1,1,0,1
1,2,0,0,1,0,1,1,1,1
2,5,0,1,0,0,1,1,0,1
3,3,1,0,0,0,0,0,0,0
4,2,0,0,1,0,0,1,0,1


## Model training and evaluation

Note that random_state values are used consistently throughout the code. This ensures reproducibility of results by reusing the same randomization seeds. For the train/test splits and K-fold CV, the configurations in this final version of the notebook are simply the last ones used; they were varied during experimentation.

## 1 - Default implementation

### 1.1 - One-hot encoded

In [5]:
# Predictor columns of the fully one-hot encoded dataframe (df1):
# parental-education indicators, race indicators and the remaining 0/1 flags.
features = [
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X_1 = df1[features]

# Target label: the score class we want to predict.
y_1 = df1['above avg score']

In [6]:
# Hold-out split with a fixed seed. Currently 70/30; 80/20 and 90/10 were also tested.
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(
    X_1,
    y_1,
    test_size=0.3,
    random_state=34,
)

In [7]:
# Cross-validation splitters used in the CV evaluation below: plain K-Fold and
# Stratified K-Fold, both shuffled with the same seed. 5 and 10 splits were used.
kf_1 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf_1 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [8]:
# Baseline random forest from scikit-learn: only the seed is fixed,
# every other hyperparameter keeps its library default.
clf_1 = RandomForestClassifier(
    random_state=43,
)

In [9]:
# Fit on the training part of the split, then predict the held-out part.
y_1_pred = clf_1.fit(X_1_train, y_1_train).predict(X_1_test)

# Per-class precision/recall/f1 and the confusion matrix on the held-out part.
print(
    'report:\n ',
    metrics.classification_report(y_1_test, y_1_pred),
)
print(
    'confusion matrix \n',
    metrics.confusion_matrix(y_1_test, y_1_pred),
)

report:
                precision    recall  f1-score   support

           0       0.52      0.61      0.56       130
           1       0.66      0.57      0.61       170

    accuracy                           0.59       300
   macro avg       0.59      0.59      0.59       300
weighted avg       0.60      0.59      0.59       300

confusion matrix 
 [[79 51]
 [73 97]]


In [10]:
# Cross-validated evaluation on the full data with accuracy and macro-averaged
# precision / recall / f1. Each metric is averaged over all folds to give one
# figure for the setup as a whole.
# The splitters (kf_1, skf_1) come from the cell above and were tested with 5 and 10 folds.

cv_11 = cross_validate(
    clf_1, X_1, y_1,
    cv=kf_1,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)
cv_12 = cross_validate(
    clf_1, X_1, y_1,
    cv=skf_1,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)

# Same report layout for both cross-validators.
for cv_label, cv_scores in (('K-Fold:', cv_11), ('\nStratified K-Fold:', cv_12)):
    print(cv_label)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.585
Avg macroavg-f1-score:  0.581907520183236
Avg precision_macroavg score:  0.5844662648252062
Avg recall_macroavg score:  0.5830438238322958

Stratified K-Fold:
Avg accuracy:  0.5850000000000001
Avg macroavg-f1-score:  0.5818950120819546
Avg precision_macroavg score:  0.5839873834825363
Avg recall_macroavg score:  0.5832430724968863


### 1.2 - Parent edu ordinal

In [11]:
# Same procedure as in section 1.1, now on df2, where parental education is a
# single ordinal-encoded column instead of several indicator columns.
# Predictor columns:
feature = [
    "parent edu",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X_2 = df2[feature]

# Target label:
y_2 = df2['above avg score']

In [12]:
# Hold-out split (70/30) with a fixed seed.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2,
    y_2,
    test_size=0.3,
    random_state=34,
)

In [13]:
# Cross-validation splitters: shuffled 10-fold K-Fold and Stratified K-Fold, same seed.
kf_2 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf_2 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [14]:
# Baseline random forest: library defaults apart from the fixed seed.
clf_2 = RandomForestClassifier(
    random_state=43,
)

In [15]:
# Fit on the training part of the split, then predict the held-out part.
y_2_pred = clf_2.fit(X_2_train, y_2_train).predict(X_2_test)

# Per-class precision/recall/f1 and the confusion matrix on the held-out part.
print(
    'report:\n ',
    metrics.classification_report(y_2_test, y_2_pred),
)
print(
    'confusion matrix \n',
    metrics.confusion_matrix(y_2_test, y_2_pred),
)

report:
                precision    recall  f1-score   support

           0       0.53      0.61      0.56       130
           1       0.66      0.58      0.62       170

    accuracy                           0.59       300
   macro avg       0.59      0.60      0.59       300
weighted avg       0.60      0.59      0.60       300

confusion matrix 
 [[79 51]
 [71 99]]


In [16]:
# Cross-validated evaluation on the full data with accuracy and macro-averaged
# precision / recall / f1. Each metric is averaged over all folds to give one
# figure for the setup as a whole.
# The splitters (kf_2, skf_2) come from the cell above and were tested with 5 and 10 folds.

cv_21 = cross_validate(
    clf_2, X_2, y_2,
    cv=kf_2,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)
cv_22 = cross_validate(
    clf_2, X_2, y_2,
    cv=skf_2,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)

# Same report layout for both cross-validators.
for cv_label, cv_scores in (('K-Fold:', cv_21), ('\nStratified K-Fold:', cv_22)):
    print(cv_label)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.59
Avg macroavg-f1-score:  0.5871545796500581
Avg precision_macroavg score:  0.5887288120856778
Avg recall_macroavg score:  0.5878651914073724

Stratified K-Fold:
Avg accuracy:  0.5850000000000001
Avg macroavg-f1-score:  0.5818667819702503
Avg precision_macroavg score:  0.5835959478907939
Avg recall_macroavg score:  0.5829402194052434


## 2 - Manual parameter tuning

### 2.1 - One hot encoded data

In [17]:
# Same process as in section 1, except that some (not all) hyperparameters are now
# tuned by hand to find a seemingly optimal setting.
# Predictor columns of the one-hot encoded dataframe:
fcols1 = [
    "parent_associate's degree",
    "parent_bachelor's degree",
    "parent_high school",
    "parent_master's degree",
    "parent_some college",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X1 = df1[fcols1]

# Target label:
y1 = df1['above avg score']

In [18]:
# Hold-out split (70/30) with a fixed seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=34,
)

In [19]:
# Cross-validation splitters: shuffled 10-fold K-Fold and Stratified K-Fold, same seed.
kf1 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf1 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [20]:
# Random forest with manually chosen hyperparameters:
# 50 trees, Gini criterion, depth limited to 2, fixed seed.
clf1 = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_depth=2,
    random_state=43,
)

In [21]:
# Fit on the training part of the split, then predict the held-out part.
y1_pred = clf1.fit(X1_train, y1_train).predict(X1_test)

# Per-class precision/recall/f1 and the confusion matrix on the held-out part.
print(
    'report:\n ',
    metrics.classification_report(y1_test, y1_pred),
)
print(
    'confusion matrix \n',
    metrics.confusion_matrix(y1_test, y1_pred),
)


report:
                precision    recall  f1-score   support

           0       0.61      0.57      0.59       130
           1       0.69      0.72      0.70       170

    accuracy                           0.65       300
   macro avg       0.65      0.64      0.64       300
weighted avg       0.65      0.65      0.65       300

confusion matrix 
 [[ 74  56]
 [ 48 122]]


In [22]:
# Cross-validated evaluation on the full data with accuracy and macro-averaged
# precision / recall / f1. Each metric is averaged over all folds to give one
# figure for the setup as a whole.
# The splitters (kf1, skf1) come from the cell above and were tested with 5 and 10 folds.

cv11 = cross_validate(
    clf1, X1, y1,
    cv=kf1,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)
cv12 = cross_validate(
    clf1, X1, y1,
    cv=skf1,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)

# Same report layout for both cross-validators.
for cv_label, cv_scores in (('K-Fold:', cv11), ('\nStratified K-Fold:', cv12)):
    print(cv_label)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.6420000000000001
Avg macroavg-f1-score:  0.6287502166791823
Avg precision_macroavg score:  0.6490952860578784
Avg recall_macroavg score:  0.6337567110291552

Stratified K-Fold:
Avg accuracy:  0.642
Avg macroavg-f1-score:  0.6270511749933887
Avg precision_macroavg score:  0.6509439259425355
Avg recall_macroavg score:  0.6335773528291593


### 2.2 - Ordinal encoded parent edu

In [23]:
# Predictor columns of the dataframe with ordinal-encoded parental education (df2):
fcols2 = [
    "parent edu",
    "race A",
    "race B",
    "race C",
    "race D",
    "gender",
    "standard lunch",
    "completed course",
]
X2 = df2[fcols2]

# Target label:
y2 = df2['above avg score']

In [24]:
# Hold-out split (70/30) with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=34,
)

In [25]:
# Cross-validation splitters: shuffled 10-fold K-Fold and Stratified K-Fold, same seed.
kf2 = KFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)
skf2 = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=13,
)

In [26]:
# Instantiate the random forest with manually chosen hyperparameters:
# 40 trees, Gini criterion, depth limited to 2, fixed seed.
clf2 = RandomForestClassifier(
    n_estimators=40,
    criterion='gini',
    max_depth=2,
    random_state=43,
)

In [27]:
# Fit on the training part of the split, then predict the held-out part.
y2_pred = clf2.fit(X2_train, y2_train).predict(X2_test)

# Per-class precision/recall/f1 and the confusion matrix on the held-out part.
print(
    'report:\n ',
    metrics.classification_report(y2_test, y2_pred),
)
print(
    'confusion matrix \n',
    metrics.confusion_matrix(y2_test, y2_pred),
)

report:
                precision    recall  f1-score   support

           0       0.60      0.56      0.58       130
           1       0.68      0.72      0.70       170

    accuracy                           0.65       300
   macro avg       0.64      0.64      0.64       300
weighted avg       0.65      0.65      0.65       300

confusion matrix 
 [[ 73  57]
 [ 48 122]]


In [28]:
# Cross-validated evaluation on the full data with accuracy and macro-averaged
# precision / recall / f1. Each metric is averaged over all folds to give one
# figure for the setup as a whole.
# The splitters (kf2, skf2) come from the cell above and were tested with 5 and 10 folds.

cv21 = cross_validate(
    clf2, X2, y2,
    cv=kf2,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)
cv22 = cross_validate(
    clf2, X2, y2,
    cv=skf2,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'),
)

# Same report layout for both cross-validators.
for cv_label, cv_scores in (('K-Fold:', cv21), ('\nStratified K-Fold:', cv22)):
    print(cv_label)
    print('Avg accuracy: ', cv_scores['test_accuracy'].mean())
    print('Avg macroavg-f1-score: ', cv_scores['test_f1_macro'].mean())
    print('Avg precision_macroavg score: ', cv_scores['test_precision_macro'].mean())
    print('Avg recall_macroavg score: ', cv_scores['test_recall_macro'].mean())

K-Fold:
Avg accuracy:  0.6529999999999999
Avg macroavg-f1-score:  0.6440071148135954
Avg precision_macroavg score:  0.6587410675854036
Avg recall_macroavg score:  0.6464232986607588

Stratified K-Fold:
Avg accuracy:  0.645
Avg macroavg-f1-score:  0.6352771340581659
Avg precision_macroavg score:  0.6507921199741812
Avg recall_macroavg score:  0.6385512524575652




## Attempt of further parameter optimization using Grid-Search CV


As the random forest classifier showed the most promise of all our methods, this section attempts to use the Grid-Search CV tool to find the best hyperparameter combination. Note that, because GridSearchCV is computationally expensive, the parameter grid is limited in the values it covers. However, the parameter grid has been iterated through a set of variations to try out different combinations.

In [29]:
# Base estimator handed to the grid search. It uses the same seed (43) as the
# earlier models so results stay comparable.
rfc1 = RandomForestClassifier(
    random_state=43,
)

In [75]:
# The optimization is tried for both dataset configurations (both CSV files, see the
# df1 and df2 dataframes). Switch between them with the flag below instead of
# commenting code in and out, so that XX and yy always come from the same dataframe.
USE_ORDINAL_PARENT_EDU = False  # False: one-hot encoded df1 (default); True: ordinal-encoded df2

if USE_ORDINAL_PARENT_EDU:
    # Parent edu ordinal encoded: feature columns and target from df2.
    fcols2 = ["parent edu", "race A", "race B", "race C", "race D", "gender", "standard lunch", "completed course"]
    XX = df2[fcols2]
    yy = df2['above avg score']
else:
    # One-hot encoded: feature columns and target from df1.
    fcols1 = ["parent_associate's degree", "parent_bachelor's degree", "parent_high school", "parent_master's degree", "parent_some college", "race A", "race B", "race C", "race D", "gender", "standard lunch", "completed course"]
    XX = df1[fcols1]
    yy = df1['above avg score']

In [85]:
# Hold-out split for the grid-search experiment (currently 90/10; other splits were tried too).
XX_train, XX_test, yy_train, yy_test = train_test_split(
    XX,
    yy,
    test_size=0.1,
    random_state=34,
)

In [92]:
# Parameter grid passed to GridSearchCV: one entry per hyperparameter we want to
# optimize, each mapped to the candidate values to try. Edit it to iterate through
# different configuration grids.
# NOTE: keep the grid small. The search in the next step gets expensive as the grid
# grows, so several small grids tried in turn work better in our case.

pgrid = {
    'n_estimators': list(range(80, 88)),    # 80 .. 87
    'max_depth': list(range(2, 11)),        # 2 .. 10
    'criterion': ['gini'],
    'max_leaf_nodes': list(range(3, 8)),    # 3 .. 7
}

In [93]:
# Grid-search cross-validator built from our random forest (rfc1), the parameter grid
# (pgrid) and the number of folds (2, Stratified K-fold CV). Fitting it on the training
# split evaluates every combination of the listed hyperparameter values; the best
# combination found is then printed.

CV_rfc1 = GridSearchCV(
    estimator=rfc1,
    param_grid=pgrid,
    cv=2,
).fit(XX_train, yy_train)
print(CV_rfc1.best_params_)

{'criterion': 'gini', 'max_depth': 2, 'max_leaf_nodes': 4, 'n_estimators': 80}


In [90]:
# Define a new model with the optimal hyperparameters found by the grid search.
# They are read from CV_rfc1.best_params_ instead of being retyped by hand, so this
# cell cannot go stale when pgrid is edited and the search is re-run.
# For the search output shown above this is criterion='gini', max_depth=2,
# max_leaf_nodes=4, n_estimators=80. The seed stays fixed at 43 as elsewhere.
rfc1 = RandomForestClassifier(random_state=43, **CV_rfc1.best_params_)

In [91]:
# Standard procedure: fit on the training part of the split, predict the held-out
# part, and print the score metrics.
yy_pred = rfc1.fit(XX_train, yy_train).predict(XX_test)

print(
    'report:\n ',
    metrics.classification_report(yy_test, yy_pred),
)
print(
    'confusion matrix \n',
    metrics.confusion_matrix(yy_test, yy_pred),
)

report:
                precision    recall  f1-score   support

           0       0.72      0.59      0.65        44
           1       0.72      0.82      0.77        56

    accuracy                           0.72       100
   macro avg       0.72      0.71      0.71       100
weighted avg       0.72      0.72      0.72       100

confusion matrix 
 [[26 18]
 [10 46]]


After iterating through a range of hyperparameter values, and thus combinations, none were found that gave better score metrics than the best one we had already found in our manual parameter optimization.

The metrics generated in the code snippet above (an accuracy of 72%, and a macro-averaged F1 score, recall and precision of 0.71, 0.71 and 0.72, respectively) are exactly the same as the best result from our first wave of experimenting and evaluation. NOTE that both of these are for the 90/10 split.

Hence, we conclude our experimenting here. The results, model building and data configurations are discussed in the report.