In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, r2_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

In [2]:
# load csv
file_load = "../Resources/nhes_merged_pfi_cleaned.csv"
learning_data_df = pd.read_csv(file_load)
learning_data_df

Unnamed: 0.1,Unnamed: 0,index,SEGRADES_0,SEGRADES_1,CENREG_1,CENREG_2,CENREG_3,CENREG_4,SCHRTSCHL_1,SCHRTSCHL_2,...,TTLHHINC_6,TTLHHINC_7,TTLHHINC_8,TTLHHINC_9,TTLHHINC_10,TTLHHINC_11,TTLHHINC_12,OWNRNTHB_1,OWNRNTHB_2,OWNRNTHB_3
0,0,0,0,1,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0
1,1,4,0,1,0,1,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
2,2,7,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,3,9,0,1,0,0,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0
4,4,11,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25173,25173,48070,0,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
25174,25174,48073,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
25175,25175,48075,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
25176,25176,48079,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# drop unnamed and index columns
learning_data_df = learning_data_df.drop(['Unnamed: 0','index'], axis=1)
learning_data_df

Unnamed: 0,SEGRADES_0,SEGRADES_1,CENREG_1,CENREG_2,CENREG_3,CENREG_4,SCHRTSCHL_1,SCHRTSCHL_2,SEENJOY_1,SEENJOY_2,...,TTLHHINC_6,TTLHHINC_7,TTLHHINC_8,TTLHHINC_9,TTLHHINC_10,TTLHHINC_11,TTLHHINC_12,OWNRNTHB_1,OWNRNTHB_2,OWNRNTHB_3
0,0,1,0,0,0,1,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0
1,0,1,0,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
2,0,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,1,0,0,1,0,1,...,0,0,0,0,0,1,0,1,0,0
4,1,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25173,0,1,0,0,0,1,0,1,0,1,...,0,0,0,0,1,0,0,1,0,0
25174,1,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
25175,1,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
25176,1,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0


In [4]:
list(learning_data_df.columns)

['SEGRADES_0',
 'SEGRADES_1',
 'CENREG_1',
 'CENREG_2',
 'CENREG_3',
 'CENREG_4',
 'SCHRTSCHL_1',
 'SCHRTSCHL_2',
 'SEENJOY_1',
 'SEENJOY_2',
 'SEENJOY_3',
 'SEENJOY_4',
 'SEABSNT_1',
 'SEABSNT_2',
 'SEABSNT_3',
 'SEABSNT_4',
 'FCSCHOOL_1',
 'FCSCHOOL_2',
 'FCSCHOOL_3',
 'FCSCHOOL_4',
 'FCTEACHR_1',
 'FCTEACHR_2',
 'FCTEACHR_3',
 'FCTEACHR_4',
 'FCSTDS_1',
 'FCSTDS_2',
 'FCSTDS_3',
 'FCSTDS_4',
 'FHHOME_1',
 'FHHOME_2',
 'FHHOME_3',
 'FHHOME_4',
 'FHWKHRS_1',
 'FHWKHRS_2',
 'FHWKHRS_3',
 'FHWKHRS_4',
 'FOSTORY2X_1',
 'FOSTORY2X_2',
 'FOCRAFTS_1',
 'FOCRAFTS_2',
 'FOGAMES_1',
 'FOGAMES_2',
 'FOBUILDX_1',
 'FOBUILDX_2',
 'FOSPORT_1',
 'FOSPORT_2',
 'FORESPON_1',
 'FORESPON_2',
 'FOHISTX_1',
 'FOHISTX_2',
 'FODINNERX_1',
 'FODINNERX_2',
 'FODINNERX_3',
 'FODINNERX_4',
 'FOLIBRAYX_1',
 'FOLIBRAYX_2',
 'FOBOOKSTX_1',
 'FOBOOKSTX_2',
 'FOCONCRTX_1',
 'FOCONCRTX_2',
 'FOMUSEUMX_1',
 'FOMUSEUMX_2',
 'FOZOOX_1',
 'FOZOOX_2',
 'FOGROUPX_1',
 'FOGROUPX_2',
 'HHENGLISH_1',
 'HHENGLISH_2',
 'CSPEAK

In [5]:
#Creating a correlation matrix
learning_data_df.corr()

Unnamed: 0,SEGRADES_0,SEGRADES_1,CENREG_1,CENREG_2,CENREG_3,CENREG_4,SCHRTSCHL_1,SCHRTSCHL_2,SEENJOY_1,SEENJOY_2,...,TTLHHINC_6,TTLHHINC_7,TTLHHINC_8,TTLHHINC_9,TTLHHINC_10,TTLHHINC_11,TTLHHINC_12,OWNRNTHB_1,OWNRNTHB_2,OWNRNTHB_3
SEGRADES_0,1.000000,-1.000000,-0.002561,-0.004609,-0.021285,0.028067,0.008308,-0.008308,-0.253430,0.154696,...,0.031715,0.017159,-0.014610,-0.061809,-0.086001,-0.045935,-0.070541,-0.085298,0.079925,0.025749
SEGRADES_1,-1.000000,1.000000,0.002561,0.004609,0.021285,-0.028067,-0.008308,0.008308,0.253430,-0.154696,...,-0.031715,-0.017159,0.014610,0.061809,0.086001,0.045935,0.070541,0.085298,-0.079925,-0.025749
CENREG_1,-0.002561,0.002561,1.000000,-0.342945,-0.237430,-0.251499,-0.030198,0.030198,0.002387,-0.002593,...,-0.018295,-0.017906,-0.007304,0.016382,0.073136,0.002532,0.028088,0.019517,-0.017951,-0.006972
CENREG_2,-0.004609,0.004609,-0.342945,1.000000,-0.409741,-0.434020,0.010886,-0.010886,-0.005914,-0.000870,...,0.003180,0.004223,-0.008657,-0.024155,-0.041257,-0.001231,-0.018414,0.002996,-0.004478,0.004457
CENREG_3,-0.021285,0.021285,-0.237430,-0.409741,1.000000,-0.300484,-0.054013,0.054013,-0.020033,0.011745,...,0.016809,0.022054,0.031277,0.034464,-0.009720,-0.010162,-0.018328,0.090036,-0.091258,-0.005065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTLHHINC_11,-0.045935,0.045935,0.002532,-0.001231,-0.010162,0.009042,-0.000882,0.000882,0.003016,-0.005865,...,-0.037958,-0.046079,-0.062688,-0.069701,-0.058346,1.000000,-0.024079,0.045015,-0.041188,-0.016771
TTLHHINC_12,-0.070541,0.070541,0.028088,-0.018414,-0.018328,0.014155,-0.017491,0.017491,0.012008,-0.004589,...,-0.046339,-0.056254,-0.076530,-0.085092,-0.071230,-0.024079,1.000000,0.069829,-0.066250,-0.018449
OWNRNTHB_1,-0.085298,0.085298,0.019517,0.002996,0.090036,-0.107634,-0.050148,0.050148,-0.037965,0.030417,...,-0.031360,0.008676,0.074758,0.137188,0.140731,0.045015,0.069829,1.000000,-0.953351,-0.249460
OWNRNTHB_2,0.079925,-0.079925,-0.017951,-0.004478,-0.091258,0.109132,0.047014,-0.047014,0.036282,-0.026227,...,0.027368,-0.007027,-0.068944,-0.127471,-0.132910,-0.041188,-0.066250,-0.953351,1.000000,-0.054498


In [6]:
# remove allgrade columns and preserve in a separate df
#allgrade_df = learning_data_df.filter(['ALLGRADEX_2',
 #'ALLGRADEX_3',
 #'ALLGRADEX_4',
 #'ALLGRADEX_5',
 #'ALLGRADEX_6',
 #'ALLGRADEX_7',
 #'ALLGRADEX_8',
 #'ALLGRADEX_9',
 #'ALLGRADEX_10',
 #'ALLGRADEX_11',
 #'ALLGRADEX_12',
 #'ALLGRADEX_13',
 #'ALLGRADEX_14',
 #'ALLGRADEX_15'], axis=1)
#allgrade_df.head()

In [7]:
# remove allgrade columns from learning_data_df
#learning_data_df = learning_data_df.drop([ 'ALLGRADEX_2',
 #'ALLGRADEX_3',
 #'ALLGRADEX_4',
 #'ALLGRADEX_5',
 #'ALLGRADEX_6',
 #'ALLGRADEX_7',
 #'ALLGRADEX_8',
 #'ALLGRADEX_9',
 #'ALLGRADEX_10',
 #'ALLGRADEX_11',
 #'ALLGRADEX_12',
 #'ALLGRADEX_13',
 #'ALLGRADEX_14',
 #'ALLGRADEX_15'], axis=1)

In [8]:
# create features
X = learning_data_df.drop(['SEGRADES_0', 'SEGRADES_1'], axis=1)

# create target
y = learning_data_df['SEGRADES_1']
target_names = ["fail", "pass"]

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    
    #calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    
    return(vif)

In [10]:
#X = learning_data_df.iloc[:,:-1]
vif_df = calc_vif(X)

  vif = 1. / (1. - r_squared_i)


In [11]:
vif_df.head(50)

Unnamed: 0,variables,VIF
0,CENREG_1,1943901000.0
1,CENREG_2,4552184.0
2,CENREG_3,1286919000.0
3,CENREG_4,251338000.0
4,SCHRTSCHL_1,inf
5,SCHRTSCHL_2,inf
6,SEENJOY_1,5524253.0
7,SEENJOY_2,427936.8
8,SEENJOY_3,13203980000.0
9,SEENJOY_4,13475760000.0


In [12]:
# drop columns with high vif
learning_data_df = learning_data_df.drop([ 'SCHRTSCHL_1',
 'SCHRTSCHL_2',
 'FCSTDS_1',
 'FCSTDS_2',
 'FCSTDS_3',
 'FCSTDS_4',
 'FOSTORY2X_1',
 'FOSTORY2X_2',], axis=1)

In [13]:
# prepare data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Fit a Random Forest Classifier model, and then print a classification report
clf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')
print(r2_score(y_test, y_pred))

              precision    recall  f1-score   support

        fail       0.61      0.54      0.57      2672
        pass       0.69      0.75      0.72      3623

    accuracy                           0.66      6295
   macro avg       0.65      0.64      0.64      6295
weighted avg       0.65      0.66      0.65      6295

Training Score: 1.0
Testing Score: 0.6584590945194599
-0.3980715769675114


In [15]:
# Extremely Random Tree Classifier
clf = ExtraTreesClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

        fail       0.59      0.54      0.56      2672
        pass       0.68      0.73      0.70      3623

    accuracy                           0.65      6295
   macro avg       0.64      0.63      0.63      6295
weighted avg       0.64      0.65      0.64      6295

Training Score: 1.0
Testing Score: 0.6474980142970611


In [16]:
# Adaptive Boosting Classifier
clf = AdaBoostClassifier(random_state=42).fit(X_train_scaled, y_train)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

              precision    recall  f1-score   support

        fail       0.59      0.54      0.56      2672
        pass       0.68      0.73      0.70      3623

    accuracy                           0.65      6295
   macro avg       0.64      0.63      0.63      6295
weighted avg       0.64      0.65      0.64      6295

Training Score: 0.6744690991897474
Testing Score: 0.6659253375694996


In [17]:
def model_tester(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = model.fit(X_train_scaled, y_train)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
    print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')
    
model_tester(AdaBoostClassifier(random_state=42, n_estimators=100), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=200, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=500, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=1000, learning_rate=0.1), X, y)
model_tester(AdaBoostClassifier(random_state=42, n_estimators=2000, learning_rate=0.1), X, y)

              precision    recall  f1-score   support

        fail       0.45      0.40      0.42      2709
        pass       0.58      0.62      0.60      3586

    accuracy                           0.53      6295
   macro avg       0.51      0.51      0.51      6295
weighted avg       0.52      0.53      0.52      6295

Training Score: 0.679658952496955
Testing Score: 0.6627482128673551
              precision    recall  f1-score   support

        fail       0.45      0.40      0.42      2709
        pass       0.58      0.62      0.60      3586

    accuracy                           0.53      6295
   macro avg       0.51      0.51      0.51      6295
weighted avg       0.52      0.53      0.52      6295

Training Score: 0.6802944447386539
Testing Score: 0.6624305003971406
              precision    recall  f1-score   support

        fail       0.45      0.40      0.42      2709
        pass       0.58      0.62      0.60      3586

    accuracy                           0.53  