# Data Pre-Processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the per-student feature table.
with open('../data/features.csv') as features_file:
    features = pd.read_csv(features_file)

# Only the join key and the label are read from the outcomes file.
with open('../data/outcomes.csv') as outcomes_file:
    outcomes = pd.read_csv(outcomes_file, usecols=['student_id', 'outcome'])

labels = ['fail', 'pass']
target_variable_name = 'outcome'

# Join features with outcomes, drop the unused columns, discard rows that
# have no label, and store the label as an integer.
df = (
    features
    .merge(outcomes, on='student_id')
    .drop(columns=['Unnamed: 0', 'course_id'])
    .dropna(subset=['outcome'])
    .assign(outcome=lambda frame: frame['outcome'].astype(int))
)

In [3]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,student_id,tot_page_views,average_daily_views,median_daily_views,max_daily_views,days_with_views,hours_with_views,max_views_per_hour,avg_views_per_hour,median_views_per_hour,avg_hours_with_views_per_day,max_hours_with_views_per_day,median_hours_with_views_per_day,tot_participations,average_daily_participations,median_daily_participations,max_daily_participations,days_with_participations,outcome
0,47649,1153,22.607843,16.0,118.0,51,71,88.0,16.239437,15.0,1.392157,1.392157,1.0,23,0.46,0.0,4.0,50,1
1,52263,744,15.183673,0.0,93.0,49,67,48.0,11.104478,8.0,1.367347,1.367347,0.0,20,0.434783,0.0,3.0,46,1
2,24036,341,6.686275,0.0,80.0,51,27,48.0,12.62963,11.0,0.529412,0.529412,0.0,15,0.333333,0.0,3.0,45,1
3,52137,237,5.042553,0.0,68.0,47,19,32.0,12.473684,9.0,0.404255,0.404255,0.0,6,0.1875,0.0,3.0,32,1
4,52267,665,13.3,0.0,161.0,50,42,118.0,15.833333,8.0,0.84,0.84,0.0,26,0.52,0.0,5.0,50,1


## Missing Values

Since only a handful of rows contain missing values (6 of 422, see the two `df.info()` outputs below), we can simply discard them.

In [4]:
# Per-column non-null counts and dtypes, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 422 entries, 0 to 424
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   student_id                       422 non-null    int64  
 1   tot_page_views                   422 non-null    int64  
 2   average_daily_views              419 non-null    float64
 3   median_daily_views               419 non-null    float64
 4   max_daily_views                  419 non-null    float64
 5   days_with_views                  422 non-null    int64  
 6   hours_with_views                 422 non-null    int64  
 7   max_views_per_hour               419 non-null    float64
 8   avg_views_per_hour               419 non-null    float64
 9   median_views_per_hour            419 non-null    float64
 10  avg_hours_with_views_per_day     419 non-null    float64
 11  max_hours_with_views_per_day     419 non-null    float64
 12  median_hours_with_view

In [5]:
# Keep only the rows without any missing value (dropna defaults: axis=0, how='any').
df = df.dropna()

In [6]:
# Re-check the non-null counts after dropping the incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416 entries, 0 to 424
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   student_id                       416 non-null    int64  
 1   tot_page_views                   416 non-null    int64  
 2   average_daily_views              416 non-null    float64
 3   median_daily_views               416 non-null    float64
 4   max_daily_views                  416 non-null    float64
 5   days_with_views                  416 non-null    int64  
 6   hours_with_views                 416 non-null    int64  
 7   max_views_per_hour               416 non-null    float64
 8   avg_views_per_hour               416 non-null    float64
 9   median_views_per_hour            416 non-null    float64
 10  avg_hours_with_views_per_day     416 non-null    float64
 11  max_hours_with_views_per_day     416 non-null    float64
 12  median_hours_with_view

## Feature Selection


In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [8]:
# Feature frame: every column except the identifier and the label.
x = df.drop(columns=['student_id', 'outcome'])
# Label as an (n_samples, 1) column vector.
y = df['outcome'].to_numpy().reshape(-1, 1)

In [9]:
# Sanity check: features and labels must have the same number of rows.
x.shape, y.shape

((416, 17), (416, 1))

In [10]:
# Split the data into test and training BEFORE scaling, so the feature
# scaler's min/max come from the training rows only. Fitting it on the full
# matrix would leak test-set statistics into training.
x_train_raw, x_test_raw, y_train_raw, y_test_raw = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Feature scaler: fit on the training split, applied unchanged to the test split.
# `Min_Max` stays fit on the features so it can be reused on new feature data.
Min_Max = MinMaxScaler()
X_train = Min_Max.fit_transform(x_train_raw)
X_test = Min_Max.transform(x_test_raw)

# The label gets its OWN scaler instance. The previous code refit `Min_Max`
# on `y`, which silently replaced the feature scaling parameters.
# The label transform itself is kept so that Y_train / Y_test hold exactly the
# same values as before.
# NOTE(review): scaling a class label is unusual; if `outcome` is already
# encoded as 0/1 this step could be dropped altogether. Confirm and remove.
label_scaler = MinMaxScaler().fit(y)
Y_train = label_scaler.transform(y_train_raw)
Y_test = label_scaler.transform(y_test_raw)

# Full-matrix versions kept under their old names; the cells below only use
# the split arrays.
X = Min_Max.transform(x)
Y = label_scaler.transform(y)

X_train.shape, X_test.shape

((291, 17), (125, 17))

In [None]:
# Results table; later cells add one row per feature-selection strategy.
scores_df = pd.DataFrame()

# Baseline model: random forest with 1000 trees, fixed seed, all cores.
clf = RandomForestClassifier(random_state=0, n_estimators=1000, n_jobs=-1)

# 10-fold cross-validation on the training split using every feature.
cv_scores = cross_val_score(clf, X_train, Y_train.ravel(), cv=10)

print("Naive Random Forest all features")
print(f"Mean Accuracy: {cv_scores.mean()} \nStandard Deviation : {cv_scores.std()}")

In [None]:
# Record the baseline (all features) result.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# adds the same single row and works on old and new pandas alike.
scores_df = pd.concat(
    [scores_df,
     pd.DataFrame([{'# features': 'all',
                    'Mean Accuracy': np.mean(cv_scores),
                    'Std': np.std(cv_scores)}])],
    ignore_index=True,
)

In [None]:
# Display the results table accumulated so far.
scores_df

### Lasso

In [None]:
# L1-penalised logistic regression as the selector's base model.
# The liblinear solver shuffles the data internally, so it is seeded like the
# other estimators in this notebook to make the selected set reproducible.
sel = SelectFromModel(
    LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=0)
)
sel.fit(X_train, np.ravel(Y_train))

In [None]:
# Print the names of the columns kept by the L1 selector.
# A plain loop replaces the list comprehension that was used only for its side
# effects and overwrote IPython's `_` (last-output) variable.
print("Selected features :")
for column_name in x.columns[sel.get_support()]:
    print(f"- {column_name}")

In [None]:
# Wrap the scaled training matrix so its columns can be indexed by position.
X_train_df = pd.DataFrame(X_train)
selected_features = X_train_df.columns[sel.get_support()]

# Summary: total columns, columns kept, and coefficients that are exactly zero.
n_total = X_train_df.shape[1]
n_zeroed = np.sum(sel.estimator_.coef_ == 0)
print(f'total features: {n_total}')
print(f'selected features: {len(selected_features)}')
print(f'features with coefficients shrank to zero: {n_zeroed}')

In [None]:
# Positional column labels whose L1 coefficient is exactly zero.
removed_features = X_train_df.columns[np.ravel(sel.estimator_.coef_) == 0]
removed_features

In [None]:
# Reduce both splits to the columns kept by the L1 selector.
# Both calls now receive the same kind of input (the scaled ndarrays the
# selector was fitted on) instead of a DataFrame wrapper for train and an
# ndarray for test.
X_train_lasso_selected = sel.transform(X_train)
X_test_lasso_selected = sel.transform(X_test)

In [None]:
# Re-use the `clf` random forest created in the baseline cell and
# cross-validate it (10 folds) on the lasso-selected training columns.
cv_scores = cross_val_score(clf, X_train_lasso_selected, np.ravel(Y_train), cv=10)

# The label used to say "all features" (copy-paste from the baseline cell).
print("Naive Random Forest lasso-selected features")
print(f"Mean Accuracy: {np.mean(cv_scores)} \nStandard Deviation : {np.std(cv_scores)}")

In [None]:
# Record the lasso-selection result.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# adds the same single row and works on old and new pandas alike.
scores_df = pd.concat(
    [scores_df,
     pd.DataFrame([{'# features': 'lasso',
                    'Mean Accuracy': np.mean(cv_scores),
                    'Std': np.std(cv_scores)}])],
    ignore_index=True,
)
scores_df.head()

### RFE

In [None]:
# Recursive feature elimination around the `clf` random forest from the
# baseline cell: one feature removed per step, each subset scored by
# accuracy over 10 stratified folds.
rfecv = RFECV(
    estimator=clf,
    step=1,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=10),
)
rfecv.fit(X_train, Y_train.ravel())

In [None]:
# Number of features kept by the fitted RFECV selector.
print(f'Optimal number of features: {rfecv.n_features_}')

In [None]:
# Print the names of the columns kept by RFECV.
# A plain loop replaces the list comprehension that was used only for its side
# effects and overwrote IPython's `_` (last-output) variable.
print("Selected features :")
for column_name in x.columns[rfecv.support_]:
    print(f"- {column_name}")

In [None]:
# Mean cross-validated score for each number of retained features.
# `RFECV.grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `cv_results_['mean_test_score']` is its replacement. `cv_results_` is checked
# first so no deprecation warning is triggered on versions that have both.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation', fontsize=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
# x axis starts at 1: with step=1 the i-th score belongs to i retained features.
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores, color='#303F9F', linewidth=3)

plt.show()

In [None]:
# Reduce both splits to the columns kept by RFECV.
# Uses X_train directly (the array RFECV was fitted on) rather than the
# `X_train_df` wrapper created in the Lasso section, so this cell no longer
# depends on that section having been run and train/test are treated alike.
X_train_rfe_selected = rfecv.transform(X_train)
X_test_rfe_selected = rfecv.transform(X_test)

In [None]:
# One row per RFECV-selected column with the importance assigned by the final
# fitted forest, ordered from most to least important.
dset = pd.DataFrame({
    'attr': x.columns[rfecv.support_],
    'importance': rfecv.estimator_.feature_importances_,
}).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(16, 14))
plt.barh(y=dset['attr'], width=dset['importance'], color='#1976D2')
plt.title('RFECV - Feature Importances', fontsize=20, pad=20)
plt.xlabel('Importance', fontsize=14, labelpad=20)
plt.show()

In [None]:
# Re-use the `clf` random forest created in the baseline cell and
# cross-validate it (10 folds) on the RFE-selected training columns.
cv_scores = cross_val_score(clf, X_train_rfe_selected, np.ravel(Y_train), cv=10)

# The label used to say "all features" (copy-paste from the baseline cell).
print("Naive Random Forest RFE-selected features")
print(f"Mean Accuracy: {np.mean(cv_scores)} \nStandard Deviation : {np.std(cv_scores)}")

In [None]:
# Record the RFE-selection result.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# adds the same single row and works on old and new pandas alike.
scores_df = pd.concat(
    [scores_df,
     pd.DataFrame([{'# features': 'rfe',
                    'Mean Accuracy': np.mean(cv_scores),
                    'Std': np.std(cv_scores)}])],
    ignore_index=True,
)
scores_df.head()

In [None]:
# Mean CV accuracy per feature-selection strategy, with +/- one standard
# deviation as the error bar.
fig, ax = plt.subplots()
ax.errorbar(
    scores_df['# features'],
    scores_df['Mean Accuracy'],
    yerr=scores_df['Std'],
    fmt="o",
)
ax.set_title('Feature Selection Strategy Comparison')
plt.show()

## Modeling

In [None]:
# Let's use the result of the Recursive Feature Elimination 
# NOTE: this rebinds X_train / X_test to the RFE-selected columns, so every
# cell below works on the reduced matrices. Earlier cells that fit or apply a
# selector on X_train / X_test expect the full-width arrays; re-running them
# after this cell will not see the original data. Restart and run top to
# bottom instead of re-running cells out of order.
X_train = X_train_rfe_selected.copy()
#np.ravel(Y_train)
X_test = X_test_rfe_selected.copy()
# y_test is a copy of Y_test; the training labels stay under the name Y_train.
y_test = Y_test.copy()

## Hyperparameter search (BayesSearchCV): XGB and RF compared

### XGB Parameter Tuning

In [None]:
import xgboost as xgb 

In [None]:
# XGBoost's native API takes its input as DMatrix objects;
# `label` carries the outcome variable for each split.
dtrain = xgb.DMatrix(data=X_train, label=Y_train)
dtest = xgb.DMatrix(data=X_test, label=Y_test)

In [None]:
# Parameters for the native xgboost training call.
# `eta` is an alias of `learning_rate` in XGBoost; the previous dict set both
# (eta=1 and learning_rate=.05), which is contradictory. Only one is kept.
# NOTE(review): 0.05 is assumed to be the intended step size. Confirm.
parameters = {
    'max_depth': 7,
    'learning_rate': .05,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}

In [None]:
# Train the booster for a fixed number of rounds and record wall-clock
# timestamps around the call.
from datetime import datetime

num_round = 50

start = datetime.now()
xg = xgb.train(parameters, dtrain, num_boost_round=num_round)
stop = datetime.now()

In [None]:
#Execution time of the model 
# Difference of the two timestamps taken around xgb.train, reported in seconds.
execution_time_xgb = stop-start 
print(f"Execution time :{execution_time_xgb.total_seconds()} s")

In [None]:
#now predicting our model on test set 
# With objective 'binary:logistic' the booster returns probabilities;
# they are thresholded in a later cell.
ypred=xg.predict(dtest) 
ypred

In [None]:
# Number of predictions returned for the test matrix.
len(ypred)

In [None]:
# Convert probabilities into 1 or 0 with a threshold of .5.
# Vectorised over the whole array instead of looping over a hard-coded 125
# entries, so it stays correct if the size of the test split changes.
# The original dtype is kept, so the values are still 1.0 / 0.0.
ypred = (ypred >= .5).astype(ypred.dtype)

In [None]:
#calculating accuracy of our model 
# Compares the thresholded predictions with `y_test` (the copy of Y_test made
# in the Modeling cell).
from sklearn.metrics import accuracy_score 
accuracy_xgb = accuracy_score(y_test,ypred) 
accuracy_xgb


In [None]:
# simple xgb

In [None]:
import xgboost as xgb

# Uses X_train / X_test as rebound to the RFE-selected columns in the
# "Modeling" cell above.

# scikit-learn-style XGBoost classifier with a fixed seed and a small learning rate.
model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)
# Labels are flattened to 1-D, as in every cross_val_score / selector call
# in this notebook, instead of passing the (n, 1) column vector.
model.fit(X_train, np.ravel(Y_train))
# Accuracy on the held-out split.
model.score(X_test, np.ravel(Y_test))

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

import joblib
import pickle

from timeit import default_timer as timer
from datetime import datetime, date
from tqdm.auto import tqdm

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# model = XGBClassifier()

# parameters = {
#     'booster':'gbtree',
#     'learning_rate':
#     'thread':,
#     'eta':,
#     'min_child_weight':,
#     'max_depth':,
#     'max_leaf_nodes':,
#     'gamma':,
#     'subsample':,
#     'colsample_bytree':
# }

In [None]:
# fine tune XGB and RF
import os

# Search spaces, one dict per estimator, in the same order as estimator_list.
params_list = [{'learning_rate':Real(1e-2,1e+0,prior='log-uniform'),
                'n_estimators':Integer(100,1000,prior='uniform'),
                'max_depth':Integer(3,10,prior='uniform'),
                'subsample':Real(0.8,1.0,prior='uniform'),
                'gamma':Integer(0,5,prior='uniform')},
               {'n_estimators':Integer(100,1000,prior='uniform'),
                'max_depth':Integer(3,10,prior='uniform'),
                'min_samples_split':Integer(2,10,prior='uniform'),
                'min_samples_leaf':Integer(1,10,prior='uniform'),  
                'class_weight':Categorical(['balanced'])}]

# Seed the estimators as well as the search, so repeated runs are comparable.
estimator_list = [XGBClassifier(random_state=0), RandomForestClassifier(random_state=0)]

# Make sure the output directory exists BEFORE the (long) searches start, so a
# missing folder cannot throw away a finished search at pickling time.
models_dir = "../models/"
os.makedirs(models_dir, exist_ok=True)

# Every fitted search is kept here, keyed by estimator class name; `search`
# alone would only hold the last one after the loop.
bayes_searches = {}

for model, param_grid in zip(estimator_list,params_list):
    model_name = type(model).__name__
    print("Searching best parameters for "+ str(model))
    search = BayesSearchCV(estimator=model,
                           search_spaces=param_grid,
                           n_jobs=-1,
                           n_iter=50,
                           n_points=1,
                           random_state=0,
                           cv=3,
                           verbose=0,
                           refit=True,
                           return_train_score=True)
    # perform bayesian optimization
    print("Start Time : "+ str(datetime.now()))
    # Labels are flattened to 1-D, as in the other fit calls in this notebook.
    search.fit(X_train, np.ravel(Y_train))
    print("End Time : "+ str(datetime.now()))
    bayes_searches[model_name] = search

    # File name carries the estimator name and a timestamp without spaces or
    # colons (str(datetime.now()) contains both, which is not a valid file
    # name on every platform).
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pkl_filename = "pickle_bayes_search_model_" + model_name + "_" + timestamp + ".pkl"
    pkl_path = os.path.join(models_dir, pkl_filename)
    with open(pkl_path, 'wb') as file:
        pickle.dump(search, file)
    print("|----Best Score:" + str(search.best_score_))
    print("|----Best Parameters: " + str(search.best_params_))

In [None]:
# TODO: look at the confusion matrix
# TODO: minimize errors on the failure class
# 70% of the errors are basically due to data imbalance... so maybe balance the data?

### RandomForest Parameter Tuning

In [None]:
# List the parameter names a RandomForestClassifier exposes via get_params().
RandomForestClassifier().get_params().keys()

In [None]:
# List the parameter names an XGBClassifier exposes via get_params().
XGBClassifier().get_params().keys()