# SKYNET - AI Hackathon

## Standard import of libraries

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

!pip install joblib
import joblib

  from numpy.core.umath_tests import inner1d


[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Loading the excel sheet

In [2]:
file = 'https://s3.ap-south-1.amazonaws.com/skynetdataset/Program+Data+(Scrubbed)+-+Problem+Statement+1.xlsx'

# Table 1
df1 = pd.read_excel(file,sheet_name=2)

# Table 2
df2 = pd.read_excel(file,sheet_name=3)

## Pre-Processing

In [3]:
df3 = df1.iloc[1:,[4,8,15,16,-1]]

# PreProcessing Attendance
df3.Attendance = df3.Attendance.fillna(0)
df3.Attendance = df3.Attendance.apply(lambda x: 0 if (x==0 or x=='Absent' or x=='No Data') else 1)

# Renaming columns
df3.columns =['ID','RecordType','Submitted','Attendance','Completed']

# PreProcessing RecordType
df3.iloc[:,2] = df3.iloc[:,2].fillna(0)
df3.RecordType = df3.RecordType.apply(lambda x: 1 if (x=='Large Assignment' or x=='Small Assignment' or x=='Attendance') else 0)

# Removing zero values and summing up
df3 = df3[df3.RecordType != 0]
tab1 = df3.groupby(by='ID').sum()
tab1 = tab1.drop(['RecordType'],axis=1)

tab1.Completed = tab1.Completed.apply(lambda x : 0 if x==0 else 1)

# Dropping duplicates
df2 = df2.drop_duplicates()

tab2 = df2.iloc[1:,[4,8,9,10,11,12,17,19,20,21,22,25,36,37]]

# Renaming the columns
tab2.columns = ['ID','Age','Race','Gender','Income','Education','Promise','MaxPts','Interest',
                'HoursCd','TimeSpt','HkrScr','EssScr','Completed2']

# Preprocessing Race column
tab2.Race = tab2.Race.apply(lambda x : 'Other' if x=='Prefer not to answer' else x)
temp = pd.get_dummies(tab2.Race,drop_first=True)
tab2 = tab2.reset_index(drop=True)
temp = temp.reset_index(drop=True)
tab2 = pd.concat([tab2,temp],axis=1,join_axes=[tab2.index])

# Preprocessing Gender column
tab2.Gender = tab2.Gender.replace('Prefer not to answer','Other')

temp = pd.get_dummies(tab2.Gender,drop_first=True)
tab2 = tab2.reset_index(drop=True)
temp = temp.reset_index(drop=True)
tab2 = pd.concat([tab2,temp],axis=1,join_axes=[tab2.index])

# Preprocessing Education column
tab2.Education = tab2.Education.fillna('Some high school')
tab2.Education = tab2.Education.replace('Some high school','High school or GED')
tab2.Education = tab2.Education.replace("Some college","Bachelor's")
tab2.Education = tab2.Education.map({'High school or GED':1, "Associate's":2, "Bachelor's":3,
                                     "Master's":4, 'Ph.D / Doctorate':5})

tab2 = tab2.drop(['MaxPts'],axis=1)

# PreProcessing Interest column
temp = pd.get_dummies(tab2.Interest,drop_first=True)
tab2 = tab2.reset_index(drop=True)
temp = temp.reset_index(drop=True)
tab2 = pd.concat([tab2,temp],axis=1,join_axes=[tab2.index])

# Preprocessing HoursCd column
tab2.HoursCd = tab2.HoursCd.map({'0-5':1, '6-50':2, '51-250':3, '250+':4})

# Preprocessing TimeSpt column
tab2.TimeSpt = tab2.TimeSpt.map({'0-5':1, '6-10':2, '11-20':3, '21-40':4, '40+':5})

# Preprocessing Income tab
tab2.Income = tab2.Income.map({'Less than $5,000':1, 'Prefer not to answer':1, '$5,000 - $10,000':2, '$10,000 - $20,000':3, '$20,000 - $30,000':4,
                '$30,000 - $40,000':5, '$40,000 - $50,000':6, '$50,000 - $60,000':7, '$60,000 - $70,000':8,
                '$70,000 - $80,000':9, '$80,000 - $90,000':10, '$90,000 - $100,000':11, '$100,000+':12})

# Dropping columns for which dummy variables have been created
tab2.drop(['Race','Gender','Interest'],axis=1,inplace=True)

# Preprocessing EssScr column
tab2.EssScr = tab2.EssScr.fillna(0)

# Resetting the index before merge
tab1 = tab1.reset_index()
tab2 = tab2.reset_index()

# Merging tables on ID
final_df = pd.merge(tab1,tab2,on='ID',how='inner')

## Preparing the X and y Variables

In [4]:
X = final_df.iloc[:,[1,2,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26]]
y = final_df.iloc[:,3].values

# Scaling X and y
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

X = pd.DataFrame(X)

# Performing the train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Model selection

In [5]:
# Performing initial model selection based on 10 fold stratified shuffle split

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score,roc_auc_score
def validate(data, labels,clf):
    '''
    Ten-fold cross-validation with stratified sampling.
    Returned metrics : Accuracy
    '''
    accuracy_scores = []
    roc_auc_scores = []

    sss = StratifiedShuffleSplit(n_splits=10)
    for train_index, test_index in sss.split(data, labels):
        x_train, x_test = data.iloc[train_index], data.iloc[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        roc_auc_scores.append(roc_auc_score(y_test,y_pred))

    print('Accuracy', np.mean(accuracy_scores))
    print('ROC',np.mean(roc_auc_scores))

In [6]:
# Accuracy and ROC for LogisticRegression

clf1 = LogisticRegression(random_state = 0)
validate(X,y,clf1)

Accuracy 0.9828402366863903
ROC 0.9831230726100364


In [7]:
# Accuracy and ROC for GBC

from sklearn.ensemble import GradientBoostingClassifier
clf4 = GradientBoostingClassifier()
validate(X,y,clf4)

Accuracy 0.9911242603550295
ROC 0.9911340061676478


In [8]:
# Accuracy and ROC for SVC

clf2 = SVC(C = 6, kernel = 'rbf', random_state = 0)
validate(X,y,clf2)

Accuracy 0.9792899408284024
ROC 0.979779927109616


In [9]:
# Accuracy and ROC for RabdomForest

clf3 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 123,
                                       min_samples_leaf = 1, min_samples_split = 3, n_jobs = -1)
validate(X,y,clf3)

Accuracy 0.9846153846153844
ROC 0.9849172974488365


The GradientBoostingClassifier gives the best performance for the given dataset. Further tests have to be performed on various tree and boosting models to identify the best model.

In [10]:
# Estimator helper class is used to perform grid search on multiple models.

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs 
    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [11]:
# Standard imports and preparation of parameters for the model

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
    'RandomForestClassifier': { 'n_estimators': [16, 32] },
    'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [1,5,6,8,9,10,16, 32], 'learning_rate': [0.1,0.5,0.6,0.7,0.8, 1.0]},
    'SVC': [
        {'kernel': ['linear'], 'C': [1, 10]},
        {'kernel': ['rbf'], 'C': [1, 6, 10], 'gamma': [0.001, 0.0001]},
    ]
}

In [12]:
# Executing the helper class

helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_train, y_train, scoring='f1', n_jobs=2)

Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Done   3 out of   6 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.2s finished


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Done   3 out of   6 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Done   3 out of   6 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    0.2s finished


Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Running GridSearchCV for SVC.
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=2)]: Done 144 out of 144 | elapsed:    1.4s finished
[Parallel(n_jobs=2)]: Done  24 out of  24 | elapsed:    1.1s finished


In [13]:
# Getting the helper score.

helper1.score_summary(sort_by='max_score')

ExtraTreesClassifier
RandomForestClassifier
AdaBoostClassifier
GradientBoostingClassifier
SVC


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,gamma,kernel,learning_rate,n_estimators
38,GradientBoostingClassifier,0.980973,0.986571,0.995763,0.0065507,,,,0.8,1.0
22,GradientBoostingClassifier,0.980973,0.986571,0.995763,0.0065507,,,,0.6,1.0
30,GradientBoostingClassifier,0.980973,0.986571,0.995763,0.0065507,,,,0.7,1.0
14,GradientBoostingClassifier,0.980973,0.986571,0.995763,0.0065507,,,,0.5,1.0
6,GradientBoostingClassifier,0.980973,0.986571,0.995763,0.0065507,,,,0.1,1.0
46,GradientBoostingClassifier,0.980973,0.986571,0.995763,0.0065507,,,,1.0,1.0
5,AdaBoostClassifier,0.978448,0.985718,0.991525,0.00543786,,,,,32.0
2,RandomForestClassifier,0.982833,0.987887,0.991489,0.00368022,,,,,16.0
7,GradientBoostingClassifier,0.980973,0.984475,0.989474,0.00362827,,,,0.1,5.0
8,GradientBoostingClassifier,0.980973,0.984475,0.989474,0.00362827,,,,0.1,6.0


## Final model

The final model selected based on the above test is GradientBoostingClassifier. This is pickled.

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
clf_fin = GradientBoostingClassifier(learning_rate=0.8,n_estimators=16)
clf_fin.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.8, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=16,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [15]:
# Creating the dumping

joblib.dump(clf_fin, 'model_joblib')

['model_joblib']