In [1]:
# Initial modules
import os
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn

In [2]:
# Load data
#
# The folder holding the two CSV files can be overridden with the
# IMPERSON_DATA_DIR environment variable, so the notebook is not tied to one
# machine. The default is the original local folder, so an existing setup
# keeps working unchanged.
DATA_DIR = Path(os.environ.get('IMPERSON_DATA_DIR', 'C:/Users/amatu/Documents'))

traindata = pd.read_csv(DATA_DIR / 'train_imperson_without4n7_balanced_data.csv')
testdata = pd.read_csv(DATA_DIR / 'test_imperson_without4n7_balanced_data.csv')

In [3]:
# Split each frame into features (X) and labels (Y).
# Column '155' holds the label; every other column is a feature.

X_train = traindata.drop(columns='155')
Y_train = traindata['155']

X_test = testdata.drop(columns='155')
Y_test = testdata['155']


In [4]:
#Required modules


from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.pipeline import Pipeline





### Pipeline 1: VarianceThreshold, SelectKBest (f_classif), AdaBoost

In [5]:


from sklearn.model_selection import GridSearchCV

# Pipeline 1, tuned: drop zero-variance features, keep the k best features by
# ANOVA F-score (f_classif), then classify with AdaBoost.
pipeline1 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('top features', SelectKBest(f_classif)),
    # Fixed random_state so re-running the search reproduces the same result.
    ('ada', AdaBoostClassifier(random_state=42)),
])

# Grid keys follow the '<step name>__<parameter>' convention.
# sorted(pipeline1.get_params().keys()) lists every parameter that can be tuned.
grid = GridSearchCV(
    estimator=pipeline1,
    param_grid={
        'ada__n_estimators': [10, 50, 100, 150],
        'ada__learning_rate': [0.001, 0.1, 0.01],
        'top features__k': [15, 20, 30, 35],
    },
    scoring='accuracy',
    cv=10,       # 10-fold cross-validation on the training data
    n_jobs=-1,   # use every available core
)

grid.fit(X_train, Y_train)

print(grid.best_params_)
print(grid.best_score_)



{'ada__learning_rate': 0.01, 'ada__n_estimators': 100, 'top features__k': 35}
0.9389967437451053


In [7]:

# Pipeline 1 fitted on the full training set and scored on the test set.
# The hyper-parameters here are set by hand rather than taken from the grid
# search: per the original note, optimizing k produced a model that overfit
# the training data and performed poorly on test.
# NOTE(review): if these values were picked by looking at test accuracy, the
# score shown for this cell is optimistically biased.
pipeline1 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('top20 features', SelectKBest(f_classif, k=20)),
    # Fixed random_state so the reported accuracy is reproducible.
    ('ada', AdaBoostClassifier(n_estimators=150, learning_rate=0.1, random_state=42)),
])

pipeline1.fit(X_train, Y_train)

# Test-set accuracy as a percentage (last expression, so the cell displays it).
Y_predict = pipeline1.predict(X_test)
accuracy_score(Y_test, Y_predict) * 100


99.07614921061806

### Pipeline 2: VarianceThreshold, Normalizer, SelectKBest (chi2, k=20), logistic regression

In [10]:

from sklearn.model_selection import GridSearchCV

# Pipeline 2, tuned: drop zero-variance features, normalize each sample,
# keep the k best features by chi-squared score, then fit a logistic regression.
# NOTE(review): the step is called 'norm 1', but Normalizer() is used with its
# default norm (L2 in scikit-learn) -- confirm that L1 was not intended.
pipeline2 = Pipeline(steps=[
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top features', SelectKBest(chi2)),
    ('model', LogisticRegression()),
])

# Search over the number of selected features and the regularization constant C.
# sorted(pipeline2.get_params().keys()) lists every parameter that can be tuned.
grid = GridSearchCV(
    estimator=pipeline2,
    param_grid={
        'top features__k': [15, 20, 30, 35],
        'model__C': [0.01, 0.1, 1, 10, 100, 1000],
    },
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

grid.fit(X_train, Y_train)

print(grid.best_params_)
print(grid.best_score_)






{'model__C': 1000, 'top features__k': 20}
0.98313136309303


In [17]:
# Pipeline 2 with fixed hyper-parameters (k=20, C=1000), fitted on the
# training set and scored on the test set.
pipeline2 = Pipeline(steps=[
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model', LogisticRegression(C=1000)),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
Y_predict = pipeline2.fit(X_train, Y_train).predict(X_test)

# Test-set accuracy as a percentage.
accuracy_score(Y_test, Y_predict) * 100





88.75691020469147

### Pipeline 2a: VarianceThreshold, MinMaxScaler, SelectKBest (chi2, k=20), logistic regression

In [12]:
from sklearn.model_selection import GridSearchCV

# Pipeline 2a, tuned: same as Pipeline 2 but with MinMaxScaler in place of
# Normalizer before the chi-squared feature selection.
pipeline2a = Pipeline(steps=[
    ('zero variance', VarianceThreshold()),
    ('minmax', MinMaxScaler()),
    ('top features', SelectKBest(chi2)),
    ('model', LogisticRegression()),
])

# Search over the number of selected features and the regularization constant C.
# sorted(pipeline2a.get_params().keys()) lists every parameter that can be tuned.
grid = GridSearchCV(
    estimator=pipeline2a,
    param_grid={
        'top features__k': [15, 20, 30, 35],
        'model__C': [0.01, 0.1, 1, 10, 100, 1000],
    },
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

grid.fit(X_train, Y_train)

print(grid.best_params_)
print(grid.best_score_)





{'model__C': 1000, 'top features__k': 20}
0.9875520382506904


In [18]:
# Pipeline 2a with fixed hyper-parameters (k=20, C=1000), fitted on the
# training set and scored on the test set.
pipeline2a = Pipeline(steps=[
    ('zero variance', VarianceThreshold()),
    ('minmax', MinMaxScaler()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model', LogisticRegression(C=1000)),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
Y_predict = pipeline2a.fit(X_train, Y_train).predict(X_test)

# Test-set accuracy as a percentage.
accuracy_score(Y_test, Y_predict) * 100





98.59056725932567

### Pipeline 3: VarianceThreshold, MinMaxScaler, SelectKBest (f_classif, k=30), AdaBoost

In [13]:
# Pipeline 3, tuned: drop zero-variance features, rescale features with
# MinMaxScaler, keep the k best features by ANOVA F-score (f_classif), then
# classify with AdaBoost.
# NOTE(review): the best parameters and score recorded for this search are
# identical to those of Pipeline 1, which has no scaler, so the MinMax step
# appears to make no difference with f_classif + AdaBoost. Confirm whether a
# different score function (e.g. chi2) was intended here.
pipeline3 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top features', SelectKBest(f_classif)),
    # Fixed random_state so re-running the search reproduces the same result.
    ('ada', AdaBoostClassifier(random_state=42)),
])

# Grid keys follow the '<step name>__<parameter>' convention.
# sorted(pipeline3.get_params().keys()) lists every parameter that can be tuned.
grid = GridSearchCV(
    estimator=pipeline3,
    param_grid={
        'ada__n_estimators': [10, 50, 100, 150],
        'ada__learning_rate': [0.001, 0.1, 0.01],
        'top features__k': [15, 20, 30, 35],
    },
    scoring='accuracy',
    cv=10,       # 10-fold cross-validation on the training data
    n_jobs=-1,   # use every available core
)

grid.fit(X_train, Y_train)

print(grid.best_params_)
print(grid.best_score_)



{'ada__learning_rate': 0.01, 'ada__n_estimators': 100, 'top features__k': 35}
0.9389967437451053


In [21]:
# Pipeline 3 with hand-picked hyper-parameters (k=30, n_estimators=150,
# learning_rate=0.1), fitted on the training set and scored on the test set.
pipeline3 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    # Step was previously labelled 'top20 features' although it keeps 30
    # features; the neutral name matches the grid-search cell, so keys such as
    # 'top features__k' address this pipeline too.
    ('top features', SelectKBest(f_classif, k=30)),
    # Fixed random_state so the reported accuracy is reproducible.
    ('ada', AdaBoostClassifier(n_estimators=150, learning_rate=0.1, random_state=42)),
])

pipeline3.fit(X_train, Y_train)

# Test-set accuracy as a percentage (last expression, so the cell displays it).
Y_predict = pipeline3.predict(X_test)
accuracy_score(Y_test, Y_predict) * 100

98.91677872404004

### Using the grid-search optimum (k=35, n_estimators=100, learning_rate=0.01) actually yields worse performance on the test set

In [14]:
# Pipeline 3 with the values the grid search reported as best
# (k=35, n_estimators=100, learning_rate=0.01), fitted on the training set and
# scored on the test set.
# NOTE(review): the accuracy recorded for this configuration is 50.0. If the
# test set is balanced, as the file name suggests, that is what a constant
# prediction would score -- worth checking the predicted class distribution.
pipeline3 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    # Step was previously labelled 'top20 features' although it keeps 35
    # features; the neutral name matches the grid-search cell, so keys such as
    # 'top features__k' address this pipeline too.
    ('top features', SelectKBest(f_classif, k=35)),
    # Fixed random_state so the reported accuracy is reproducible.
    ('ada', AdaBoostClassifier(n_estimators=100, learning_rate=0.01, random_state=42)),
])

pipeline3.fit(X_train, Y_train)

# Test-set accuracy as a percentage (last expression, so the cell displays it).
Y_predict = pipeline3.predict(X_test)
accuracy_score(Y_test, Y_predict) * 100

50.0