In [117]:
# Initial modules
import numpy as np
import pandas as pd
import sklearn
np.random.seed(404) 

In [118]:
#load data

traindata= pd.read_csv('datasets/train_imperson_without4n7_balanced_data.csv')
testdata= pd.read_csv('datasets/test_imperson_without4n7_balanced_data.csv')

In [119]:
# Separate X and Y

X_train, Y_train = traindata.loc[:, traindata.columns != '155'], traindata['155']

X_test, Y_test = testdata.loc[:, testdata.columns != '155'], testdata['155']


In [120]:
#Preprocessing pipeline

from sklearn.feature_selection import RFE
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer

from sklearn.pipeline import Pipeline


preprocessing_pipeline = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top20 features', SelectKBest(chi2, k=20))])

pipe2= Pipeline([
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top20 features', SelectKBest(chi2, k=20))])


In [121]:
X_train_ready = preprocessing_pipeline.fit_transform(X_train, Y_train)

In [122]:
xtrainp2=pipe2.fit_transform(X_train, Y_train)

In [123]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


from sklearn.linear_model import LogisticRegression




num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
model = LogisticRegression(C=1000)
results = cross_val_score(model, X_train_ready, Y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))




Accuracy: 97.198% (5.518%)


In [77]:

num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
model2 = LogisticRegression()


results = cross_val_score(model2, xtrainp2, Y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))




Accuracy: 94.518% (9.900%)


In [78]:
#Evaluate on test set

from sklearn.metrics import confusion_matrix

model.fit(X_train_ready, Y_train)

X_test_ready =  preprocessing_pipeline.transform(X_test)

predicted = model.predict(X_test_ready)

matrix = confusion_matrix(Y_test, predicted)
tn, fp, fn, tp = matrix.ravel()
tp, tn




(19938, 19654)

###  hyper parameter tuning logistic regression


### cv on training data via gridsearch showed good accuracy for all datasets, but when it came to the test data only min max scaler performed well 

In [79]:
from sklearn.model_selection import GridSearchCV

## grid searchCV
pipeline1 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model',LogisticRegression())])



grid=GridSearchCV(cv=10,
            estimator=pipeline1,
             param_grid={'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
                  scoring = 'accuracy',
                 n_jobs=-1)
    
    
grid.fit(X_train, Y_train)
#sorted(pipeline1.get_params().keys()) #list of the parameters you can tune 
print(grid.best_params_)
print(grid.best_score_)




{'model__C': 1000}
0.9875520382506904


In [88]:
pipeline2= Pipeline([
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model',LogisticRegression())])

grid=GridSearchCV(cv=10,
            estimator=pipeline1,
             param_grid={'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
                  scoring = 'accuracy',
                 n_jobs=-1)

grid.fit(X_train, Y_train)

print(grid.best_params_)
print(grid.best_score_)



{'model__C': 1000}
0.9875520382506904


In [133]:
from sklearn.metrics import accuracy_score
pipeline1 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model',LogisticRegression(C=1000))])

pipeline1.fit(X_train,Y_train)

Y_predict=pipeline1.predict(X_test)
accuracy_score(Y_test, Y_predict)

report1= classification_report(Y_test, Y_predict)
print(report1)



              precision    recall  f1-score   support

           0       0.99      0.98      0.99     20079
           1       0.98      0.99      0.99     20079

    accuracy                           0.99     40158
   macro avg       0.99      0.99      0.99     40158
weighted avg       0.99      0.99      0.99     40158



In [139]:
from sklearn.metrics import accuracy_score
pipeline2 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model',LogisticRegression(C=1000))])

pipeline2.fit(X_train,Y_train)

Y_predict=pipeline2.predict(X_test)
accuracy_score(Y_test, Y_predict)

report12= classification_report(Y_test, Y_predict)
print(report2)



              precision    recall  f1-score   support

           0       0.51      0.96      0.66     20079
           1       0.64      0.07      0.12     20079

    accuracy                           0.51     40158
   macro avg       0.57      0.51      0.39     40158
weighted avg       0.57      0.51      0.39     40158



In [140]:
from sklearn.metrics import accuracy_score
pipeline3 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('model',LogisticRegression(C=1000))])

pipeline3.fit(X_train,Y_train)

Y_predict=pipeline3.predict(X_test)
accuracy_score(Y_test, Y_predict)

report3= classification_report(Y_test, Y_predict)
print(report3)



              precision    recall  f1-score   support

           0       0.52      0.97      0.67     20079
           1       0.75      0.09      0.15     20079

    accuracy                           0.53     40158
   macro avg       0.63      0.53      0.41     40158
weighted avg       0.63      0.53      0.41     40158



In [141]:
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import f_classif
pipeline4 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('power', PowerTransformer(method = 'yeo-johnson')),
    ('top20 features', SelectKBest(f_classif, k=20)),
    ('model',LogisticRegression(C=1000))])

pipeline4.fit(X_train,Y_train)

Y_predict=pipeline4.predict(X_test)
accuracy_score(Y_test, Y_predict)

report4= classification_report(Y_test, Y_predict)
print(report4)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


              precision    recall  f1-score   support

           0       1.00      0.78      0.88     20079
           1       0.82      1.00      0.90     20079

    accuracy                           0.89     40158
   macro avg       0.91      0.89      0.89     40158
weighted avg       0.91      0.89      0.89     40158



# Adaboost classifier

In [87]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X_train_ready, Y_train, cv=5)
scores.mean()

0.987479427106229

In [None]:
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X_test_ready,Y_test, cv=5)
scores.mean()

# Adaboost trained on training data and evaluated on test data

In [102]:
%time
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_train_ready, Y_train)
X_test_ready =  preprocessing_pipeline.transform(X_test)
scores = cross_val_score(clf, X_test_ready,Y_test, cv=5)
scores.mean()

Wall time: 0 ns


0.9918077503187748

### ada boost did well on varience threshold only and minimaxscaler

In [111]:
pipeline5 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top20 features', SelectKBest(chi2, k=20)),
    ('ada',AdaBoostClassifier())])



grid=GridSearchCV(cv=10,
            estimator=pipeline5,
             param_grid={'ada__n_estimators': [10,50,100,150],
                  'ada__learning_rate':[0.001,0.1,0.01]},
                  scoring = 'accuracy',
                 n_jobs=-1)
    
    
grid.fit(X_train, Y_train)
sorted(pipeline5.get_params().keys()) #list of the parameters you can tune 
print(grid.best_params_)
print(grid.best_score_)

{'ada__learning_rate': 0.1, 'ada__n_estimators': 150}
0.9856869049091134


In [108]:
sorted(pipeline5.get_params().keys())

['ada',
 'ada__algorithm',
 'ada__base_estimator',
 'ada__learning_rate',
 'ada__n_estimators',
 'ada__random_state',
 'memory',
 'scale 0_1',
 'scale 0_1__copy',
 'scale 0_1__feature_range',
 'steps',
 'top20 features',
 'top20 features__k',
 'top20 features__score_func',
 'verbose',
 'zero variance',
 'zero variance__threshold']

In [143]:
pipeline6 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top20 features', SelectKBest(f_classif, k=20)),
    ('ada',AdaBoostClassifier(n_estimators=150,learning_rate=0.1))])

pipeline6.fit(X_train,Y_train)

Y_predict=pipeline6.predict(X_test)
accuracy_score(Y_test, Y_predict)

report6= classification_report(Y_test, Y_predict)
print(report6)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     20079
           1       0.98      1.00      0.99     20079

    accuracy                           0.99     40158
   macro avg       0.99      0.99      0.99     40158
weighted avg       0.99      0.99      0.99     40158



In [144]:
pipeline7 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('top20 features', SelectKBest(f_classif, k=20)),
    ('ada',AdaBoostClassifier(n_estimators=150,learning_rate=0.1))])

pipeline7.fit(X_train,Y_train)

Y_predict=pipeline7.predict(X_test)
accuracy_score(Y_test, Y_predict)

report7= classification_report(Y_test, Y_predict)
print(report7)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     20079
           1       0.98      1.00      0.99     20079

    accuracy                           0.99     40158
   macro avg       0.99      0.99      0.99     40158
weighted avg       0.99      0.99      0.99     40158



In [145]:
pipeline8 = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top20 features', SelectKBest(f_classif, k=20)),
    ('ada',AdaBoostClassifier(n_estimators=150,learning_rate=0.1))])

pipeline8.fit(X_train,Y_train)

Y_predict=pipeline8.predict(X_test)
accuracy_score(Y_test, Y_predict)
report8= classification_report(Y_test, Y_predict)
print(report8)

              precision    recall  f1-score   support

           0       0.50      0.99      0.66     20079
           1       0.00      0.00      0.00     20079

    accuracy                           0.49     40158
   macro avg       0.25      0.49      0.33     40158
weighted avg       0.25      0.49      0.33     40158

