In [98]:
# Initial modules
import numpy as np
import pandas as pd
import sklearn

In [99]:
#load data

traindata= pd.read_csv('C:/Users/amatu/Documents/train_imperson_without4n7_balanced_data.csv')
testdata= pd.read_csv('C:/Users/amatu/Documents/test_imperson_without4n7_balanced_data.csv')

In [100]:
# Separate X and Y

X_train, Y_train = traindata.loc[:, traindata.columns != '155'], traindata['155']

X_test, Y_test = testdata.loc[:, testdata.columns != '155'], testdata['155']


In [139]:
#Preprocessing pipeline


from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import Normalizer

from sklearn.pipeline import Pipeline


preprocessing_pipeline = Pipeline([
    ('zero variance', VarianceThreshold()),
    ('scale 0_1', MinMaxScaler()),
    ('top10 features', SelectKBest(chi2, k=20))])

pipe2= Pipeline([
    ('zero variance', VarianceThreshold()),
    ('norm 1', Normalizer()),
    ('top10 features', SelectKBest(chi2, k=20))])


In [140]:
X_train_ready = preprocessing_pipeline.fit_transform(X_train, Y_train)

In [141]:
xtrainp2=pipe2.fit_transform(X_train, Y_train)

In [142]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


from sklearn.ensemble import ExtraTreesClassifier




num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
model = ExtraTreesClassifier()
results = cross_val_score(model, X_train_ready, Y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))




Accuracy: 98.753% (3.524%)




In [143]:
results = cross_val_score(model, xtrainp2, Y_train, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))




Accuracy: 97.064% (5.377%)


In [144]:
#Evaluate on test set

from sklearn.metrics import confusion_matrix

model.fit(X_train_ready, Y_train)

X_test_ready =  preprocessing_pipeline.transform(X_test)

predicted = model.predict(X_test_ready)

matrix = confusion_matrix(Y_test, predicted)
tn, fp, fn, tp = matrix.ravel()
tp, tn




(1468, 20068)

In [145]:
from sklearn.metrics import classification_report


report = classification_report(Y_test, predicted)
print(report)


              precision    recall  f1-score   support

           0       0.52      1.00      0.68     20079
           1       0.99      0.07      0.14     20079

    accuracy                           0.54     40158
   macro avg       0.76      0.54      0.41     40158
weighted avg       0.76      0.54      0.41     40158



In [146]:
model.fit(xtrainp2, Y_train)

X_test_ready2 =  pipe2.transform(X_test)

predicted = model.predict(X_test_ready2)

matrix = confusion_matrix(Y_test, predicted)
tn, fp, fn, tp = matrix.ravel()
tp, tn


(1380, 20067)

In [147]:


report2= classification_report(Y_test, predicted)
print(report2)


              precision    recall  f1-score   support

           0       0.52      1.00      0.68     20079
           1       0.99      0.07      0.13     20079

    accuracy                           0.53     40158
   macro avg       0.75      0.53      0.41     40158
weighted avg       0.75      0.53      0.41     40158



In [148]:

#check trainset columns with var>0

step1 = preprocessing_pipeline.named_steps['zero variance'].get_support() 

x_cols = X_train.columns.values[step1==True] 

In [149]:
step10= pipe2.named_steps['zero variance'].get_support() 

x_cols10 = X_train.columns.values[step10==True] 

In [150]:
x_cols

array(['5', '6', '8', '9', '14', '15', '16', '18', '20', '26', '29', '38',
       '43', '47', '48', '50', '51', '52', '61', '62', '64', '66', '67',
       '68', '70', '71', '72', '73', '75', '76', '77', '78', '79', '80',
       '82', '83', '84', '86', '88', '89', '90', '93', '94', '97', '98',
       '104', '105', '106', '107', '108', '109', '110', '111', '112',
       '113', '117', '118', '119', '120', '121', '122', '123', '125',
       '126', '127', '128', '129', '130', '133', '138', '140', '141',
       '142', '143', '144', '145', '146', '154'], dtype=object)

In [151]:
x_cols10

array(['5', '6', '8', '9', '14', '15', '16', '18', '20', '26', '29', '38',
       '43', '47', '48', '50', '51', '52', '61', '62', '64', '66', '67',
       '68', '70', '71', '72', '73', '75', '76', '77', '78', '79', '80',
       '82', '83', '84', '86', '88', '89', '90', '93', '94', '97', '98',
       '104', '105', '106', '107', '108', '109', '110', '111', '112',
       '113', '117', '118', '119', '120', '121', '122', '123', '125',
       '126', '127', '128', '129', '130', '133', '138', '140', '141',
       '142', '143', '144', '145', '146', '154'], dtype=object)

In [152]:

#check trainset columns after applying select k best

step3=preprocessing_pipeline.named_steps['top10 features'].get_support()

xk10= x_cols[step3==True] 

In [153]:
step30=pipe2.named_steps['top10 features'].get_support()

xk100= x_cols10[step30==True] 

In [154]:
xk100

array(['8', '9', '47', '50', '51', '66', '67', '68', '70', '71', '73',
       '77', '82', '94', '130', '140', '142', '145', '146', '154'],
      dtype=object)

In [155]:
xk10

array(['8', '9', '47', '50', '51', '66', '67', '68', '70', '71', '73',
       '82', '94', '110', '122', '129', '130', '145', '146', '154'],
      dtype=object)

In [156]:
x2df = pd.DataFrame(xtrainp2, columns = xk100)

In [157]:
x2df.head()

Unnamed: 0,8,9,47,50,51,66,67,68,70,71,73,77,82,94,130,140,142,145,146,154
0,0.002125,0.002125,0.205949,0.0,0.23224,0.23224,0.071458,0.11612,0.0,0.23224,0.0,0.002732,0.219765,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.109462,0.0,0.25224,0.12612,0.25224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.016454,0.016454,0.0,0.233105,0.0,0.0,0.143448,0.0,0.0,0.0,0.0,0.001371,0.059542,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.021168,0.021168,0.0,0.223356,0.0,0.0,0.137449,0.0,0.0,0.0,0.0,0.001314,0.016254,0.223356,0.223356,0.0,0.0,0.0,0.0,0.0
4,0.016439,0.016439,0.0,0.232884,0.0,0.0,0.143312,0.0,0.0,0.0,0.0,0.00137,0.059828,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
X_train.loc[0:4, '68']

0    0.5
1    0.0
2    0.0
3    0.0
4    0.0
Name: 68, dtype: float64

In [159]:
#check testset columns with var>0

s1test = preprocessing_pipeline.named_steps['zero variance'].get_support() 

xtest_cols = X_test.columns.values[s1test==True] 

In [160]:
xtest_cols10 = X_test.columns.values[step10==True] 

In [161]:
xtest_cols10

array(['5', '6', '8', '9', '14', '15', '16', '18', '20', '26', '29', '38',
       '43', '47', '48', '50', '51', '52', '61', '62', '64', '66', '67',
       '68', '70', '71', '72', '73', '75', '76', '77', '78', '79', '80',
       '82', '83', '84', '86', '88', '89', '90', '93', '94', '97', '98',
       '104', '105', '106', '107', '108', '109', '110', '111', '112',
       '113', '117', '118', '119', '120', '121', '122', '123', '125',
       '126', '127', '128', '129', '130', '133', '138', '140', '141',
       '142', '143', '144', '145', '146', '154'], dtype=object)

In [162]:
xtest_cols

array(['5', '6', '8', '9', '14', '15', '16', '18', '20', '26', '29', '38',
       '43', '47', '48', '50', '51', '52', '61', '62', '64', '66', '67',
       '68', '70', '71', '72', '73', '75', '76', '77', '78', '79', '80',
       '82', '83', '84', '86', '88', '89', '90', '93', '94', '97', '98',
       '104', '105', '106', '107', '108', '109', '110', '111', '112',
       '113', '117', '118', '119', '120', '121', '122', '123', '125',
       '126', '127', '128', '129', '130', '133', '138', '140', '141',
       '142', '143', '144', '145', '146', '154'], dtype=object)

In [163]:
#check testset columns after applying select k best

s3test=preprocessing_pipeline.named_steps['top10 features'].get_support()

xtestk10= xtest_cols[s3test==True] 

In [164]:
xtestk100= xtest_cols[step30==True] 

In [165]:
xtestk10

array(['8', '9', '47', '50', '51', '66', '67', '68', '70', '71', '73',
       '82', '94', '110', '122', '129', '130', '145', '146', '154'],
      dtype=object)

In [166]:
xtestk100

array(['8', '9', '47', '50', '51', '66', '67', '68', '70', '71', '73',
       '77', '82', '94', '130', '140', '142', '145', '146', '154'],
      dtype=object)

In [167]:
X_test_ready[1:3, 0]

array([0.98824, 0.98824])

In [168]:
X_test.loc[1:3, '8']

1    0.98824
2    0.98824
3    0.00000
Name: 8, dtype: float64

In [169]:
X_testdf = pd.DataFrame(X_test_ready2, columns = xtestk100)

In [170]:
X_testdf.loc[0:4, '67']

0    0.128264
1    0.128572
2    0.127882
3    0.248559
4    0.123110
Name: 67, dtype: float64

In [171]:
X_test.loc[0:4, '67']

0    0.61538
1    0.61538
2    0.61538
3    1.00000
4    0.61538
Name: 67, dtype: float64