### 3. Model Training

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot

In [2]:
import warnings
warnings.filterwarnings('ignore')

#### Load dataset

In [3]:
# Feature matrix.
X = pd.read_csv('X.csv')
# Target labels. read_csv always returns a DataFrame; squeeze("columns") turns
# a one-column frame into a 1-D Series (a frame with several columns is
# returned unchanged). scikit-learn estimators expect y of shape (n_samples,)
# and emit a DataConversionWarning when handed a column vector instead.
y = pd.read_csv('y.csv').squeeze("columns")

#### Train test split

In [5]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=5,
)

#### Random Forest model

In [6]:
# Random Forest with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and per-split feature sub-sampling are random:
# without a seed the accuracy, the confusion matrix and the feature-importance
# ranking derived from this model change on every run.
rf = RandomForestClassifier(random_state=5)
# fit RF to training set
rf.fit(X_train, y_train)

RandomForestClassifier()

In [7]:
# Predict a class label for every row of the hold-out split.
y_pred = rf.predict(X_test)

In [8]:
# Fraction of hold-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.6412825651302605

In [9]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[ 56   0   0  21   6   2   4   0   0]
 [  1  31   0   0   2   0  34   0   0]
 [  2   0   2   2   1   0   4   0   0]
 [ 15   2   1  73   2   1  14   0   0]
 [ 10   2   3   8  11   0   4   0   0]
 [  1   1   0   2   2  24  10   0   0]
 [  3   9   3   0   1   1 121   0   0]
 [  1   1   0   0   0   0   1   0   0]
 [  1   0   0   1   0   0   0   0   2]]


#### Feature Importance

In [10]:
# Top 25 features
# Label each importance score with its training column name and order the
# result from most to least important.
features = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
print(features.head(25))

component №2     0.023604
component №3     0.017685
component №4     0.014677
component №16    0.014261
component №13    0.014184
component №12    0.013470
component №35    0.013255
component №9     0.012900
component №18    0.012722
component №25    0.012618
component №8     0.012500
component №19    0.012488
component №7     0.012446
component №15    0.012137
component №22    0.012102
component №33    0.012025
component №14    0.011817
component №10    0.011727
component №5     0.011548
component №64    0.011408
component №29    0.011276
component №11    0.011205
component №17    0.011064
component №70    0.011011
component №36    0.010919
dtype: float64


In [16]:
# Names of the 25 highest-ranked features, in ranking order.
# `features` is already sorted by descending importance, so its leading index
# entries are the top ones. Slicing (instead of indexing positions 0..24 one
# by one) also copes with a model that has fewer than 25 features.
feature_list = features.index[:25].tolist()

In [18]:
# Independent copy of the full feature matrix restricted to the selected
# columns (all rows kept), so later edits do not touch X.
X_feature = X.loc[:, feature_list].copy()

In [19]:
# Preview the selected-feature frame (rendered as the cell's output).
X_feature

Unnamed: 0,component №2,component №3,component №4,component №16,component №13,component №12,component №35,component №9,component №18,component №25,...,component №33,component №14,component №10,component №5,component №64,component №29,component №11,component №17,component №70,component №36
0,-0.052967,-0.009110,-0.048373,0.053959,0.027003,-0.039624,0.101617,-0.093022,-0.019615,-0.050326,...,0.015592,0.028114,0.023448,-0.021367,0.016322,-0.157155,-0.032858,0.021163,-0.001972,0.015169
1,-0.107192,-0.067519,0.096123,-0.013960,0.022043,-0.019943,0.009838,-0.037674,0.043535,-0.003936,...,0.023506,0.017955,0.025788,0.095367,-0.019337,-0.041335,-0.041207,-0.007358,-0.046151,0.018553
2,-0.107192,-0.067519,0.096123,-0.013960,0.022043,-0.019943,0.009838,-0.037674,0.043535,-0.003936,...,0.023506,0.017955,0.025788,0.095367,-0.019337,-0.041335,-0.041207,-0.007358,-0.046151,0.018553
3,-0.078922,-0.050858,0.058433,-0.035953,-0.046607,0.132207,-0.022108,-0.020254,0.014758,-0.045953,...,-0.009628,0.016455,0.036479,0.001345,-0.032662,0.030128,-0.140624,0.019857,-0.023420,-0.018075
4,-0.027762,0.017160,0.046482,-0.004018,0.027140,-0.028391,-0.090351,-0.042904,0.015645,-0.033890,...,0.031928,-0.027954,-0.008979,0.069179,-0.024677,-0.040416,-0.020957,-0.002719,0.036321,0.009030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3316,-0.056060,-0.011623,-0.006953,0.013618,-0.015145,-0.020178,-0.015675,-0.027799,-0.000163,-0.067528,...,-0.034576,0.003825,0.018952,0.021266,0.031151,0.061524,-0.075737,0.023422,0.021556,-0.047815
3317,-0.050790,-0.020150,-0.025461,0.048647,-0.008414,-0.017241,0.023434,-0.034639,0.002886,-0.060133,...,-0.053346,0.013372,0.022062,0.003461,0.064309,0.077412,-0.087247,0.023834,0.012669,-0.035728
3318,0.027652,0.049176,-0.077737,-0.013794,-0.083186,-0.049573,-0.049746,-0.005252,0.039897,0.110985,...,-0.052938,-0.053569,-0.088748,0.025920,0.021523,-0.027130,-0.029378,-0.025919,-0.001277,-0.055269
3319,-0.038513,0.008694,-0.079274,-0.104432,-0.023401,-0.016747,-0.025761,0.087190,0.004224,0.064069,...,0.028334,-0.073494,-0.057118,-0.083006,0.031735,-0.019661,0.008252,0.019637,-0.010940,-0.010206


In [33]:
# Strip the 'component №' prefix so every column is named by its bare
# component number; the prefix is matched as a literal string.
X_feature.columns = X_feature.columns.str.replace('component №', '', regex=False)

In [34]:
# Preview the frame again to confirm the shortened column names.
X_feature

Unnamed: 0,2,3,4,16,13,12,35,9,18,25,...,33,14,10,5,64,29,11,17,70,36
0,-0.052967,-0.009110,-0.048373,0.053959,0.027003,-0.039624,0.101617,-0.093022,-0.019615,-0.050326,...,0.015592,0.028114,0.023448,-0.021367,0.016322,-0.157155,-0.032858,0.021163,-0.001972,0.015169
1,-0.107192,-0.067519,0.096123,-0.013960,0.022043,-0.019943,0.009838,-0.037674,0.043535,-0.003936,...,0.023506,0.017955,0.025788,0.095367,-0.019337,-0.041335,-0.041207,-0.007358,-0.046151,0.018553
2,-0.107192,-0.067519,0.096123,-0.013960,0.022043,-0.019943,0.009838,-0.037674,0.043535,-0.003936,...,0.023506,0.017955,0.025788,0.095367,-0.019337,-0.041335,-0.041207,-0.007358,-0.046151,0.018553
3,-0.078922,-0.050858,0.058433,-0.035953,-0.046607,0.132207,-0.022108,-0.020254,0.014758,-0.045953,...,-0.009628,0.016455,0.036479,0.001345,-0.032662,0.030128,-0.140624,0.019857,-0.023420,-0.018075
4,-0.027762,0.017160,0.046482,-0.004018,0.027140,-0.028391,-0.090351,-0.042904,0.015645,-0.033890,...,0.031928,-0.027954,-0.008979,0.069179,-0.024677,-0.040416,-0.020957,-0.002719,0.036321,0.009030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3316,-0.056060,-0.011623,-0.006953,0.013618,-0.015145,-0.020178,-0.015675,-0.027799,-0.000163,-0.067528,...,-0.034576,0.003825,0.018952,0.021266,0.031151,0.061524,-0.075737,0.023422,0.021556,-0.047815
3317,-0.050790,-0.020150,-0.025461,0.048647,-0.008414,-0.017241,0.023434,-0.034639,0.002886,-0.060133,...,-0.053346,0.013372,0.022062,0.003461,0.064309,0.077412,-0.087247,0.023834,0.012669,-0.035728
3318,0.027652,0.049176,-0.077737,-0.013794,-0.083186,-0.049573,-0.049746,-0.005252,0.039897,0.110985,...,-0.052938,-0.053569,-0.088748,0.025920,0.021523,-0.027130,-0.029378,-0.025919,-0.001277,-0.055269
3319,-0.038513,0.008694,-0.079274,-0.104432,-0.023401,-0.016747,-0.025761,0.087190,0.004224,0.064069,...,0.028334,-0.073494,-0.057118,-0.083006,0.031735,-0.019661,0.008252,0.019637,-0.010940,-0.010206


In [35]:
# Persist the selected features; index=False leaves the row index out of the file.
X_feature.to_csv(path_or_buf='X_feature.csv', index=False)

#### Random Forest model - top 25 features

In [36]:
# Same 15% hold-out and seed as the first split, now on the selected features.
# This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_feature,
    y,
    test_size=0.15,
    random_state=5,
)

In [37]:
# Random Forest with default hyper-parameters, retrained on the selected
# features only.
# random_state is pinned (same seed as the train/test split) because the
# forest's bootstrap sampling and per-split feature sub-sampling are random:
# without a seed the reported metrics and the model written to disk further
# down differ on every run.
rf = RandomForestClassifier(random_state=5)
# fit RF to training set
rf.fit(X_train, y_train)

RandomForestClassifier()

In [38]:
# Predict a class label for every row of the hold-out split.
y_pred = rf.predict(X_test)

In [39]:
# Fraction of hold-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.6192384769539078

In [40]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[ 54   0   0  19   4   2  10   0   0]
 [  1  34   0   2   1   0  30   0   0]
 [  2   0   1   3   1   0   4   0   0]
 [ 23   1   1  72   1   0  10   0   0]
 [ 11   2   1   9   8   1   6   0   0]
 [  2   1   0   0   2  26   9   0   0]
 [  5  12   2   2   2   1 114   0   0]
 [  0   1   0   0   0   0   2   0   0]
 [  2   0   0   2   0   0   0   0   0]]


#### Save Random Forest model binary

In [41]:
import joblib

# Serialize the forest fitted on the selected features. dump() returns the
# list of file names it wrote, which the notebook shows as the cell output.
joblib.dump(value=rf, filename="rf_model.pkl")

['rf_model.pkl']

#### Export data column / feature names

In [42]:
# Column labels of the selected-feature frame as a plain Python list, in
# column order, ready for JSON serialization.
feature_name = X_feature.columns.tolist()

In [43]:
import json

# Write the feature-name list as a JSON array.
with open('feature_name.json', 'w') as fh:
    fh.write(json.dumps(feature_name))

#### Prep. test set for top 25 features

In [44]:
# Load the separate test-set file.
# NOTE(review): this rebinds X_test, replacing the hold-out split created by
# train_test_split earlier in the notebook; cells that evaluate the model on
# that split must not be re-run after this point without re-running the split.
X_test = pd.read_csv('X_test.csv')

In [45]:
# Independent copy of the test file restricted to the selected feature
# columns (all rows kept), so later edits do not touch X_test.
X_test_f = X_test.loc[:, feature_list].copy()

In [46]:
# Preview the selected-feature test frame (rendered as the cell's output).
X_test_f

Unnamed: 0,component №2,component №3,component №4,component №16,component №13,component №12,component №35,component №9,component №18,component №25,...,component №33,component №14,component №10,component №5,component №64,component №29,component №11,component №17,component №70,component №36
0,0.014929,-0.060907,0.034164,-0.100529,-0.026622,-0.025436,0.042891,0.034505,-0.026351,0.062754,...,0.150755,-0.025685,0.004785,-0.010582,-0.004006,0.056398,-0.011909,-0.015788,-0.059728,0.016516
1,-0.060327,0.049973,0.038471,-0.068424,-0.062152,-0.049553,0.014523,-0.023428,0.020097,0.003693,...,-0.003643,-0.019748,0.017536,-0.000872,-0.033475,-0.021050,-0.057650,0.026305,0.009957,-0.015070
2,-0.045953,-0.037609,0.018070,-0.019940,-0.047387,-0.037130,0.040003,-0.051081,0.001171,0.003366,...,0.008851,0.014285,0.061721,0.003177,0.004805,0.022143,-0.054455,-0.014218,0.015962,-0.005024
3,-0.046223,-0.006951,-0.128170,-0.022122,0.012852,-0.026477,0.040470,0.073448,-0.010406,-0.014131,...,0.026171,-0.036414,-0.048588,-0.098044,0.004155,-0.134896,-0.002387,0.013311,0.047101,-0.011256
4,-0.063641,0.016393,-0.049943,0.052360,-0.034871,0.115220,0.024476,-0.224386,-0.004639,0.017464,...,0.023132,0.012919,-0.092244,-0.044240,-0.002582,-0.028865,0.266745,-0.006083,-0.006686,0.026869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,-0.054216,-0.020065,-0.013974,-0.035452,-0.040972,-0.041540,0.019752,-0.005455,0.001656,0.005040,...,0.048731,-0.014375,0.002483,-0.011606,0.007260,0.073266,-0.040885,0.009371,0.014802,-0.013577
982,-0.025311,-0.001032,-0.034549,0.019150,-0.058504,-0.063369,-0.021830,-0.030238,0.025039,0.076040,...,-0.057442,-0.001270,-0.009495,0.010613,0.003303,-0.043513,-0.054483,-0.038135,-0.011107,0.013881
983,-0.117473,-0.050821,0.065298,-0.171218,-0.131419,0.077380,0.052945,-0.204790,0.050285,-0.049738,...,0.019404,0.031340,0.006319,-0.047217,0.014478,0.050597,0.212493,-0.072811,0.015554,-0.031302
984,-0.048879,-0.010323,-0.033686,0.031977,0.025786,-0.041604,-0.006537,-0.041886,-0.028209,0.005868,...,0.079645,0.029850,0.025600,0.036017,-0.005230,-0.053971,-0.047757,-0.020019,0.006406,0.048128


In [47]:
# Strip the 'component №' prefix so the test columns carry the same bare
# component numbers as the training frame; the prefix is matched literally.
X_test_f.columns = X_test_f.columns.str.replace('component №', '', regex=False)

In [48]:
# Preview the frame again to confirm the shortened column names.
X_test_f

Unnamed: 0,2,3,4,16,13,12,35,9,18,25,...,33,14,10,5,64,29,11,17,70,36
0,0.014929,-0.060907,0.034164,-0.100529,-0.026622,-0.025436,0.042891,0.034505,-0.026351,0.062754,...,0.150755,-0.025685,0.004785,-0.010582,-0.004006,0.056398,-0.011909,-0.015788,-0.059728,0.016516
1,-0.060327,0.049973,0.038471,-0.068424,-0.062152,-0.049553,0.014523,-0.023428,0.020097,0.003693,...,-0.003643,-0.019748,0.017536,-0.000872,-0.033475,-0.021050,-0.057650,0.026305,0.009957,-0.015070
2,-0.045953,-0.037609,0.018070,-0.019940,-0.047387,-0.037130,0.040003,-0.051081,0.001171,0.003366,...,0.008851,0.014285,0.061721,0.003177,0.004805,0.022143,-0.054455,-0.014218,0.015962,-0.005024
3,-0.046223,-0.006951,-0.128170,-0.022122,0.012852,-0.026477,0.040470,0.073448,-0.010406,-0.014131,...,0.026171,-0.036414,-0.048588,-0.098044,0.004155,-0.134896,-0.002387,0.013311,0.047101,-0.011256
4,-0.063641,0.016393,-0.049943,0.052360,-0.034871,0.115220,0.024476,-0.224386,-0.004639,0.017464,...,0.023132,0.012919,-0.092244,-0.044240,-0.002582,-0.028865,0.266745,-0.006083,-0.006686,0.026869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,-0.054216,-0.020065,-0.013974,-0.035452,-0.040972,-0.041540,0.019752,-0.005455,0.001656,0.005040,...,0.048731,-0.014375,0.002483,-0.011606,0.007260,0.073266,-0.040885,0.009371,0.014802,-0.013577
982,-0.025311,-0.001032,-0.034549,0.019150,-0.058504,-0.063369,-0.021830,-0.030238,0.025039,0.076040,...,-0.057442,-0.001270,-0.009495,0.010613,0.003303,-0.043513,-0.054483,-0.038135,-0.011107,0.013881
983,-0.117473,-0.050821,0.065298,-0.171218,-0.131419,0.077380,0.052945,-0.204790,0.050285,-0.049738,...,0.019404,0.031340,0.006319,-0.047217,0.014478,0.050597,0.212493,-0.072811,0.015554,-0.031302
984,-0.048879,-0.010323,-0.033686,0.031977,0.025786,-0.041604,-0.006537,-0.041886,-0.028209,0.005868,...,0.079645,0.029850,0.025600,0.036017,-0.005230,-0.053971,-0.047757,-0.020019,0.006406,0.048128


In [49]:
# Persist the test features before any drift is injected; index=False leaves
# the row index out of the file.
X_test_f.to_csv(path_or_buf='X_test_f.csv', index=False)

#### Prep. drift dataset

In [50]:
# Inject drift into component 16 by scaling it 200x.
# The scaled values are computed from the unmodified source column in X_test
# (X_test_f was copied from it) instead of from X_test_f itself, so running
# this cell more than once does not compound the scaling.
# NOTE(review): the hard-coded component number assumes this column made the
# top-25 selection; the assert keeps the cell failing fast when it did not.
assert '16' in X_test_f.columns, "component 16 is not among the selected features"
X_test_f['16'] = X_test['component №16'] * 200

In [52]:
# Inject drift into components 35 (scaled 200x) and 64 (scaled 200x and
# sign-flipped).
# The scaled values are computed from the unmodified source columns in X_test
# (X_test_f was copied from it) instead of from X_test_f itself, so running
# this cell more than once does not compound the scaling.
# NOTE(review): the hard-coded component numbers assume these columns made the
# top-25 selection; the asserts keep the cell failing fast when they did not.
assert '35' in X_test_f.columns, "component 35 is not among the selected features"
assert '64' in X_test_f.columns, "component 64 is not among the selected features"
X_test_f['35'] = X_test['component №35'] * 200
X_test_f['64'] = X_test['component №64'] * 200 * (-1)

In [53]:
# Preview the frame after drift injection; columns 16, 35 and 64 now hold the
# rescaled values.
X_test_f

Unnamed: 0,2,3,4,16,13,12,35,9,18,25,...,33,14,10,5,64,29,11,17,70,36
0,0.014929,-0.060907,0.034164,-20.105779,-0.026622,-0.025436,8.578163,0.034505,-0.026351,0.062754,...,0.150755,-0.025685,0.004785,-0.010582,0.801247,0.056398,-0.011909,-0.015788,-0.059728,0.016516
1,-0.060327,0.049973,0.038471,-13.684769,-0.062152,-0.049553,2.904611,-0.023428,0.020097,0.003693,...,-0.003643,-0.019748,0.017536,-0.000872,6.695024,-0.021050,-0.057650,0.026305,0.009957,-0.015070
2,-0.045953,-0.037609,0.018070,-3.988092,-0.047387,-0.037130,8.000593,-0.051081,0.001171,0.003366,...,0.008851,0.014285,0.061721,0.003177,-0.961077,0.022143,-0.054455,-0.014218,0.015962,-0.005024
3,-0.046223,-0.006951,-0.128170,-4.424435,0.012852,-0.026477,8.093953,0.073448,-0.010406,-0.014131,...,0.026171,-0.036414,-0.048588,-0.098044,-0.831023,-0.134896,-0.002387,0.013311,0.047101,-0.011256
4,-0.063641,0.016393,-0.049943,10.472032,-0.034871,0.115220,4.895274,-0.224386,-0.004639,0.017464,...,0.023132,0.012919,-0.092244,-0.044240,0.516368,-0.028865,0.266745,-0.006083,-0.006686,0.026869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,-0.054216,-0.020065,-0.013974,-7.090472,-0.040972,-0.041540,3.950411,-0.005455,0.001656,0.005040,...,0.048731,-0.014375,0.002483,-0.011606,-1.452063,0.073266,-0.040885,0.009371,0.014802,-0.013577
982,-0.025311,-0.001032,-0.034549,3.829902,-0.058504,-0.063369,-4.366064,-0.030238,0.025039,0.076040,...,-0.057442,-0.001270,-0.009495,0.010613,-0.660617,-0.043513,-0.054483,-0.038135,-0.011107,0.013881
983,-0.117473,-0.050821,0.065298,-34.243587,-0.131419,0.077380,10.589075,-0.204790,0.050285,-0.049738,...,0.019404,0.031340,0.006319,-0.047217,-2.895621,0.050597,0.212493,-0.072811,0.015554,-0.031302
984,-0.048879,-0.010323,-0.033686,6.395419,0.025786,-0.041604,-1.307376,-0.041886,-0.028209,0.005868,...,0.079645,0.029850,0.025600,0.036017,1.046068,-0.053971,-0.047757,-0.020019,0.006406,0.048128


In [54]:
# Persist the drifted copy of the test features under its own file name;
# index=False leaves the row index out of the file.
X_test_f.to_csv(path_or_buf='X_drift.csv', index=False)