# Output from Transformer + LSTM into Random Forest ?


In [2]:
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics

In [791]:
X_train = pd.read_pickle('../X_train.pkl')
X_test = pd.read_pickle('../X_test.pkl')
y_train = pd.read_pickle('../y_train.pkl')
y_test = pd.read_pickle('../y_test.pkl')

In [792]:
X_train = X_train.iloc[1:,:]
X_test = X_test.iloc[1:,:]
y_train = y_train.iloc[1:]
y_test = y_test.iloc[1:]

In [793]:
y_train = list(y_train[0])
y_test = list(y_test[0])

In [6]:
X_train['label']=y_train

In [7]:
X_test['label']=y_test

In [8]:
true = X_train[X_train.label==1]
false = X_train[X_train.label==0]

In [9]:
train = false[0:20000].append(true[0:1333])
test  = false[20000:22494].append(true[2000:2166])

In [10]:
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)

In [11]:
X_train = train.iloc[:,:-1]
X_test = test.iloc[:,:-1]
y_train = list(train.iloc[:,-1])
y_test = list(test.iloc[:,-1])

In [11]:
weights = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced',classes=[0.0,1.0],y=y_train)
weights

array([0.533325  , 8.00187547])

In [48]:
model = RandomForestClassifier(class_weight={0.0:weights[0], 1.0:weights[1]}, oob_score=True, n_jobs=-1)
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0.0: 0.533325, 1.0: 8.001875468867217},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [14]:
model = model.fit(X=X_train,y=y_train)

In [15]:
model.score(X=X_train,y=y_train)

0.9999531242675667

In [49]:
sel = SelectFromModel(model)
sel = sel.fit(X_train, y_train)

In [50]:
sel.get_support()

array([False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False,  True, False,
       False, False, False, False, False,  True, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False,  True,

In [51]:
col = X_train.columns[sel.get_support()]

In [12]:
model = RandomForestClassifier(class_weight={0:1, 1:5.5}, oob_score=True, n_jobs=-1,n_estimators=500,criterion='gini')


# Gradient Boosted Trees

In [153]:
model = GradientBoostingClassifier(loss='deviance', learning_rate=1,n_estimators=300, random_state=1234, max_depth=3)


In [154]:
model = model.fit(X=X_train,y=y_train)

### Training set

In [155]:
output = model.predict(X_train)
output_prob = model.predict_proba(X_train)

In [156]:
confusion = metrics.confusion_matrix(y_true=y_train, y_pred=output)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(metrics.accuracy_score(y_true=y_train, y_pred=output)))
print('Precision: '+ str(metrics.precision_score(y_true=y_train, y_pred=output)))
print('Recall: '+ str(metrics.recall_score(y_true=y_train, y_pred=output)))
print('F-measure: '+ str(metrics.f1_score(y_true=y_train, y_pred=output)))
print('Area Under the Curve: '+ str(metrics.roc_auc_score(y_true=y_train, y_score=output_prob[:,1])))
print('Precision-Recall AUC: '+ str(metrics.average_precision_score(y_true=y_train, y_score=output_prob[:,1])))
print('Matthew Correlation Coefficient: '+ str(metrics.matthews_corrcoef(y_true=y_train, y_pred=output)))
print('\n\n')

Confusion matrix: 
 [[18959  1041]
 [  238  1095]]

TP: 1095
FP: 1041
TN: 18959
FN: 238

Accuracy: 0.9400459382177847
Precision: 0.5126404494382022
Recall: 0.8214553638409603
F-measure: 0.6313058518304987
Area Under the Curve: 0.951338728432108
Precision-Recall AUC: 0.4674908371116769
Matthew Correlation Coefficient: 0.6203945522075344





### Testing set

In [157]:
output = model.predict(X_test)
output_prob = model.predict_proba(X_test)

In [158]:
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=output)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(metrics.accuracy_score(y_true=y_test, y_pred=output)))
print('Precision: '+ str(metrics.precision_score(y_true=y_test, y_pred=output)))
print('Recall: '+ str(metrics.recall_score(y_true=y_test, y_pred=output)))
print('F-measure: '+ str(metrics.f1_score(y_true=y_test, y_pred=output)))
print('Area Under the Curve: '+ str(metrics.roc_auc_score(y_true=y_test, y_score=output_prob[:,1])))
print('Precision-Recall AUC: '+ str(metrics.average_precision_score(y_true=y_test, y_score=output_prob[:,1])))
print('Matthew Correlation Coefficient: '+ str(metrics.matthews_corrcoef(y_true=y_test, y_pred=output)))
print('\n\n')

Confusion matrix: 
 [[2344  150]
 [  45  121]]

TP: 121
FP: 150
TN: 2344
FN: 45

Accuracy: 0.9266917293233082
Precision: 0.44649446494464945
Recall: 0.7289156626506024
F-measure: 0.5537757437070938
Area Under the Curve: 0.8768393542091379
Precision-Recall AUC: 0.3869063837409769
Matthew Correlation Coefficient: 0.534794812673835





learning_rate=1,n_estimators=300
Confusion matrix: 
 [[2347  147]
 [  46  120]]

TP: 120
FP: 147
TN: 2347
FN: 46

Accuracy: 0.9274436090225564
Precision: 0.449438202247191
Recall: 0.7228915662650602
F-measure: 0.5542725173210162
Area Under the Curve: 0.8812185389513145
Precision-Recall AUC: 0.39039287153076524
Matthew Correlation Coefficient: 0.5344544945611271



# Extreme Gradient Boost (XGBOOST)

In [794]:
import xgboost as xgb

In [807]:
model = xgb.XGBClassifier(learning_rate=0.05, n_jobs=-1, random_state =1234, objective='binary:logistic',
                          eval_metric='auc', num_boost_round=10, max_depth=6, booster='gbtree',
                         importance_type ='gain',reg_alpha=0.3, reg_lambda=1,base_score=0.5, n_estimators=100,
                         gamma=0.1)

In [808]:
model = model.fit(X=X_train,y=y_train)

In [809]:
output = model.predict(X_train)
output_prob = model.predict_proba(X_train)

In [810]:
confusion = metrics.confusion_matrix(y_true=y_train, y_pred=output)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(metrics.accuracy_score(y_true=y_train, y_pred=output)))
print('Precision: '+ str(metrics.precision_score(y_true=y_train, y_pred=output)))
print('Recall: '+ str(metrics.recall_score(y_true=y_train, y_pred=output)))
print('F-measure: '+ str(metrics.f1_score(y_true=y_train, y_pred=output)))
print('Area Under the Curve: '+ str(metrics.roc_auc_score(y_true=y_train, y_score=output_prob[:,1])))
print('Precision-Recall AUC: '+ str(metrics.average_precision_score(y_true=y_train, y_score=output_prob[:,1])))
print('Matthew Correlation Coefficient: '+ str(metrics.matthews_corrcoef(y_true=y_train, y_pred=output)))
print('\n\n')

Confusion matrix: 
 [[937174  16393]
 [ 38949  26955]]

TP: 26955
FP: 16393
TN: 937174
FN: 38949

Accuracy: 0.9457149835551968
Precision: 0.6218279966780474
Recall: 0.40900400582665697
F-measure: 0.49344634423168454
Area Under the Curve: 0.9386740392303465
Precision-Recall AUC: 0.571352036552205
Matthew Correlation Coefficient: 0.4774998828162578





In [811]:
output = model.predict(X_test)
output_prob = model.predict_proba(X_test)

In [812]:
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=output)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(metrics.accuracy_score(y_true=y_test, y_pred=output)))
print('Precision: '+ str(metrics.precision_score(y_true=y_test, y_pred=output)))
print('Recall: '+ str(metrics.recall_score(y_true=y_test, y_pred=output)))
print('F-measure: '+ str(metrics.f1_score(y_true=y_test, y_pred=output)))
print('Area Under the Curve: '+ str(metrics.roc_auc_score(y_true=y_test, y_score=output_prob[:,1])))
print('Precision-Recall AUC: '+ str(metrics.average_precision_score(y_true=y_test, y_score=output_prob[:,1])))
print('Matthew Correlation Coefficient: '+ str(metrics.matthews_corrcoef(y_true=y_test, y_pred=output)))
print('\n\n')

Confusion matrix: 
 [[116838   2328]
 [  5177   3076]]

TP: 3076
FP: 2328
TN: 116838
FN: 5177

Accuracy: 0.9410998359742268
Precision: 0.5692079940784603
Recall: 0.3727129528656246
F-measure: 0.45046496302262573
Area Under the Curve: 0.9109861921752845
Precision-Recall AUC: 0.49956204527735787
Matthew Correlation Coefficient: 0.43133078398377234



