# Feeding Transformer + LSTM output into a Random Forest?


In [578]:
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics

In [2]:
# Load the four pickled objects saved one directory above the notebook.
# NOTE(review): unpickling can execute arbitrary code -- only point this at
# files you produced yourself or otherwise trust.
X_train, X_test, y_train, y_test = map(
    pd.read_pickle,
    ('../X_train.pkl', '../X_test.pkl', '../y_train.pkl', '../y_test.pkl'),
)

In [3]:
# Discard the first row of every loaded object (positional slice from row 1).
# NOTE(review): the same names are rebound, so re-running this cell drops one
# more row each time -- run it exactly once per fresh load.
X_train, X_test = X_train.iloc[1:, :], X_test.iloc[1:, :]
y_train, y_test = y_train.iloc[1:], y_test.iloc[1:]

In [4]:
# Keep only what is stored under key 0 of each label object, as a plain list.
# NOTE(review): not re-runnable -- after one run the names hold lists.
y_train, y_test = list(y_train[0]), list(y_test[0])

In [5]:
# Attach the labels as a trailing 'label' column. assign() returns a new frame
# instead of writing into X_train, which at this point is an .iloc slice of the
# loaded frame (an in-place column write on a slice can raise
# SettingWithCopyWarning).
X_train = X_train.assign(label=y_train)

In [6]:
# Attach the labels as a trailing 'label' column. assign() returns a new frame
# instead of writing into X_test, which at this point is an .iloc slice of the
# loaded frame (an in-place column write on a slice can raise
# SettingWithCopyWarning).
X_test = X_test.assign(label=y_test)

In [7]:
# Partition the labelled training frame by class:
# rows with label 1 go to `true`, rows with label 0 go to `false`.
true = X_train.loc[X_train['label'] == 1]
false = X_train.loc[X_train['label'] == 0]

In [8]:
# Build both splits from the rows that were loaded as X_train:
#   train = first 20000 negatives + first 1333 positives
#   test  = negatives 20000..22493 + positives 2000..2165
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the same rows in the same order and keeps the original index labels.
# NOTE(review): positives 1333..1999 end up in neither split -- confirm that
# gap is intentional.
train = pd.concat([false[0:20000], true[0:1333]])
test  = pd.concat([false[20000:22494], true[2000:2166]])

In [9]:
# Shuffle the rows (each split was built as negatives followed by positives)
# and renumber the index from 0.
# A fixed random_state makes the shuffle reproducible across kernel restarts.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
test = test.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
# Every column except the last is a feature; the last column is the 'label'
# column appended earlier. This rebinds X_test / y_test, so the objects loaded
# from the *_test pickles are no longer reachable under those names.
X_train, y_train = train.iloc[:, :-1], list(train.iloc[:, -1])
X_test, y_test = test.iloc[:, :-1], list(test.iloc[:, -1])

In [11]:
# 'balanced' per-class weights for classes 0.0 and 1.0, computed from the
# training labels; the bare last line displays the resulting array.
weights = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced',
    classes=[0.0, 1.0],
    y=y_train,
)
weights

array([0.533325  , 8.00187547])

In [48]:
# Random forest weighted by the 'balanced' class weights computed above
# (class 0.0 -> weights[0], class 1.0 -> weights[1]), with out-of-bag scoring
# enabled and all CPU cores in use. The bare last line displays the estimator.
model = RandomForestClassifier(
    class_weight=dict(zip((0.0, 1.0), weights)),
    oob_score=True,
    n_jobs=-1,
)
model

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0.0: 0.533325, 1.0: 8.001875468867217},
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=True, random_state=None, verbose=0,
                       warm_start=False)

In [14]:
# Train on the shuffled training split; fit() hands back the estimator itself,
# and binding it keeps the cell from echoing the estimator repr.
model = model.fit(X_train, y_train)

In [15]:
# Score on the very rows the model was fitted on -- an optimistic number, not
# an estimate of how the model generalises.
model.score(X_train, y_train)

0.9999531242675667

In [49]:
# Importance-based feature selection wrapped around `model`.
# NOTE(review): with the default prefit=False this fit() trains its own copy
# of the estimator rather than reusing an earlier fit -- confirm that is wanted.
sel = SelectFromModel(model).fit(X_train, y_train)

In [50]:
# Boolean mask over the feature columns: True where the selector kept a feature.
sel.get_support(indices=False)

array([False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False,  True, False,
       False, False, False, False, False,  True, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False,  True,

In [51]:
# Names of the feature columns the selector kept, looked up by position.
col = X_train.columns[sel.get_support(indices=True)]

In [572]:
# Alternative forest: hand-picked class weights (1 : 5.5), 500 gini trees,
# out-of-bag scoring, all cores.
# NOTE(review): when the notebook runs top to bottom, the next cell rebinds
# `model` before this estimator is ever fitted.
model = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    class_weight={0: 1, 1: 5.5},
    oob_score=True,
    n_jobs=-1,
)


In [620]:
# Gradient-boosted trees: 200 boosting stages at learning rate 1.
# No class weighting is passed here, unlike the forests above.
model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=1,
)


In [621]:
# Fit whichever estimator `model` currently names on the training split.
model = model.fit(X_train, y_train)

KeyboardInterrupt: 

### Training set

In [None]:
# Per-class probabilities and hard class predictions for the training rows.
output_prob = model.predict_proba(X_train)
output = model.predict(X_train)

In [None]:
def print_binary_report(y_true, y_pred, y_score):
    """Print the confusion matrix and headline metrics for a binary classifier.

    The same report is wanted for every data split, so it lives in one
    function instead of being copy-pasted once per split.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels of the split being evaluated.
    y_pred : sequence
        Hard class predictions for the same rows.
    y_score : sequence
        Positive-class score for the same rows (used by the two AUC metrics).

    Returns
    -------
    The matrix produced by ``metrics.confusion_matrix`` for this split.
    """
    confusion = metrics.confusion_matrix(y_true=y_true, y_pred=y_pred)
    print('Confusion matrix: \n',confusion)

    # Flattening the 2x2 matrix yields the counts in the order TN, FP, FN, TP.
    tn, fp, fn, tp = confusion.ravel()
    print('\nTP:',tp)
    print('FP:',fp)
    print('TN:',tn)
    print('FN:',fn)

    ## Performance measure
    print('\nAccuracy: '+ str(metrics.accuracy_score(y_true=y_true, y_pred=y_pred)))
    print('Precision: '+ str(metrics.precision_score(y_true=y_true, y_pred=y_pred)))
    print('Recall: '+ str(metrics.recall_score(y_true=y_true, y_pred=y_pred)))
    print('F-measure: '+ str(metrics.f1_score(y_true=y_true, y_pred=y_pred)))
    print('Area Under the Curve: '+ str(metrics.roc_auc_score(y_true=y_true, y_score=y_score)))
    print('Precision-Recall AUC: '+ str(metrics.average_precision_score(y_true=y_true, y_score=y_score)))
    print('Matthew Correlation Coefficient: '+ str(metrics.matthews_corrcoef(y_true=y_true, y_pred=y_pred)))
    print('\n\n')
    return confusion


# Training-split report. Column 1 of predict_proba is passed as the score.
# The notebook-level names confusion / tn / fp / fn / tp are still bound, as
# they were before the report was moved into a function.
confusion = print_binary_report(y_train, output, output_prob[:,1])
tn, fp, fn, tp = confusion.ravel()

### Testing set

In [None]:
# Per-class probabilities and hard class predictions for the held-out rows.
output_prob = model.predict_proba(X_test)
output = model.predict(X_test)

In [None]:
# Confusion matrix of the held-out labels against the hard predictions.
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=output)
print('Confusion matrix: \n',confusion)

# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP).
(tn, fp), (fn, tp) = confusion
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
# Threshold-based metrics use the hard predictions; the two AUC-style metrics
# use column 1 of predict_proba as the score.
print('\nAccuracy: ', metrics.accuracy_score(y_true=y_test, y_pred=output), sep='')
print('Precision: ', metrics.precision_score(y_true=y_test, y_pred=output), sep='')
print('Recall: ', metrics.recall_score(y_true=y_test, y_pred=output), sep='')
print('F-measure: ', metrics.f1_score(y_true=y_test, y_pred=output), sep='')
print('Area Under the Curve: ', metrics.roc_auc_score(y_true=y_test, y_score=output_prob[:,1]), sep='')
print('Precision-Recall AUC: ', metrics.average_precision_score(y_true=y_test, y_score=output_prob[:,1]), sep='')
print('Matthew Correlation Coefficient: ', metrics.matthews_corrcoef(y_true=y_test, y_pred=output), sep='')
print('\n\n')