# Balanced Random Forest Classifier With ELMO

**ROC-AUC:** 0.83428
**F1 Score:** 0.174638

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn import metrics
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
x = np.loadtxt('data/toxic_elmo_matrix.out', delimiter=',')
df = pd.read_csv('data/toxic-train-clean.csv')
y = df.iloc[:, 2:8]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2)

In [4]:
y_train.shape, X_train.shape, y_test.shape, X_test.shape

((127656, 6), (127656, 128), (31915, 6), (31915, 128))

In [6]:
# with gridsearch 
pipe = make_pipeline(OneVsRestClassifier(BalancedRandomForestClassifier()))

param_grid = {
              'onevsrestclassifier__estimator__class_weight' : ['balanced_subsample',
                                                                'balanced'],
              'onevsrestclassifier__estimator__max_depth': [25, 50, 75],
              'onevsrestclassifier__estimator__max_leaf_nodes' : [5, 10, 15],
              'onevsrestclassifier__estimator__criterion' : ['entropy', 'gini'],
              'onevsrestclassifier__estimator__sampling_strategy' : ['not majority',
                                                                    'all']

              }

grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)
BalancedRandomForestClassifier.get_params(grid).keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__onevsrestclassifier', 'estimator__onevsrestclassifier__estimator__bootstrap', 'estimator__onevsrestclassifier__estimator__ccp_alpha', 'estimator__onevsrestclassifier__estimator__class_weight', 'estimator__onevsrestclassifier__estimator__criterion', 'estimator__onevsrestclassifier__estimator__max_depth', 'estimator__onevsrestclassifier__estimator__max_features', 'estimator__onevsrestclassifier__estimator__max_leaf_nodes', 'estimator__onevsrestclassifier__estimator__max_samples', 'estimator__onevsrestclassifier__estimator__min_impurity_decrease', 'estimator__onevsrestclassifier__estimator__min_samples_leaf', 'estimator__onevsrestclassifier__estimator__min_samples_split', 'estimator__onevsrestclassifier__estimator__min_weight_fraction_leaf', 'estimator__onevsrestclassifier__estimator__n_estimators', 'estimator__onevsrestclassifier__estimator__n_jobs', 'estimator__onevsrestclassifier_

In [7]:
grid2 = grid.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 37.8min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 77.0min finished


In [8]:
grid2.best_params_

{'onevsrestclassifier__estimator__class_weight': 'balanced_subsample',
 'onevsrestclassifier__estimator__criterion': 'entropy',
 'onevsrestclassifier__estimator__max_depth': 50,
 'onevsrestclassifier__estimator__max_leaf_nodes': 15,
 'onevsrestclassifier__estimator__sampling_strategy': 'all'}

In [9]:
grid.score(X_test, y_test)

0.8342757395901627

In [10]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [11]:
f1_score(y_test, y_pred, average = 'micro')

0.17463822270534682