# Balanced Random Forest Classifier With BERT

**ROC-AUC:** 0.93230
**F1 Score:** 0.38242

In [1]:
import pandas as pd
import numpy as np 
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Using TensorFlow backend.


In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['none'] = 1-df[label_cols].max(axis=1)

In [4]:
x = np.loadtxt('data/toxic_bert_matrix.out', delimiter=',')
y = df.iloc[:, 2:8] 

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13)

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(OneVsRestClassifier(BalancedRandomForestClassifier()))
param_grid = {
              'onevsrestclassifier__estimator__class_weight' : ['balanced_subsample',
                                                                'balanced'],
              'onevsrestclassifier__estimator__max_depth': [25, 50, 75],
              'onevsrestclassifier__estimator__max_leaf_nodes' : [5, 10, 15],
              'onevsrestclassifier__estimator__criterion' : ['entropy', 'gini'],
              'onevsrestclassifier__estimator__sampling_strategy' : ['not majority',
                                                                    'all']

              }
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=10, n_jobs=-2)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed: 24.5min
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed: 29.8min
[Parallel(n_jobs=-2)]: Done  39 tasks      | elapsed: 48.9min
[Parallel(n_jobs=-2)]: Done  50 tasks      | elapsed: 55.9min
[Parallel(n_jobs=-2)]: Done  63 tasks      | elapsed: 71.2min
[Parallel(n_jobs=-2)]: Done  76 tasks      | elapsed: 83.5min
[Parallel(n_jobs=-2)]: Done  91 tasks      | elapsed: 93.9min
[Parallel(n_jobs=-2)]: Done 106 tasks      | elapsed: 103.9min
[Parallel(n_jobs=-2)]: Done 123 tasks      | elapsed: 121.2min
[Parallel(n_jobs=-2)]: Done 140 tasks      | elapsed: 138.9min
[Parallel(n_jobs=-2)]: Done 159 tasks      | elapsed: 159.6min
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed: 177.3min
[Parallel(n_jobs=-2)]: Done 216 out of 216 | elaps

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('onevsrestclassifier',
                                        OneVsRestClassifier(estimator=BalancedRandomForestClassifier(bootstrap=True,
                                                                                                     ccp_alpha=0.0,
                                                                                                     class_weight=None,
                                                                                                     criterion='gini',
                                                                                                     max_depth=None,
                                                                                                     max_features='auto',
                                                                                                     max_leaf_nodes=None,
                       

In [7]:
grid.best_params_

{'onevsrestclassifier__estimator__class_weight': 'balanced_subsample',
 'onevsrestclassifier__estimator__criterion': 'entropy',
 'onevsrestclassifier__estimator__max_depth': 25,
 'onevsrestclassifier__estimator__max_leaf_nodes': 15,
 'onevsrestclassifier__estimator__sampling_strategy': 'all'}

In [8]:
grid.score(X_test, y_test)

0.9323000292148212

In [9]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [10]:
f1_score(y_test, y_pred, average = 'weighted')

0.3824175040658867