# RUSBOOST Classifier With BERT

**ROC-AUC:** 0.95087
**F1-score:** 0.47602

In [10]:
import pandas as pd
import numpy as np 
from imblearn.ensemble import RUSBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [11]:
df = pd.read_csv('data/train.csv')

In [12]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['none'] = 1-df[label_cols].max(axis=1)

In [13]:
x = np.loadtxt('data/toxic_bert_matrix.out', delimiter=',')
y = df.iloc[:, 2:8] 

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13)

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(OneVsRestClassifier(RUSBoostClassifier()))
param_grid = {
              'onevsrestclassifier__estimator__algorithm': ['SAMME', 'SAMME.R'],
              'onevsrestclassifier__estimator__sampling_strategy': ['majority', 'not minority', 'not majority'],
              'onevsrestclassifier__estimator__n_estimators': [10, 50, 100, 250],
              'onevsrestclassifier__estimator__learning_rate': [0.25, 0.5, 0.75, 1]
             } 
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=10, n_jobs=-2)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-2)]: Done  39 tasks      | elapsed: 39.6min
[Parallel(n_jobs=-2)]: Done  50 tasks      | elapsed: 82.1min
[Parallel(n_jobs=-2)]: Done  63 tasks      | elapsed: 158.5min
[Parallel(n_jobs=-2)]: Done  76 tasks      | elapsed: 181.3min
[Parallel(n_jobs=-2)]: Done  91 tasks      | elapsed: 274.4min
[Parallel(n_jobs=-2)]: Done 106 tasks      | elapsed: 400.2min
[Parallel(n_jobs=-2)]: Done 123 tasks      | elapsed: 421.5min
[Parallel(n_jobs=-2)]: Done 140 tasks      | elapsed: 544.5min
[Parallel(n_jobs=-2)]: Done 159 tasks      | elapsed: 654.5min
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed: 768.2min
[Parallel(n_jobs=-2)]: Done 199 tasks      | el

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('onevsrestclassifier',
                                        OneVsRestClassifier(estimator=RUSBoostClassifier(algorithm='SAMME.R',
                                                                                         base_estimator=None,
                                                                                         learning_rate=1.0,
                                                                                         n_estimators=50,
                                                                                         random_state=None,
                                                                                         replacement=False,
                                                                                         sampling_strategy='auto'),
                                                            n_jobs=None))],
              

In [16]:
grid.best_params_

{'onevsrestclassifier__estimator__algorithm': 'SAMME.R',
 'onevsrestclassifier__estimator__learning_rate': 0.25,
 'onevsrestclassifier__estimator__n_estimators': 250,
 'onevsrestclassifier__estimator__sampling_strategy': 'majority'}

In [17]:
grid.score(X_test, y_test)

0.9508725975441016

In [18]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [23]:
f1_score(y_test, y_pred, average = 'weighted')

0.47602045198053483