# Balanced Random Forest Classifier With TF-IDF

**ROC-AUC:** 0.94207
**F1-score:** 0.28251

In [6]:
import pandas as pd
import numpy as np


from sklearn.pipeline import make_pipeline
from sklearn import metrics
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv('data/toxic-train-clean.csv')
y = df.iloc[:, 2:8]
X = df['comment_text']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [7]:
# with gridsearch 
pipe = make_pipeline(
                    TfidfVectorizer(stop_words='english',
                                   strip_accents='unicode',
                                   token_pattern= r'\w{1,}',
                                   analyzer='word',
                                   ngram_range=(1,1),
                                   min_df=5),
                       OneVsRestClassifier(BalancedRandomForestClassifier()))

param_grid = {
              'onevsrestclassifier__estimator__class_weight' : ['balanced_subsample',
                                                                'balanced'],
              'onevsrestclassifier__estimator__max_depth': [25, 50, 75],
              'onevsrestclassifier__estimator__max_leaf_nodes' : [5, 10, 15],
              'onevsrestclassifier__estimator__criterion' : ['entropy', 'gini'],
              'onevsrestclassifier__estimator__sampling_strategy' : ['not majority',
                                                                    'all']

              }

grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=3)
BalancedRandomForestClassifier.get_params(grid).keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__tfidfvectorizer', 'estimator__onevsrestclassifier', 'estimator__tfidfvectorizer__analyzer', 'estimator__tfidfvectorizer__binary', 'estimator__tfidfvectorizer__decode_error', 'estimator__tfidfvectorizer__dtype', 'estimator__tfidfvectorizer__encoding', 'estimator__tfidfvectorizer__input', 'estimator__tfidfvectorizer__lowercase', 'estimator__tfidfvectorizer__max_df', 'estimator__tfidfvectorizer__max_features', 'estimator__tfidfvectorizer__min_df', 'estimator__tfidfvectorizer__ngram_range', 'estimator__tfidfvectorizer__norm', 'estimator__tfidfvectorizer__preprocessor', 'estimator__tfidfvectorizer__smooth_idf', 'estimator__tfidfvectorizer__stop_words', 'estimator__tfidfvectorizer__strip_accents', 'estimator__tfidfvectorizer__sublinear_tf', 'estimator__tfidfvectorizer__token_pattern', 'estimator__tfidfvectorizer__tokenizer', 'estimator__tfidfvectorizer__use_idf', 'estimator__tfidfvector

In [8]:
grid2 = grid.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 30.4min finished


In [9]:
grid2.best_params_

{'onevsrestclassifier__estimator__class_weight': 'balanced_subsample',
 'onevsrestclassifier__estimator__criterion': 'entropy',
 'onevsrestclassifier__estimator__max_depth': 25,
 'onevsrestclassifier__estimator__max_leaf_nodes': 15,
 'onevsrestclassifier__estimator__sampling_strategy': 'all'}

In [10]:
grid.score(X_test, y_test)

0.94207910210815

In [11]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [12]:
f1_score(y_test, y_pred, average = 'micro')

0.28250906354743827