# Decision Tree Classifier With BERT

**ROC-AUC:** 0.791093
**F1-score:** 0.273122

In [1]:
import re
import string

import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['none'] = 1-df[label_cols].max(axis=1)

In [4]:
x = np.loadtxt('data/toxic_bert_matrix.out', delimiter=',')
y = df.iloc[:, 2:8] 

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13)

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(OneVsRestClassifier(DecisionTreeClassifier(class_weight='balanced')))
param_grid = {
              'onevsrestclassifier__estimator__criterion': ['gini', 'entropy'],
              'onevsrestclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'],
              'onevsrestclassifier__estimator__max_depth': [None, 10, 50, 100, 250],
             } 
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=10, n_jobs=-1)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 24.9min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 110 out of 120 | elapsed: 40.8min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 49.5min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('onevsrestclassifier',
                                        OneVsRestClassifier(estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                             class_weight='balanced',
                                                                                             criterion='gini',
                                                                                             max_depth=None,
                                                                                             max_features=None,
                                                                                             max_leaf_nodes=None,
                                                                                             min_impurity_decrease=0.0,
                                                                

In [13]:
grid.score(X_test, y_test)

0.7910932057406367

In [14]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [17]:
f1_score(y_test, y_pred, average = 'micro')

0.2731222490416016

In [18]:
grid.best_params_

{'onevsrestclassifier__estimator__criterion': 'entropy',
 'onevsrestclassifier__estimator__max_depth': 10,
 'onevsrestclassifier__estimator__max_features': None}