# Decision Tree Classifier With small BERT

**ROC-AUC:** 0.79749
**F1-score:** 0.31962

In [3]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

In [4]:
x = np.loadtxt('data/toxic_bert_matrix_small.out', delimiter=',')

In [9]:
df = pd.read_csv('data/toxic-train-clean-small.csv')

In [10]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['none'] = 1-df[label_cols].max(axis=1)

In [11]:
y = df.iloc[:, 2:8] 

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(OneVsRestClassifier(DecisionTreeClassifier(class_weight='balanced')))
param_grid = {
              'onevsrestclassifier__estimator__criterion': ['gini', 'entropy'],
              'onevsrestclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'],
              'onevsrestclassifier__estimator__max_depth': [None, 10, 50, 100, 250],
             } 
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=10, n_jobs=-1)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 110 out of 120 | elapsed: 27.5min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 33.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('onevsrestclassifier',
                                        OneVsRestClassifier(estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                             class_weight='balanced',
                                                                                             criterion='gini',
                                                                                             max_depth=None,
                                                                                             max_features=None,
                                                                                             max_leaf_nodes=None,
                                                                                             min_impurity_decrease=0.0,
                                                                

In [14]:
grid.best_params_

{'onevsrestclassifier__estimator__criterion': 'entropy',
 'onevsrestclassifier__estimator__max_depth': 10,
 'onevsrestclassifier__estimator__max_features': None}

In [15]:
grid.score(X_test, y_test)

0.7974883190231447

In [16]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [17]:
f1_score(y_test, y_pred, average = 'micro')

0.31962080060893333