# Decision Tree Classifier With TF-IDF

**ROC-AUC:** 0.79163
**F1-score:** 0.500716

In [1]:
import re
import string

import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
printable = set(string.printable)

def clean_text(x):
    
    # remove newline characters
    x = re.sub('\\n',' ',x)
    # remove return characters
    x = re.sub('\\r',' ',x)
    x = x.strip()
    # remove any text starting with User... 
    x = re.sub("\[\[User.*", ' ', x)
    # remove IP addresses or user IDs
    x = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ' ', x)
    # remove URLs
    x = re.sub("(http://.*?\s)|(http://.*)", ' ', x)
    # remove non_printable characters eg unicode
    x = "".join(list(filter(lambda c: c in printable, x)))
    return x

In [4]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['none'] = 1-df[label_cols].max(axis=1)
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
x = df['comment_text'].apply(clean_text)
y = df.iloc[:, 2:8] 

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13)

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(TfidfVectorizer(
                                    stop_words='english',
                                    strip_accents='unicode',
                                    token_pattern=r'\w{1,}', #accept tokens that have 1 or more characters
                                    analyzer='word',
                                    ngram_range=(1, 1),
                                    min_df=5),
                      OneVsRestClassifier(DecisionTreeClassifier(class_weight='balanced')))
param_grid = {'tfidfvectorizer__max_features': [10000, 30000],
              'onevsrestclassifier__estimator__criterion': ['gini', 'entropy'],
              'onevsrestclassifier__estimator__max_features': [None, 'auto', 'sqrt', 'log2'],
              'onevsrestclassifier__estimator__max_depth': [None, 10, 50, 100, 250],
             } 
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc', verbose=10, n_jobs=-1)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed: 1

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidfvectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=5,
                                                        ngram_range=(1, 1),
                                               

In [31]:
grid.best_score_

0.7945774844290078

In [32]:
grid.score(X_test, y_test)

0.7916340799172333

In [33]:
grid.best_params_

{'onevsrestclassifier__estimator__criterion': 'entropy',
 'onevsrestclassifier__estimator__max_depth': 10,
 'onevsrestclassifier__estimator__max_features': None,
 'tfidfvectorizer__max_features': 30000}

In [34]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [37]:
f1_score(y_test, y_pred, average = 'micro')

0.5007163323782235

In [38]:
recall_score(y_test, y_pred, average = 'micro')

0.5151068533529846