# RUSBOOST Classifier With TFIDF

**ROC-AUC:** 0.94934
**F1-score:** 0.62395

In [1]:
import re
import string

import pandas as pd
import numpy as np 
from imblearn.ensemble import RUSBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
printable = set(string.printable)

def clean_text(x):
    
    # remove newline characters
    x = re.sub('\\n',' ',x)
    # remove return characters
    x = re.sub('\\r',' ',x)
    x = x.strip()
    # remove any text starting with User... 
    x = re.sub("\[\[User.*", ' ', x)
    # remove IP addresses or user IDs
    x = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ' ', x)
    # remove URLs
    x = re.sub("(http://.*?\s)|(http://.*)", ' ', x)
    # remove non_printable characters eg unicode
    x = "".join(list(filter(lambda c: c in printable, x)))
    return x

In [4]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['none'] = 1-df[label_cols].max(axis=1)

In [5]:
x = df['comment_text'].apply(clean_text)
y = df.iloc[:, 2:8] 

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=13)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(TfidfVectorizer(
                                    stop_words='english',
                                    strip_accents='unicode',
                                    token_pattern=r'\w{1,}', #accept tokens that have 1 or more characters
                                    analyzer='word',
                                    ngram_range=(1, 1),
                                    min_df=5),
                      OneVsRestClassifier(RUSBoostClassifier()))
param_grid = {'tfidfvectorizer__max_features': [10000, 30000],
              'onevsrestclassifier__estimator__algorithm': ['SAMME', 'SAMME.R'],
              'onevsrestclassifier__estimator__sampling_strategy': ['majority', 'not minority', 'not majority'],
              'onevsrestclassifier__estimator__n_estimators': [10, 50, 100, 250],
              'onevsrestclassifier__estimator__learning_rate': [0.25, 0.5, 0.75, 1]
             } 
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc', verbose=10, n_jobs=-1)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed: 4

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidfvectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=5,
                                                        ngram_range=(1, 1),
                                               

In [13]:
grid.best_params_

{'onevsrestclassifier__estimator__algorithm': 'SAMME.R',
 'onevsrestclassifier__estimator__learning_rate': 0.5,
 'onevsrestclassifier__estimator__n_estimators': 250,
 'onevsrestclassifier__estimator__sampling_strategy': 'not majority',
 'tfidfvectorizer__max_features': 30000}

In [14]:
grid.score(X_test, y_test)

0.9493412966535814

In [15]:
from sklearn.metrics import f1_score, recall_score 

y_pred = grid.predict(X_test)

In [16]:
f1_score(y_test, y_pred, average = 'micro')

0.6239501745777106