## Logistic Regression with TFIDF

### Updated:
This model was run on the cleaned dataset, so the results are slightly different from the initial TF-IDF and logistic regression model. Also, since OneVsRest was implemented, the solver changed from sag to liblinear.

In [14]:
import pandas as pd
import numpy as np
import time
# for model:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# for scoring:
from sklearn.metrics import f1_score
from sklearn import metrics

In [3]:
# Load the cleaned training data from a CSV in the current working directory.
df = pd.read_csv(filepath_or_buffer='./toxic-train-clean.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,""" More I can't make any real suggestions on im...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# The target columns are picked by name instead of by position
# (previously df.iloc[:, 2:8]) so that an extra or missing leading column in
# the CSV -- for example a saved index column -- cannot silently shift the
# target matrix onto the wrong columns.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

x = df['comment_text']  # model input: the comment text
y = df[LABEL_COLUMNS]   # model targets: one column per label

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Time the whole grid search (wall-clock seconds).
start_time=time.time()

# Pipeline: TF-IDF features feeding one binary logistic regression per label
# (one-vs-rest), so each label column of y gets its own classifier.
pipe = make_pipeline(
    TfidfVectorizer(
        stop_words='english',
        strip_accents='unicode',
        token_pattern=r'\w{1,}',  # accept tokens that have 1 or more characters
        analyzer='word',
        ngram_range=(1, 1),       # unigrams only
        min_df=5,                 # drop terms seen in fewer than 5 documents
    ),
    # random_state pins the data shuffling performed by the 'sag' and
    # 'liblinear' solvers so that re-running the search gives the same result;
    # 42 matches the seed used for the train/test split.
    OneVsRestClassifier(LogisticRegression(random_state=42)),
)

# Keys follow make_pipeline's auto-generated step names (lower-cased class
# names); 'estimator' reaches the LogisticRegression wrapped by OneVsRest.
param_grid = {'tfidfvectorizer__max_features': [20000, 30000, 35000],
              'onevsrestclassifier__estimator__solver': ['liblinear', 'sag'],
             }
# 5-fold cross-validated search scored by ROC AUC, using all available cores.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# fit() returns the fitted search object, so grid3 refers to the same object as grid.
grid3 = grid.fit(X_train, y_train)

end_time=time.time()
print("total time",end_time-start_time)

total time 175.2308270931244


In [8]:
# The initial TF-IDF logistic regression model reported 'sag' as the best
# solver, but 'sag' was the only solver option in that earlier run.
grid3.best_params_

{'onevsrestclassifier__estimator__solver': 'liblinear',
 'tfidfvectorizer__max_features': 30000}

In [10]:
# Hard label predictions for the held-out comments from the fitted grid search.
predicted_y_test = grid3.predict(X_test)
# Display the first predicted row as a quick check.
predicted_y_test[:1]

array([[0, 0, 0, 0, 0, 0]])

In [11]:
# Predicted probabilities for the held-out comments; these feed the ROC AUC computation.
y_pred_prob = grid3.predict_proba(X_test)
# Display the first row of probabilities as a quick check.
y_pred_prob[:1]

array([[0.09761086, 0.0064352 , 0.02422327, 0.00283611, 0.03197065,
        0.00933325]])

We focused on ROC AUC and F1 scores because the classes are heavily imbalanced. The main difference in scores across all our different models is in the F1 score.

In [12]:
# ROC AUC on the held-out set, computed from predicted probabilities and
# macro-averaged over the label columns (the library default, spelled out here).
auc_score = metrics.roc_auc_score(
    y_true=y_test,
    y_score=y_pred_prob,
    average='macro',
)
auc_score

0.9788037596244529

In [13]:
# Micro-averaged F1 on the held-out set: pools the decisions of all labels
# before computing the score.
f1_score(
    y_true=y_test,
    y_pred=predicted_y_test,
    average='micro',
)

0.6711021623041232