## Logistic Regression with TFIDF

This model was used as the initial model in our API. We initially reported recall on the positive class because, for this problem, identifying as many instances of toxicity as possible was important.

In [35]:
import pandas as pd
import numpy as np
import time
import re
# for model:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# for scoring:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
# for pickling:
from sklearn.externals import joblib 
import pickle 

In [2]:
# Load train.csv from the current directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./train.csv')

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Show how imbalanced the data is: add a 'none' indicator that is 1 when a
# comment carries none of the six labels, then summarise the numeric columns.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Row-wise max over the 0/1 label columns is 1 if any label is set.
df['none'] = 1 - df.loc[:, label_cols].max(axis='columns')
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Remove newlines, text starting with '[[User', IP addresses and http(s) links
# (this was before we implemented standardized pre-processing & created a
# cleaned dataset).

# The patterns are raw strings so that \[, \d, \. and \s reach the regex engine
# unchanged instead of being treated by Python as invalid string escapes.
# Steps run in the listed order; each match is replaced by the paired string.
_CLEANING_STEPS = [
    # newline -> single space
    (re.compile(r'\n'), ' '),
    # drop everything from '[[User' to the end of the (now single-line) comment
    (re.compile(r"\[\[User.*"), ''),
    # remove IP addresses or user IDs written as four dot-separated numbers
    (re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"), ''),
    # remove links up to the next whitespace (or end of text); https:// is
    # covered as well as http://, which the earlier pattern missed
    (re.compile(r"(https?://.*?\s)|(https?://.*)"), ''),
]


def clean_comment(text):
    """Return `text` converted to str with every cleaning step applied in order."""
    cleaned = str(text)
    for pattern, replacement in _CLEANING_STEPS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned


# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
df['comment_text'] = df['comment_text'].map(clean_comment)
end_time = time.perf_counter()
print("total time", end_time - start_time)

total time 4.355303049087524


In [7]:
# Features are the comment text; targets are the six label columns.
# The targets are selected by name (label_cols) rather than by position, so the
# selection does not silently shift if columns in df are added or reordered.
x = df['comment_text']
y = df[label_cols]

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=13,
)

In [9]:
# Shapes of the four pieces produced by the split, in the order they were returned.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((127656,), (127656, 6), (31915,), (31915, 6))

In [10]:
# instantiate the vectorizer to test 1 class at a time:
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  # vectorize tokens of 2 or more word characters
    ngram_range=(1, 1),
    max_features=30000)

# Learn the vocabulary and IDF weights from the training comments and build the
# training matrix in the same pass; fit_transform gives the same result as
# fit followed by transform without tokenising X_train twice.
X_train_word_features = word_vectorizer.fit_transform(X_train)

# transform the test features to a sparse matrix using the fitted vectorizer
test_features = word_vectorizer.transform(X_test)

In [23]:
# Fit one binary logistic regression per label and print, for each label, the
# ROC AUC, the confusion matrix and the classification report (which contains
# the recall of the positive class).

class_names = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

auc = []

for class_name in class_names:
    # call the labels one column at a time so we can run the classifier on them
    train_target = y_train[class_name]
    test_target = y_test[class_name]
    # 'sag' shuffles the data while optimising, so random_state is pinned to
    # make the printed scores repeatable from run to run.
    classifier = LogisticRegression(solver='sag', C=10, random_state=13)

    classifier.fit(X_train_word_features, train_target)
    y_pred = classifier.predict(test_features)
    # column 1 of predict_proba is the probability of the positive class
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    print(class_name)
    print("ROC_AUC score {}\n".format(auc_score))

    print(confusion_matrix(test_target, y_pred))
    print(classification_report(test_target, y_pred))

print('Total average ROC_AUC score is {}'.format(np.mean(auc)))

toxic
ROC_AUC score 0.965497630813178

[[28568   359]
 [  969  2019]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28927
           1       0.85      0.68      0.75      2988

    accuracy                           0.96     31915
   macro avg       0.91      0.83      0.86     31915
weighted avg       0.96      0.96      0.96     31915

severe_toxic
ROC_AUC score 0.9820299994696421

[[31533    79]
 [  214    89]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     31612
           1       0.53      0.29      0.38       303

    accuracy                           0.99     31915
   macro avg       0.76      0.65      0.69     31915
weighted avg       0.99      0.99      0.99     31915

obscene
ROC_AUC score 0.9825040396332911

[[30132   159]
 [  472  1152]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     30291
           1    

In [24]:
# Grid-search a TF-IDF + one-vs-rest logistic regression pipeline over the
# vocabulary size and the solver, scored by ROC AUC with 3-fold cross-validation.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()

pipe = make_pipeline(TfidfVectorizer(
                                    stop_words='english',
                                    strip_accents='unicode',
                                    token_pattern=r'\w{1,}', #accept tokens that have 1 or more characters
                                    analyzer='word',
                                    ngram_range=(1, 1),
                                    min_df=5),
                     # Both solvers in the grid ('liblinear', 'sag') use
                     # random_state, so it is pinned to keep the search repeatable.
                     OneVsRestClassifier(LogisticRegression(random_state=13)))
param_grid = {'tfidfvectorizer__max_features': [10000, 30000],
              'onevsrestclassifier__estimator__solver': ['liblinear', 'sag'],
             }
grid = GridSearchCV(pipe, param_grid, cv=3, scoring='roc_auc')

# fit() returns the search object itself, so grid3 and grid name the same object.
grid3 = grid.fit(X_train, y_train)

end_time = time.perf_counter()
print("total time", end_time - start_time)

total time 155.0808470249176


In [25]:
# Parameter combination from param_grid that the search selected as best.
grid3.best_params_

{'onevsrestclassifier__estimator__solver': 'sag',
 'tfidfvectorizer__max_features': 30000}

In [26]:
# Cross-validated score of the best combination (ROC AUC, per the search's `scoring` argument).
grid3.best_score_

0.9778246754155863

In [27]:
# Hard 0/1 predictions for every label on the held-out comments,
# made with the fitted grid search; show the first row as a sanity check.
predicted_y_test = grid3.predict(X_test)
predicted_y_test[0:1]

array([[0, 0, 0, 0, 0, 0]])

In [28]:
# Per-label probabilities for the held-out comments (one column per label);
# show the first row as a sanity check.
y_pred_prob = grid3.predict_proba(X_test)
y_pred_prob[0:1]

array([[0.09263419, 0.00483361, 0.02309179, 0.00201297, 0.0287851 ,
        0.00462837]])

In [29]:
# ROC AUC on the held-out set, computed from the predicted probabilities
# for all label columns at once.
auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)
auc_score

0.979483629043321

In [32]:
# Micro-averaged F1 of the hard predictions on the held-out set.
f1_score(y_true=y_test, y_pred=predicted_y_test, average='micro')

0.6683720080394664

In [37]:
# pipeline with best parameters (there was an issue with pickled model from gridsearchCV with API implementation):
pipe = make_pipeline(TfidfVectorizer(
                                    stop_words='english',
                                    strip_accents='unicode',
                                    token_pattern=r'\w{1,}', #accept tokens that have 1 or more characters
                                    analyzer='word',
                                    ngram_range=(1, 1),
                                    min_df=5,
                                    max_features=30000),
                     # 'sag' shuffles the data while optimising; random_state is
                     # pinned so the saved model can be rebuilt identically.
                     OneVsRestClassifier(LogisticRegression(solver='sag', random_state=13)))

# Pipeline.fit returns the pipeline itself, so grid4 is the fitted `pipe`.
grid4 = pipe.fit(X_train, y_train)

In [31]:
# Save the fitted grid search (grid3) to "Tfidf_LogR_3.pickle" with pickle.
# The context manager closes the file even if pickle.dump raises.
with open("Tfidf_LogR_3.pickle", 'wb') as save_classifier:  # 'wb' = write in bytes
    pickle.dump(grid3, save_classifier)

In [38]:
# Compressed copy of the fitted grid search, in case the plain pickle is too large.
joblib.dump(value=grid3, filename='compressed_model.pkl', compress=3)

['compressed_model.pkl']

In [38]:
# Compressed copy of the plain fitted pipeline (grid4), i.e. without the GridSearchCV wrapper.
joblib.dump(value=grid4, filename='compressed_pipeline1.pkl', compress=3)

['compressed_pipeline1.pkl']

## Update: 
with cleaned dataset: