In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, accuracy_score,precision_score, recall_score, hamming_loss
from sklearn.externals import joblib
import re
import pickle

In [53]:
# Read the comment dataset (CSV in the current working directory) into a DataFrame.
toxic_comments = pd.read_csv(filepath_or_buffer="toxic_comments_cleaned_df.csv")

In [54]:
# Columns that together form the multi-label target; column order here fixes
# the column order of `y`.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y = toxic_comments[label_columns]

In [55]:
# Preview the first five rows of the label frame.
y.head(5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [56]:
# Convert the label frame to a plain NumPy array for scikit-learn.
# np.asarray is used instead of `y.values` so the cell is idempotent:
# re-running it when `y` is already an ndarray is a no-op rather than an
# AttributeError.
y = np.asarray(y)

In [57]:
# Display the label matrix (one row per comment, one column per label).
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [58]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
xtrain, xval, ytrain, yval = train_test_split(
    toxic_comments['clean_comment_text'],
    y,
    test_size=0.2,
    random_state=10,
)

In [59]:
# Word-level TF-IDF: drop terms that occur in more than 80% of documents and
# cap the vocabulary at 10,000 features.
tfidf_vectorizer_toxic = TfidfVectorizer(analyzer = 'word', max_df=0.8, max_features=10000)

# Round-trip the vectorizer through pickle. Context managers guarantee both
# file handles are closed; bare open() calls would leak them.
# NOTE(review): the vectorizer has not been fitted at this point, so the file
# only captures its configuration. It has to be dumped again after fitting
# before another process can call transform() on the loaded object.
with open("tfidf_vectorizer_toxic.pkl", "wb") as pkl_out:
    pickle.dump(tfidf_vectorizer_toxic, pkl_out)
with open("tfidf_vectorizer_toxic.pkl", "rb") as pkl_in:
    tfidf_vectorizer_toxic = pickle.load(pkl_in)

In [60]:
# create TF-IDF features
# The vocabulary and IDF weights are learned on the training split only and
# then applied unchanged to the validation split. astype('U') coerces every
# entry to a unicode string before vectorizing.
xtrain_tfidf = tfidf_vectorizer_toxic.fit_transform(xtrain.values.astype('U'))
xval_tfidf = tfidf_vectorizer_toxic.transform(xval.values.astype('U'))

# Persist the *fitted* vectorizer. The pickle written when the vectorizer was
# constructed is unfitted and cannot transform new text once loaded.
with open("tfidf_vectorizer_toxic.pkl", "wb") as pkl_out:
    pickle.dump(tfidf_vectorizer_toxic, pkl_out)

In [61]:
# `time` is not used in this cell; the import is kept so the module stays in
# scope for the timing code in later cells.
import time

# Coarse grid search: four values of C crossed with five LogisticRegression
# solvers, all with an L2 penalty. One one-vs-rest model per (C, solver) pair
# is fitted on the training TF-IDF matrix and evaluated on the validation split.
score = 0                  # best validation score seen so far
best_param = [0,0]         # [C, solver] that produced `score`
# solver name -> list of (C, val_score, accuracy, precision, recall, f1, hamming loss)
scores = {'newton-cg':[],'lbfgs':[],'liblinear':[],'saga':[],'sag':[]}
for c in [0.01, 1, 10, 100]:
    for s in ('newton-cg', 'lbfgs', 'liblinear','saga','sag'):
        print('Doing ',c, s)
        lr = LogisticRegression(C=c, solver = s,penalty = 'l2')
        clf = OneVsRestClassifier(lr)
        clf.fit(xtrain_tfidf, ytrain)
        y_pred = clf.predict(xval_tfidf)
        test_accuracy = accuracy_score(yval, y_pred)
        test_precision = precision_score(yval, y_pred, average = "micro")
        test_recall = recall_score(yval, y_pred, average = "micro")
        test_f1_score = f1_score(yval, y_pred, average = "micro")
        hamming_score = hamming_loss(yval, y_pred)
        # clf.score(X, y) is accuracy_score(y, clf.predict(X)), i.e. exactly
        # test_accuracy; reuse it instead of predicting the validation set twice.
        val_score = test_accuracy
        # Strict '>' keeps the earliest combination when several tie.
        if val_score > score:
            score = val_score
            best_param[0] = c
            best_param[1] = s
        print('curr_score ',val_score)
        scores[s].append((c,val_score, test_accuracy, test_precision, test_recall, test_f1_score, hamming_score))
        print('best_params ',best_param)
        print('')
                                
            

Doing  0.01 newton-cg
curr_score  0.9016449945166849
best_params  [0.01, 'newton-cg']

Doing  0.01 lbfgs
curr_score  0.9016449945166849
best_params  [0.01, 'newton-cg']

Doing  0.01 liblinear
curr_score  0.9017076609744634
best_params  [0.01, 'liblinear']

Doing  0.01 saga
curr_score  0.9016449945166849
best_params  [0.01, 'liblinear']

Doing  0.01 sag
curr_score  0.9016449945166849
best_params  [0.01, 'liblinear']

Doing  1 newton-cg
curr_score  0.9216669277769074
best_params  [1, 'newton-cg']

Doing  1 lbfgs
curr_score  0.9216669277769074
best_params  [1, 'newton-cg']

Doing  1 liblinear
curr_score  0.9216669277769074
best_params  [1, 'newton-cg']

Doing  1 saga
curr_score  0.9216982610057967
best_params  [1, 'saga']

Doing  1 sag
curr_score  0.9216669277769074
best_params  [1, 'saga']

Doing  10 newton-cg
curr_score  0.920162932790224
best_params  [1, 'saga']

Doing  10 lbfgs
curr_score  0.9201002663324456
best_params  [1, 'saga']

Doing  10 liblinear
curr_score  0.920162932790224
b



curr_score  0.9202255992480025
best_params  [1, 'saga']

Doing  10 sag
curr_score  0.920162932790224
best_params  [1, 'saga']

Doing  100 newton-cg
curr_score  0.9110449631834561
best_params  [1, 'saga']

Doing  100 lbfgs
curr_score  0.9122356258812471
best_params  [1, 'saga']

Doing  100 liblinear
curr_score  0.9111076296412345
best_params  [1, 'saga']

Doing  100 saga




curr_score  0.911828293905687
best_params  [1, 'saga']

Doing  100 sag




curr_score  0.9110136299545668
best_params  [1, 'saga']





In [62]:
# Show the collected results: solver name -> list of
# (C, val_score, accuracy, precision, recall, f1, hamming loss) tuples.
scores

{'newton-cg': [(0.01,
   0.9016449945166849,
   0.9016449945166849,
   0.9740518962075848,
   0.07035755478662054,
   0.13123571332526557,
   0.033740665308893414),
  (1,
   0.9216669277769074,
   0.9216669277769074,
   0.8754725372470535,
   0.5676182237600923,
   0.6887081256013294,
   0.018585826936132435),
  (10,
   0.920162932790224,
   0.920162932790224,
   0.8225594903503841,
   0.6329296424452133,
   0.7153915098183004,
   0.01824116141835083),
  (100,
   0.9110449631834561,
   0.9110449631834561,
   0.7623302973019419,
   0.6395617070357554,
   0.6955703645629164,
   0.020277821296151233)],
 'lbfgs': [(0.01,
   0.9016449945166849,
   0.9016449945166849,
   0.9740518962075848,
   0.07035755478662054,
   0.13123571332526557,
   0.033740665308893414),
  (1,
   0.9216669277769074,
   0.9216669277769074,
   0.8754725372470535,
   0.5676182237600923,
   0.6887081256013294,
   0.018585826936132435),
  (10,
   0.9201002663324456,
   0.9201002663324456,
   0.8221432579016271,
   0.6337

In [63]:
# Placeholder frame; nothing in this cell reads it.
df = pd.DataFrame([0] * 6)
# Print each solver name followed by one result tuple per C value tried:
# (C, val_score, accuracy, precision, recall, f1, hamming loss).
for solver_name, solver_results in scores.items():
    print(solver_name)
    for result_row in solver_results:
        print(result_row)


newton-cg
(0.01, 0.9016449945166849, 0.9016449945166849, 0.9740518962075848, 0.07035755478662054, 0.13123571332526557, 0.033740665308893414)
(1, 0.9216669277769074, 0.9216669277769074, 0.8754725372470535, 0.5676182237600923, 0.6887081256013294, 0.018585826936132435)
(10, 0.920162932790224, 0.920162932790224, 0.8225594903503841, 0.6329296424452133, 0.7153915098183004, 0.01824116141835083)
(100, 0.9110449631834561, 0.9110449631834561, 0.7623302973019419, 0.6395617070357554, 0.6955703645629164, 0.020277821296151233)
lbfgs
(0.01, 0.9016449945166849, 0.9016449945166849, 0.9740518962075848, 0.07035755478662054, 0.13123571332526557, 0.033740665308893414)
(1, 0.9216669277769074, 0.9216669277769074, 0.8754725372470535, 0.5676182237600923, 0.6887081256013294, 0.018585826936132435)
(10, 0.9201002663324456, 0.9201002663324456, 0.8221432579016271, 0.6337946943483276, 0.7157860457542945, 0.018230717008721083)
(100, 0.9122356258812471, 0.9122356258812471, 0.7662828664719024, 0.6428777393310265, 0.699

In [64]:
# Narrow the search around C = 1: five evenly spaced values from 0.1 to 2.0
# (0.1, 0.575, 1.05, 1.525, 2.0) crossed with the same five solvers, L2 penalty,
# and max_iter raised to 1000.
import time
score = 0                  # best validation score seen in this search
best_param_2 = [0,0]       # [C, solver] that produced `score`
# solver name -> list of (C, val_score, accuracy, precision, recall, f1, hamming loss)
scores_2 = {'newton-cg':[],'lbfgs':[],'liblinear':[],'saga':[],'sag':[]}
for c in np.linspace(0.1,2,5):
    for s in ('newton-cg', 'lbfgs', 'liblinear','saga','sag'):
        print('Doing ',c, s)
        time_start=time.time()
        lr = LogisticRegression(C=c, solver = s,penalty = 'l2', max_iter = 1000)
        clf = OneVsRestClassifier(lr)
        clf.fit(xtrain_tfidf, ytrain)
        y_pred = clf.predict(xval_tfidf)
        test_accuracy = accuracy_score(yval, y_pred)
        test_precision = precision_score(yval, y_pred, average = "micro")
        test_recall = recall_score(yval, y_pred, average = "micro")
        test_f1_score = f1_score(yval, y_pred, average = "micro")
        hamming_score = hamming_loss(yval, y_pred)
        # clf.score(X, y) is accuracy_score(y, clf.predict(X)), i.e. exactly
        # test_accuracy; reuse it instead of predicting the validation set twice.
        val_score = test_accuracy
        if val_score > score:
            score = val_score
            # Record the winner in this search's own list. Writing to
            # `best_param` would overwrite the coarse search's result and
            # leave best_param_2 at [0, 0].
            best_param_2[0] = c
            best_param_2[1] = s
        print('curr_score ',val_score)
        scores_2[s].append((c,val_score, test_accuracy, test_precision, test_recall, test_f1_score, hamming_score))
        print('best_params ',best_param_2)
        # Report the wall-clock time per combination, as the L1 search cell does.
        timetaken = time.time()-time_start
        print('timetaken ',timetaken)
        print('')

Doing  0.1 newton-cg
curr_score  0.9128622904590318
best_params  [0.1, 'newton-cg']

Doing  0.1 lbfgs
curr_score  0.9128622904590318
best_params  [0.1, 'newton-cg']

Doing  0.1 liblinear
curr_score  0.9128622904590318
best_params  [0.1, 'newton-cg']

Doing  0.1 saga
curr_score  0.9128622904590318
best_params  [0.1, 'newton-cg']

Doing  0.1 sag
curr_score  0.9128622904590318
best_params  [0.1, 'newton-cg']

Doing  0.575 newton-cg
curr_score  0.920037599874667
best_params  [0.575, 'newton-cg']

Doing  0.575 lbfgs
curr_score  0.920037599874667
best_params  [0.575, 'newton-cg']

Doing  0.575 liblinear
curr_score  0.920037599874667
best_params  [0.575, 'newton-cg']

Doing  0.575 saga
curr_score  0.9200689331035563
best_params  [0.575, 'saga']

Doing  0.575 sag
curr_score  0.920037599874667
best_params  [0.575, 'saga']

Doing  1.05 newton-cg
curr_score  0.9217922606924643
best_params  [1.05, 'newton-cg']

Doing  1.05 lbfgs
curr_score  0.9217922606924643
best_params  [1.05, 'newton-cg']

Doin

In [65]:
# Placeholder frame; nothing in this cell reads it.
df = pd.DataFrame([0] * 6)
# Print each solver name followed by its result tuples from scores_2:
# (C, val_score, accuracy, precision, recall, f1, hamming loss).
for solver_name, solver_results in scores_2.items():
    print(solver_name)
    for result_row in solver_results:
        print(result_row)

newton-cg
(0.1, 0.9128622904590318, 0.9128622904590318, 0.9325546345139413, 0.3568339100346021, 0.5161626694473409, 0.024231030341009974)
(0.575, 0.920037599874667, 0.920037599874667, 0.8896969696969697, 0.529123414071511, 0.6635928035439833, 0.019431824116141836)
(1.05, 0.9217922606924643, 0.9217922606924643, 0.875082909573292, 0.5706459054209919, 0.6908107164674056, 0.01850227165909447)
(1.525, 0.9220115932946891, 0.9220115932946891, 0.8654336734693877, 0.5869377162629758, 0.6994845360824741, 0.01826727244242519)
(2.0, 0.9219175936080213, 0.9219175936080213, 0.8573495968575563, 0.5978950403690888, 0.7044933322007984, 0.01816805055094261)
lbfgs
(0.1, 0.9128622904590318, 0.9128622904590318, 0.9325546345139413, 0.3568339100346021, 0.5161626694473409, 0.024231030341009974)
(0.575, 0.920037599874667, 0.920037599874667, 0.8896969696969697, 0.529123414071511, 0.6635928035439833, 0.019431824116141836)
(1.05, 0.9217922606924643, 0.9217922606924643, 0.875082909573292, 0.5706459054209919, 0.690

In [67]:
# L1-penalty search: five evenly spaced C values from 0.1 to 2.0
# (0.1, 0.575, 1.05, 1.525, 2.0) crossed with the 'liblinear' and 'saga'
# solvers, max_iter = 1000.
import time
score = 0                  # best validation score seen in this search
# [C, solver] that produced `score`. NOTE: this rebinds best_param_2, so any
# value an earlier search stored under that name is replaced.
best_param_2 = [0,0]
# solver name -> list of (C, val_score, accuracy, precision, recall, f1, hamming loss)
scores_3 = {'liblinear':[],'saga':[]}
for c in np.linspace(0.1,2,5):
    for s in ('liblinear','saga'):
        print('Doing ',c, s)
        time_start=time.time()
        lr = LogisticRegression(C=c, solver = s,penalty = 'l1', max_iter = 1000)
        clf = OneVsRestClassifier(lr)
        clf.fit(xtrain_tfidf, ytrain)
        y_pred = clf.predict(xval_tfidf)
        test_accuracy = accuracy_score(yval, y_pred)
        test_precision = precision_score(yval, y_pred, average = "micro")
        test_recall = recall_score(yval, y_pred, average = "micro")
        test_f1_score = f1_score(yval, y_pred, average = "micro")
        hamming_score = hamming_loss(yval, y_pred)
        # clf.score(X, y) is accuracy_score(y, clf.predict(X)), i.e. exactly
        # test_accuracy; reuse it instead of predicting the validation set twice.
        val_score = test_accuracy
        if val_score > score:
            score = val_score
            # Update the list initialised for this search; `best_param`
            # belongs to the first (coarse) search.
            best_param_2[0] = c
            best_param_2[1] = s
        print('curr_score ',val_score)
        scores_3[s].append((c,val_score, test_accuracy, test_precision, test_recall, test_f1_score, hamming_score))
        print('best_params ',best_param_2)
        timetaken = time.time()-time_start
        print('timetaken ',timetaken)
        print('')

Doing  0.1 liblinear
curr_score  0.9176876077079743
best_params  [0.1, 'liblinear']
timetaken  2.7865772247314453

Doing  0.1 saga
curr_score  0.9176876077079743
best_params  [0.1, 'liblinear']
timetaken  11.506447076797485

Doing  0.575 liblinear
curr_score  0.922262259125803
best_params  [0.575, 'liblinear']
timetaken  4.057555913925171

Doing  0.575 saga
curr_score  0.922262259125803
best_params  [0.575, 'liblinear']
timetaken  80.42394804954529

Doing  1.05 liblinear
curr_score  0.922418925270249
best_params  [1.05, 'liblinear']
timetaken  5.607601881027222

Doing  1.05 saga
curr_score  0.922418925270249
best_params  [1.05, 'liblinear']
timetaken  277.05101799964905

Doing  1.525 liblinear
curr_score  0.9214162619457935
best_params  [1.05, 'liblinear']
timetaken  5.464899778366089

Doing  1.525 saga
curr_score  0.9214162619457935
best_params  [1.05, 'liblinear']
timetaken  465.5364520549774

Doing  2.0 liblinear
curr_score  0.9212595958013473
best_params  [1.05, 'liblinear']
timeta

In [68]:
# Print each solver name followed by its result tuples from scores_3:
# (C, val_score, accuracy, precision, recall, f1, hamming loss).
for solver_name, solver_results in scores_3.items():
    print(solver_name)
    for result_row in solver_results:
        print(result_row)

liblinear
(0.1, 0.9176876077079743, 0.9176876077079743, 0.8812211390456645, 0.495242214532872, 0.6341148237031567, 0.020700819886155936)
(0.575, 0.922262259125803, 0.922262259125803, 0.8536585365853658, 0.6055363321799307, 0.708502024291498, 0.018047939840200533)
(1.05, 0.922418925270249, 0.922418925270249, 0.8453244461870222, 0.621683967704729, 0.7164575891002741, 0.017823385033161)
(1.525, 0.9214162619457935, 0.9214162619457935, 0.8355351330652881, 0.6291810841983853, 0.71782218932478, 0.017917384719828712)
(2.0, 0.9212595958013473, 0.9212595958013473, 0.8297149329809326, 0.6336505190311419, 0.7185481893239598, 0.017980051177607186)
saga
(0.1, 0.9176876077079743, 0.9176876077079743, 0.8812516029751218, 0.4953863898500577, 0.6342408860175357, 0.020695597681341063)
(0.575, 0.922262259125803, 0.922262259125803, 0.8536585365853658, 0.6055363321799307, 0.708502024291498, 0.018047939840200533)
(1.05, 0.922418925270249, 0.922418925270249, 0.8453244461870222, 0.621683967704729, 0.71645758910

In [51]:
# Inspect scores_2.
# NOTE(review): this cell's execution count (51) is lower than the cells
# around it and its saved output shows only empty 'liblinear'/'saga' lists --
# confirm it still reflects the state you intend to show.
scores_2

{'liblinear': [], 'saga': []}

In [None]:
# ngram level tf-idf
# Word uni-, bi- and tri-grams; drop terms that occur in more than 75% of
# documents and cap the vocabulary at 10,000 features.
tfidf_vect_ngram = TfidfVectorizer(analyzer = 'word', max_df=0.75, max_features=10000, ngram_range=(1,3))

# Dump the vectorizer that was just built. Dumping `tfidf_vect_ngram_toxic`
# here would raise NameError because that name is not defined until the load.
# Context managers close both file handles.
with open("tfidf_vect_ngram_toxic.pkl", "wb") as pkl_out:
    pickle.dump(tfidf_vect_ngram, pkl_out)

# Load the saved vectorizer and bind both names to the same object, so that
# fitting `tfidf_vect_ngram` later also leaves `tfidf_vect_ngram_toxic` fitted.
with open("tfidf_vect_ngram_toxic.pkl", "rb") as pkl_in:
    tfidf_vect_ngram_toxic = pickle.load(pkl_in)
tfidf_vect_ngram = tfidf_vect_ngram_toxic


In [None]:
# create TF-IDF features
# The n-gram vocabulary and IDF weights are learned on the training split only
# and then applied unchanged to the validation split. astype('U') coerces
# every entry to a unicode string before vectorizing.
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(xtrain.values.astype('U'))
xval_tfidf_ngram = tfidf_vect_ngram.transform(xval.values.astype('U'))

# Persist the *fitted* n-gram vectorizer so the pickle on disk can transform
# new text once loaded.
with open("tfidf_vect_ngram_toxic.pkl", "wb") as pkl_out:
    pickle.dump(tfidf_vect_ngram, pkl_out)

In [None]:
# L1-penalty search on the n-gram TF-IDF features: C in 1, 2, 3, 4
# (range(1,5) excludes 5) crossed with the 'liblinear' and 'saga' solvers,
# max_iter = 1000.
import time
score = 0                  # best validation score seen in this search
# [C, solver] that produced `score`. NOTE: this rebinds best_param_2 and
# scores_2, replacing anything an earlier search stored under those names.
best_param_2 = [0,0]
# solver name -> list of (C, val_score, accuracy, precision, recall, f1, hamming loss)
scores_2 = {'liblinear':[],'saga':[]}
for c in range(1,5):
    for s in ('liblinear','saga'):
        print('Doing ',c, s)
        time_start=time.time()
        lr = LogisticRegression(C=c, solver = s,penalty = 'l1', max_iter = 1000)
        clf = OneVsRestClassifier(lr)
        clf.fit(xtrain_tfidf_ngram, ytrain)
        y_pred = clf.predict(xval_tfidf_ngram)
        test_accuracy = accuracy_score(yval, y_pred)
        test_precision = precision_score(yval, y_pred, average = "micro")
        test_recall = recall_score(yval, y_pred, average = "micro")
        test_f1_score = f1_score(yval, y_pred, average = "micro")
        hamming_score = hamming_loss(yval, y_pred)
        # The model was trained on n-gram features, so it must be scored on the
        # n-gram validation matrix (not the unigram `xval_tfidf`, whose columns
        # mean something else). clf.score(X, y) is accuracy_score(y,
        # clf.predict(X)), which on xval_tfidf_ngram is exactly test_accuracy.
        val_score = test_accuracy
        if val_score > score:
            score = val_score
            # Update the list initialised for this search; `best_param`
            # belongs to the first (coarse) search.
            best_param_2[0] = c
            best_param_2[1] = s
        print('curr_score ',val_score)
        # Store in this search's own dict; appending to `scores` would mix
        # these rows into the first search's results and leave scores_2 empty.
        scores_2[s].append((c,val_score, test_accuracy, test_precision, test_recall, test_f1_score, hamming_score))
        print('best_params ',best_param_2)
        # Report the wall-clock time per combination, as the unigram L1 search does.
        timetaken = time.time()-time_start
        print('timetaken ',timetaken)
        print('')

In [None]:
# #reducing hyperparam space to 1 to 3
# score = 0
# best_param_3 = [0,0]
# scores_3 = {'newton-cg':[],'lbfgs':[],'liblinear':[],'saga':[],'sag':[]}
# for c in np.linspace(1.1,3,20):
#     for s in ('newton-cg', 'lbfgs', 'liblinear','saga','sag'):
#         print('Doing ',c, s)
#         time_start=time.time()
#         lr = LogisticRegression(C=c, solver = s,penalty = 'l2', max_iter = 1000)
#         clf = OneVsRestClassifier(lr)
#         clf.fit(xtrain_tfidf, ytrain)
#         y_pred = clf.predict(xval_tfidf)
#         test_accuracy = accuracy_score(yval, y_pred)
#         test_precision = precision_score(yval, y_pred, average = "micro")
#         test_recall = recall_score(yval, y_pred, average = "micro")
#         test_f1_score = f1_score(yval, y_pred, average = "micro")
#         hamming_score = hamming_loss(yval, y_pred)
#         val_score = clf.score(xval_tfidf,yval)
#         if val_score > score:
#             score = val_score
#             best_param[0] = c
#             best_param[1] = s
#         print('curr_score ',val_score)
#         scores[s].append((c,val_score, test_accuracy, test_precision, test_recall, test_f1_score, hamming_score))
#         print('best_params ',best_param)
#         print('')       
                                