In [1]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.metrics import confusion_matrix
from bs4 import BeautifulSoup
import re
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer


In [2]:
data = pd.read_csv('../data/raw_data/kaggle/train.csv')
labels=(data["toxic"] | data["severe_toxic"] | data["obscene"] | data["threat"] | data["insult"] | data["identity_hate"])
# labels=data["Labels"]
train_data = data[0:100000]
test_data = data[100001:]
train_labels = labels[0:100000]
test_labels = labels[100001:]

corpus = data["comment_text"]

In [3]:
vectorizer = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}')
X = vectorizer.fit_transform(corpus)
train_feature_vectors_sparse = X[0:100000]
test_feature_vectors_sparse = X[100001:]

In [4]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False,use_idf=False)),
    ('clf', LogisticRegression(multi_class='ovr'))
])


In [5]:
from sklearn.metrics import make_scorer

from sklearn.model_selection import GridSearchCV
# tfidf_transformer = TfidfTransformer()
parameters = {'clf__penalty':('l1','l2', None), 'clf__class_weight':(None, 'balanced'), 'clf__solver':('newton-cg', 'lbfgs', 'liblinear')}
grid = GridSearchCV(pipeline, parameters, scoring = make_scorer(f1_score), cv=5, error_score=0.0, verbose=10)


In [None]:
grid.fit(train_feature_vectors_sparse, train_labels)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.9s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.2s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.1s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.5781249999999999, total=  13.2s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.5511309334182861, total=  11.5s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.6067346308310164, total=  12.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.6205425175251448, total=  11.9s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.6558167570825798, total=  12.1s
[CV]

[CV]  clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear, score=0.6755297050270045, total=14.1min
[CV] clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear 
[CV]  clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear, score=0.6888001681025424, total=14.4min
[CV] clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear 
[CV]  clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear, score=0.6742738589211619, total=15.3min
[CV] clf__class_weight=balanced, clf__penalty=l2, clf__solver=newton-cg 
[CV]  clf__class_weight=balanced, clf__penalty=l2, clf__solver=newton-cg, score=0.7564979480164158, total=  24.5s
[CV] clf__class_weight=balanced, clf__penalty=l2, clf__solver=newton-cg 
[CV]  clf__class_weight=balanced, clf__penalty=l2, clf__solver=newton-cg, score=0.7659474671669793, total=  26.2s
[CV] clf__class_weight=balanced, clf__penalty=l2, clf__solver=newton-cg 
[CV]  clf__class_weight=balanced, clf__penalty=l2, clf__solver=ne

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 75.0min finished


In [115]:
grid.best_score_

0.684472642351882

In [107]:
a = pd.DataFrame.from_dict(grid.cv_results_)
a

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfidf__norm,param_tfidf__sublinear_tf,param_tfidf__use_idf,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.74002,0.039021,0.147605,0.023373,l1,True,True,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': T...",0.01286,0.009202,...,0.014313,0.003363,10,0.015897,0.016655,0.014835,0.013924,0.015139,0.01529,0.00093
1,0.511631,0.041203,0.098736,0.023694,l1,True,False,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': T...",0.006144,0.001847,...,0.006142,0.002359,12,0.00691,0.007827,0.006298,0.005838,0.006451,0.006665,0.000675
2,0.680579,0.049231,0.134839,0.015559,l1,False,True,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': F...",0.013468,0.012251,...,0.016496,0.003829,9,0.018623,0.019379,0.017261,0.016049,0.018018,0.017866,0.001145
3,0.494477,0.051836,0.084574,0.011483,l1,False,False,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': F...",0.006144,0.003077,...,0.007609,0.002807,11,0.008592,0.009355,0.00691,0.007675,0.008286,0.008163,0.000828
4,0.748194,0.055394,0.151993,0.012874,l2,True,True,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': T...",0.271907,0.277939,...,0.27321,0.003874,5,0.308632,0.305577,0.305979,0.305458,0.310759,0.307281,0.00209
5,0.472934,0.020001,0.078989,0.005691,l2,True,False,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': T...",0.163888,0.161765,...,0.166618,0.007578,7,0.178339,0.1777,0.177803,0.174744,0.177048,0.177127,0.00126
6,0.76395,0.070123,0.142618,0.023845,l2,False,True,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': F...",0.269824,0.27321,...,0.269705,0.002433,6,0.303034,0.302155,0.302025,0.301169,0.305597,0.302796,0.00152
7,0.511271,0.02568,0.091755,0.016143,l2,False,False,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': F...",0.15873,0.155549,...,0.16234,0.005896,8,0.175458,0.174854,0.17343,0.170704,0.170704,0.17303,0.00201
8,0.750191,0.034636,0.137233,0.00843,,True,True,"{'tfidf__norm': None, 'tfidf__sublinear_tf': T...",0.50199,0.50377,...,0.50259,0.001324,4,0.691282,0.693052,0.695465,0.692765,0.691343,0.692781,0.001523
9,0.501059,0.065832,0.09335,0.019438,,True,False,"{'tfidf__norm': None, 'tfidf__sublinear_tf': T...",0.684741,0.667721,...,0.675214,0.006701,2,0.743491,0.74907,0.746746,0.748397,0.74627,0.746795,0.001946


In [118]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False,
         use_idf=False)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [119]:
a["params"][9]

{'tfidf__norm': None, 'tfidf__sublinear_tf': True, 'tfidf__use_idf': False}

In [120]:
a["params"][10]

{'tfidf__norm': None, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}