In [2]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.metrics import confusion_matrix
from bs4 import BeautifulSoup
import re
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer


In [3]:
train_data = pd.read_csv('../data/raw_data/train.csv')
test_data = pd.read_csv('../data/raw_data/test.csv')

train_labels=(train_data["Labels"])
test_labels = test_data["Labels"]

corpus_train = train_data["Comment"]
corpus_test = test_data["Comment"]

In [4]:
vectorizer = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}')
train_feature_vectors_sparse = vectorizer.fit_transform(corpus_train)
vectorizer_test = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}', vocabulary= vectorizer.get_feature_names())
test_feature_vectors_sparse = vectorizer_test.fit_transform(corpus_test)

In [5]:
test_data.head()

Unnamed: 0,Comment,Labels
0,Thank you for understanding. I think very high...,0
1,:Dear god this site is horrible.,0
2,"""::: Somebody will invariably try to add Relig...",0
3,""" \r\n\r\n It says it right there that it IS a...",0
4,""" \r\n\r\n == Before adding a new product to t...",0


In [6]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)),
    ('clf', LogisticRegression(multi_class='ovr'))
])


In [10]:
from sklearn.metrics import make_scorer, roc_auc_score

from sklearn.model_selection import GridSearchCV
# tfidf_transformer = TfidfTransformer()
parameters = {'clf__penalty':('l1','l2', None), 'clf__class_weight':(None, 'balanced'), 'clf__solver':('newton-cg', 'lbfgs', 'liblinear')}
grid = GridSearchCV(pipeline, parameters, scoring = make_scorer(roc_auc_score, average='macro'), cv=5, error_score=0.0, verbose=10)


In [None]:
grid.fit(train_feature_vectors_sparse, train_labels)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.1s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.7s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.5s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.2s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.5s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.8s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.4s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.9s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.7565277911390602, total=  30.5s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.7022500533559252, total=  33.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.7538148263740958, total=  28.7s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.7367088657258574, total=  30.7s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear ..
[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=liblinear, score=0.7329583916226549, total=  28.9s
[CV]

[CV]  clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear, score=0.8389510561033454, total=43.0min
[CV] clf__class_weight=balanced, clf__penalty=l1, clf__solver=liblinear 


In [7]:
grid.best_score_

0.76080052735964854

In [8]:
a = pd.DataFrame.from_dict(grid.cv_results_)
a

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__class_weight,param_clf__penalty,param_clf__solver,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.126878,0.0,0.0,0.0,,l1,newton-cg,"{'clf__class_weight': None, 'clf__penalty': 'l...",9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.005123,0.0,0.0,0.0
1,0.122108,0.0,0.0,0.0,,l1,lbfgs,"{'clf__class_weight': None, 'clf__penalty': 'l...",9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004195,0.0,0.0,0.0
2,12.170704,0.030099,0.602469,0.728202,,l1,liblinear,"{'clf__class_weight': None, 'clf__penalty': 'l...",8,0.578125,...,0.606735,0.726709,0.620543,0.754021,0.655817,0.761084,0.562364,0.000544,0.03583,0.026134
3,15.591796,0.027935,0.748731,0.90998,,l2,newton-cg,"{'clf__class_weight': None, 'clf__penalty': 'l...",5,0.755214,...,0.756104,0.911711,0.745988,0.910371,0.742666,0.911294,0.95605,0.001205,0.005765,0.001472
4,2.844136,0.025706,0.749201,0.908791,,l2,lbfgs,"{'clf__class_weight': None, 'clf__penalty': 'l...",4,0.75518,...,0.757808,0.9116,0.746821,0.906802,0.741379,0.909761,0.124904,0.000509,0.006259,0.001913
5,4.5176,0.027901,0.719302,0.795968,,l2,liblinear,"{'clf__class_weight': None, 'clf__penalty': 'l...",6,0.710758,...,0.733929,0.803316,0.734076,0.820811,0.701439,0.773936,0.193719,0.003286,0.012909,0.023032
6,0.122431,0.0,0.0,0.0,,,newton-cg,"{'clf__class_weight': None, 'clf__penalty': No...",9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004945,0.0,0.0,0.0
7,0.11922,0.0,0.0,0.0,,,lbfgs,"{'clf__class_weight': None, 'clf__penalty': No...",9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002178,0.0,0.0,0.0
8,0.117854,0.0,0.0,0.0,,,liblinear,"{'clf__class_weight': None, 'clf__penalty': No...",9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.001395,0.0,0.0,0.0
9,0.117963,0.0,0.0,0.0,balanced,l1,newton-cg,"{'clf__class_weight': 'balanced', 'clf__penalt...",9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003009,0.0,0.0,0.0


In [9]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False,
         use_idf=False)), ('clf', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False))])