In [28]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
import re
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer

In [29]:
train_data = pd.read_csv('../data/raw_data/train.csv')
test_data = pd.read_csv('../data/raw_data/test.csv')

train_labels=(train_data["Labels"])
test_labels = test_data["Labels"]

corpus_train = train_data["Comment"]
corpus_test = test_data["Comment"]

In [30]:
vectorizer = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}')
train_feature_vectors_sparse = vectorizer.fit_transform(corpus_train)
vectorizer_test = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}', vocabulary= vectorizer.get_feature_names())
test_feature_vectors_sparse = vectorizer_test.fit_transform(corpus_test)

In [7]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfTransformer(smooth_idf=True)),
    ('clf', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False))
])


In [8]:
from sklearn.metrics import make_scorer, roc_auc_score

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
parameters = {'tfidf__norm':('l1','l2', None), 'tfidf__use_idf':(True, False), 'tfidf__sublinear_tf':(True, False)}
grid = GridSearchCV(pipeline, parameters, scoring = make_scorer(roc_auc_score, average='macro'), cv=5, error_score=0.0, verbose=10)


In [9]:
grid.fit(train_feature_vectors_sparse, train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.876424033540372, total=   5.0s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.8772954267186801, total=   6.1s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.8s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.8719025145461069, total=   6.7s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   19.1s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.8689720528893059, total=   6.3s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   26.0s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.8730972626784614, total=   5.9s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   32.4s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.8699569190453184, total=   9.4s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   42.0s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.8656153135347872, total=   6.8s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   48.9s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.8674439907899424, total=   5.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   54.7s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.8613360205407009, total=   5.5s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.0min remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.8656967542130221, total=   5.7s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.875973473181622, total=   5.4s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.8737137513603678, total=   5.8s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.8705914448537289, total=   5.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.8679574706921318, total=   6.2s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.8717076330709671, 

[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8932212257541989, total= 1.2min
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8934415396689949, total= 1.1min
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8895049822421913, total= 1.8min
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.892781371953761, total= 1.1min
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8886996853357494, total= 1.0min


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 31.4min finished


GridSearchCV(cv=5, error_score=0.0,
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__norm': ('l1', 'l2', None), 'tfidf__use_idf': (True, False), 'tfidf__sublinear_tf': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score, average=macro), verbose=10)

In [16]:
grid.best_score_

0.9018120739259452

In [17]:
a = pd.DataFrame.from_dict(grid.cv_results_)
a

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_tfidf__norm,param_tfidf__sublinear_tf,param_tfidf__use_idf,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,5.779844,0.197611,0.873538,0.899872,l1,True,True,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': T...",7,0.876424,...,0.871903,0.899266,0.868972,0.901213,0.873097,0.899722,0.566887,0.031128,0.003039,0.000703
1,6.545013,0.083951,0.86601,0.884674,l1,True,False,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': T...",11,0.869957,...,0.867444,0.884137,0.861336,0.885625,0.865697,0.884555,1.465721,0.011201,0.002819,0.000805
2,5.640599,0.166581,0.871989,0.899668,l1,False,True,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': F...",8,0.875973,...,0.870591,0.899502,0.867957,0.901377,0.871708,0.898613,0.268764,0.022087,0.002725,0.000933
3,6.445744,0.080625,0.865087,0.884227,l1,False,False,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': F...",12,0.869079,...,0.866171,0.883666,0.86001,0.885094,0.866069,0.884196,1.274221,0.026144,0.002994,0.000723
4,7.280396,0.175471,0.900287,0.967755,l2,True,True,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': T...",2,0.901069,...,0.90119,0.967301,0.898442,0.96802,0.900195,0.967993,0.449275,0.015337,0.00099,0.000258
5,10.096277,0.084227,0.901812,0.950805,l2,True,False,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': T...",1,0.90252,...,0.902978,0.950529,0.900121,0.950599,0.901716,0.950992,0.800501,0.010093,0.000974,0.00034
6,6.76968,0.164669,0.898498,0.966752,l2,False,True,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': F...",4,0.899359,...,0.899065,0.966085,0.897277,0.966803,0.898221,0.967314,0.292792,0.024074,0.000726,0.000409
7,10.195719,0.072587,0.899477,0.94932,l2,False,False,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': F...",3,0.899927,...,0.901426,0.949258,0.899533,0.948402,0.898562,0.949633,0.493661,0.010294,0.001201,0.000499
8,43.090925,0.128365,0.869194,0.998451,,True,True,"{'tfidf__norm': None, 'tfidf__sublinear_tf': T...",9,0.869955,...,0.865081,0.998502,0.868832,0.998525,0.86996,0.998395,4.874078,0.022862,0.00232,5.2e-05
9,16.289951,0.073058,0.893842,0.98629,,True,False,"{'tfidf__norm': None, 'tfidf__sublinear_tf': T...",5,0.896393,...,0.892348,0.98635,0.893171,0.986822,0.891424,0.986226,2.391543,0.005185,0.001958,0.000298


In [21]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=True, use_idf=False)), ('clf', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False))])

In [31]:
tfidf_transformer = TfidfTransformer(norm= 'l2', smooth_idf=True, sublinear_tf=True, use_idf=False)
train_feature_vectors_sparse = tfidf_transformer.fit_transform(train_feature_vectors_sparse)
test_feature_vectors_sparse = tfidf_transformer.transform(test_feature_vectors_sparse)

clf = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

clf.fit(train_feature_vectors_sparse, train_labels)

predictions = clf.predict(test_feature_vectors_sparse)
confmat = confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = f1_score(test_labels, predictions, average='macro')
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))
roc_auc = roc_auc_score(test_labels, predictions)
print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print("ROC_AUC = ", roc_auc)
print()
print("Confusion Matrix: \n", confmat)
print()

Precision =  0.911580970687
Recall =  0.453105095541
F_score =  0.768670899775
CCR =  0.884007002407
ROC_AUC =  0.896303172622

Confusion Matrix: 
 [[50866  6869]
 [  552  5691]]

