In [1]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.metrics import confusion_matrix
from bs4 import BeautifulSoup
import re
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer


In [2]:
train_data = pd.read_csv('../data/raw_data/train.csv')
test_data = pd.read_csv('../data/raw_data/test.csv')

train_labels=(train_data["Labels"])
test_labels = test_data["Labels"]

corpus_train = train_data["Comment"]
corpus_test = test_data["Comment"]

In [3]:
vectorizer = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}')
train_feature_vectors_sparse = vectorizer.fit_transform(corpus_train)
vectorizer_test = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}', vocabulary= vectorizer.get_feature_names())
test_feature_vectors_sparse = vectorizer_test.fit_transform(corpus_test)

In [5]:
test_data.head()

Unnamed: 0,Comment,Labels
0,Thank you for understanding. I think very high...,0
1,:Dear god this site is horrible.,0
2,"""::: Somebody will invariably try to add Relig...",0
3,""" \r\n\r\n It says it right there that it IS a...",0
4,""" \r\n\r\n == Before adding a new product to t...",0


In [4]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)),
    ('clf', LogisticRegression(multi_class='ovr'))
])


In [5]:
from sklearn.metrics import make_scorer, roc_auc_score

from sklearn.model_selection import GridSearchCV
# tfidf_transformer = TfidfTransformer()
parameters = {'clf__penalty':('l1','l2', None), 'clf__class_weight':(None, 'balanced'), 'clf__solver':('newton-cg', 'lbfgs')}
grid = GridSearchCV(pipeline, parameters, scoring = make_scorer(roc_auc_score, average='macro'), cv=5, error_score=0.0, verbose=10)


In [6]:
grid.fit(train_feature_vectors_sparse, train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg ..


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.7s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=newton-cg, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.2s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.6s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.0s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.5s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.4s
[CV] clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs ......


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.0s remaining:    0.0s


[CV]  clf__class_weight=None, clf__penalty=l1, clf__solver=lbfgs, score=0.0, total=   0.3s
[CV] clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg, score=0.8557750030499554, total= 2.6min
[CV] clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg, score=0.8552948506735476, total= 4.1min
[CV] clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg, score=0.8493465692932902, total= 3.7min
[CV] clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg, score=0.8522131865749344, total= 3.7min
[CV] clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg ..
[CV]  clf__class_weight=None, clf__penalty=l2, clf__solver=newton-cg, score=0.8548383625849926, total= 2.9min
[CV]

[CV]  clf__class_weight=balanced, clf__penalty=None, clf__solver=lbfgs, score=0.0, total=   0.3s
[CV] clf__class_weight=balanced, clf__penalty=None, clf__solver=lbfgs 
[CV]  clf__class_weight=balanced, clf__penalty=None, clf__solver=lbfgs, score=0.0, total=   0.3s
[CV] clf__class_weight=balanced, clf__penalty=None, clf__solver=lbfgs 
[CV]  clf__class_weight=balanced, clf__penalty=None, clf__solver=lbfgs, score=0.0, total=   0.3s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 37.1min finished


GridSearchCV(cv=5, error_score=0.0,
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__penalty': ('l1', 'l2', None), 'clf__class_weight': (None, 'balanced'), 'clf__solver': ('newton-cg', 'lbfgs')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score, average=macro), verbose=10)

In [7]:
grid.best_score_

0.8740899829366494

In [8]:
a = pd.DataFrame.from_dict(grid.cv_results_)
a

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__penalty,param_clf__solver,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.443544,0.009228,0.0,0.0,,l1,newton-cg,"{'clf__class_weight': None, 'clf__penalty': 'l...",0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.44415,0.011582,0.0,0.0,,l1,lbfgs,"{'clf__class_weight': None, 'clf__penalty': 'l...",0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,202.772914,32.888484,0.086563,0.006883,,l2,newton-cg,"{'clf__class_weight': None, 'clf__penalty': 'l...",0.855775,0.855295,...,0.853494,0.002412,4,0.993983,0.993721,0.994059,0.994551,0.993781,0.994019,0.000294
3,6.817389,0.176703,0.095347,0.005744,,l2,lbfgs,"{'clf__class_weight': None, 'clf__penalty': 'l...",0.862682,0.860254,...,0.859127,0.002192,3,0.985151,0.984474,0.980524,0.973887,0.989488,0.982705,0.005247
4,0.459138,0.005947,0.0,0.0,,,newton-cg,"{'clf__class_weight': None, 'clf__penalty': No...",0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.446946,0.003122,0.0,0.0,,,lbfgs,"{'clf__class_weight': None, 'clf__penalty': No...",0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.457137,0.011194,0.0,0.0,balanced,l1,newton-cg,"{'clf__class_weight': 'balanced', 'clf__penalt...",0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.464536,0.00851,0.0,0.0,balanced,l1,lbfgs,"{'clf__class_weight': 'balanced', 'clf__penalt...",0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,223.258889,36.313532,0.09415,0.016161,balanced,l2,newton-cg,"{'clf__class_weight': 'balanced', 'clf__penalt...",0.867568,0.868812,...,0.867007,0.002204,2,0.998422,0.998313,0.998408,0.998456,0.998305,0.99838,6.1e-05
9,7.433791,0.505775,0.085953,0.001553,balanced,l2,lbfgs,"{'clf__class_weight': 'balanced', 'clf__penalt...",0.877902,0.876271,...,0.87409,0.003648,1,0.993786,0.99473,0.991487,0.992079,0.995651,0.993546,0.001567


In [9]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))])

In [10]:
vectorizer = TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)
train_feature_vectors_sparse = vectorizer.fit_transform(train_feature_vectors_sparse)
test_feature_vectors_sparse = vectorizer.fit_transform(test_feature_vectors_sparse)

clf = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

clf.fit(train_feature_vectors_sparse, train_labels)

predictions = clf.predict(test_feature_vectors_sparse)
confmat = confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = f1_score(test_labels, predictions, average='macro')
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))
roc_auc = roc_auc_score(test_labels, predictions)
print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print("ROC_AUC = ", roc_auc)
print()
print("Confusion Matrix: \n", confmat)
print()

Precision =  0.8643280474131027
Recall =  0.48169969648277094
F_score =  0.779213914057477
CCR =  0.8960111288255338
ROC_AUC =  0.8818825653190914

Confusion Matrix: 
 [[51929  5806]
 [  847  5396]]

