In [6]:
import pandas as pd, numpy as np
import sklearn
import scipy.sparse
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from bs4 import BeautifulSoup
import re
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import make_scorer, roc_auc_score

In [3]:
train_data = pd.read_csv('../data/raw_data/train.csv')
test_data = pd.read_csv('../data/raw_data/test.csv')

train_labels=(train_data["Labels"])
test_labels = test_data["Labels"]

corpus_train = train_data["Comment"]
corpus_test = test_data["Comment"]

In [4]:
vectorizer = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}')
train_feature_vectors_sparse = vectorizer.fit_transform(corpus_train)
vectorizer_test = CountVectorizer(strip_accents='unicode',lowercase=True,stop_words='english', analyzer="word",token_pattern=r'\w{1,}', vocabulary= vectorizer.get_feature_names())
test_feature_vectors_sparse = vectorizer_test.fit_transform(corpus_test)

In [15]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfTransformer(smooth_idf=True)),
    ('clf', MultinomialNB())
])


In [16]:


from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
parameters = {'tfidf__norm':('l1','l2', None), 'tfidf__use_idf':(True, False), 'tfidf__sublinear_tf':(True, False)}
grid = GridSearchCV(pipeline, parameters, scoring = make_scorer(roc_auc_score, average='macro'), cv=5, error_score=0.0, verbose=10)


In [18]:
grid.fit(train_feature_vectors_sparse, train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.5121725731895224, total=   0.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.5117103235747303, total=   0.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.5104776579352851, total=   0.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.1s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.5134052388289676, total=   0.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True ...


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.1s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=True, score=0.5124807395993837, total=   0.6s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.2s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.5154083204930663, total=   0.3s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.7s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.5130970724191063, total=   0.3s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.3s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.5129429892141757, total=   0.3s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.8s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.5174114021571649, total=   0.3s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False ..


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.3s remaining:    0.0s


[CV]  tfidf__norm=l1, tfidf__sublinear_tf=True, tfidf__use_idf=False, score=0.5149460708782743, total=   0.3s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.513713405238829, total=   0.5s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.5124807395993837, total=   0.5s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.510939907550077, total=   0.5s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.5147919876733436, total=   0.5s
[CV] tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True ..
[CV]  tfidf__norm=l1, tfidf__sublinear_tf=False, tfidf__use_idf=True, score=0.513713405238829, to

[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8085440937548201, total=   0.2s
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8035956922057246, total=   0.2s
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8026361562321682, total=   0.2s
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8018308593257263, total=   0.2s
[CV] tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False 
[CV]  tfidf__norm=None, tfidf__sublinear_tf=False, tfidf__use_idf=False, score=0.8115525050519502, total=   0.2s


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   44.4s finished


GridSearchCV(cv=5, error_score=0.0,
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__norm': ('l1', 'l2', None), 'tfidf__use_idf': (True, False), 'tfidf__sublinear_tf': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score, average=macro), verbose=10)

In [19]:
grid.best_score_

0.8485721501929472

In [20]:
a = pd.DataFrame.from_dict(grid.cv_results_)
a

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfidf__norm,param_tfidf__sublinear_tf,param_tfidf__use_idf,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.593424,0.005066,0.106676,0.003205,l1,True,True,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': T...",0.512173,0.51171,...,0.512049,0.000962,12,0.513213,0.513482,0.513598,0.51275,0.513213,0.513251,0.000292
1,0.344644,0.0025,0.049635,0.000486,l1,True,False,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': T...",0.515408,0.513097,...,0.514761,0.001646,10,0.515678,0.516025,0.516333,0.5151,0.515562,0.51574,0.000419
2,0.548989,0.001356,0.09527,0.00248,l1,False,True,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': F...",0.513713,0.512481,...,0.513128,0.001316,11,0.513829,0.514599,0.514869,0.513945,0.514484,0.514345,0.000396
3,0.314826,0.001964,0.043231,0.000748,l1,False,False,"{'tfidf__norm': 'l1', 'tfidf__sublinear_tf': F...",0.516487,0.514176,...,0.515716,0.001538,9,0.516949,0.517373,0.517643,0.516834,0.516949,0.517149,0.000308
4,0.593819,0.019253,0.104877,0.002788,l2,True,True,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': T...",0.597823,0.599962,...,0.597962,0.002263,5,0.614157,0.61289,0.613669,0.613917,0.614841,0.613895,0.000637
5,0.34445,0.002568,0.049633,0.000493,l2,True,False,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': T...",0.589743,0.59202,...,0.589924,0.001442,7,0.598595,0.597761,0.597739,0.599053,0.599318,0.598493,0.00065
6,0.54879,0.004416,0.095469,0.002247,l2,False,True,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': F...",0.59695,0.600442,...,0.597888,0.002318,6,0.613399,0.612287,0.612681,0.613232,0.613434,0.613007,0.00045
7,0.317622,0.002576,0.043031,0.000628,l2,False,False,"{'tfidf__norm': 'l2', 'tfidf__sublinear_tf': F...",0.589453,0.591592,...,0.589535,0.001363,8,0.598429,0.596931,0.597607,0.597996,0.59827,0.597846,0.000536
8,0.558998,0.004228,0.098469,0.001741,,True,True,"{'tfidf__norm': None, 'tfidf__sublinear_tf': T...",0.849611,0.85127,...,0.848553,0.002233,2,0.939212,0.939713,0.940174,0.938856,0.938646,0.93932,0.00056
9,0.326635,0.001856,0.045603,0.000468,,True,False,"{'tfidf__norm': None, 'tfidf__sublinear_tf': T...",0.816726,0.809198,...,0.812731,0.00442,3,0.849489,0.84917,0.849271,0.85001,0.84901,0.84939,0.000347


In [1]:
grid.best_estimator_

NameError: name 'grid' is not defined

In [7]:
vectorizer = TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)
train_feature_vectors_sparse = vectorizer.fit_transform(train_feature_vectors_sparse)
test_feature_vectors_sparse = vectorizer.fit_transform(test_feature_vectors_sparse)

clf = MultinomialNB()

clf.fit(train_feature_vectors_sparse, train_labels)

predictions = clf.predict(test_feature_vectors_sparse)
confmat = confusion_matrix(test_labels, predictions)

precision = confmat[1,1]/(confmat[1,1] + confmat[1,0])
recall = confmat[1,1]/(confmat[1,1] + confmat[0,1])
fScore = f1_score(test_labels, predictions, average='macro')
ccr = (confmat[0,0] + confmat[1,1])/(sum(sum(confmat)))
roc_auc = roc_auc_score(test_labels, predictions)
print("Precision = ",precision)
print("Recall = ",recall)
print("F_score = ",fScore)
print("CCR = ",ccr)
print("ROC_AUC = ", roc_auc)
print()
print("Confusion Matrix: \n", confmat)
print()

Precision =  0.707512413904
Recall =  0.423368158727
F_score =  0.729636229736
CCR =  0.877426615399
ROC_AUC =  0.801656094368

Confusion Matrix: 
 [[51719  6016]
 [ 1826  4417]]

