# OffensiveEval Model Tuning
After already using the other notebook to test out the basic capabilities of several of the most popular models, in this notebook I will start using different methods to try and increase the accuracy of the models.

The first part of this will be me trying to tune the vectoriser. I will make a small pipeline to test several different variations of BoW on a naive bayes model. This is just because it has the fastest time to train and run and will lead to the quickest pipelines.

I will be trying to use k-fold cross validation to verify my results.

In [1]:
import pandas as pd
import numpy as np
import logging
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
path = 'data/olid-training-v2.0.tsv'
testset = pd.read_table(path, header=None, names=['id','tweet','sub_a','sub_b','sub_c'])
# Just gonna be using the training data to do all this testing, at least to start with.
# Just so I can try my best to avoid overfitting of any kind

In [4]:
testset['label_a_num'] = testset.sub_a.map({'NOT':0, 'OFF':1})

In [5]:
x = testset.tweet
y = testset.label_a_num
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=1)

In [6]:
pipe = Pipeline(steps=[('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0, 0.9),
    'vect__stop_words': ('english',),
    'vect__min_df': (2, 0.1, 3, 0.2, 4),
    'vect__ngram_range': ((1, 1), (1, 2),),  
    'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1','l2'),
#     'clf__max_iter': (100000,),
    'clf__penalty': ('l1','l2', 'elasticnet'),
    'clf__alpha': (0.0001,0.00001,0.0002,0.00002),
}


if __name__ == "__main__":
    grid_pipeline = GridSearchCV(pipe,parameters,n_jobs=8,verbose=1, scoring='f1')

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipe.steps])
    print("classifier: SGDClassifier")
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_pipeline.fit(x_train,y_train)
    print("done in %0.3fs" % (time() - t0))
    print("scoring paramater: f1")

    print("Best score: %0.3f" % grid_pipeline.best_score_)
    print("Best parameters set:")
    best_parameters = grid_pipeline.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
classifier: SGDClassifier
parameters:
{'clf__alpha': (0.0001, 1e-05, 0.0002, 2e-05),
 'clf__penalty': ('l1', 'l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0, 0.9),
 'vect__min_df': (2, 0.1, 3, 0.2, 4),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': ('english',)}
Fitting 5 folds for each of 960 candidates, totalling 4800 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   12.7s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   26.5s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   45.9s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  2.4min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  3.0min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  3.8min
[Parallel(n_jobs=8)]: Done 4800 out of 4800 | elapsed:  4.6min finished


done in 274.767s
scoring paramater: f1
Best score: 0.586
Best parameters set:
	clf__alpha: 2e-05
	clf__penalty: 'l1'
	tfidf__use_idf: False
	vect__max_df: 0.9
	vect__min_df: 3
	vect__ngram_range: (1, 1)
	vect__stop_words: 'english'


In [7]:
vect_vanilla = CountVectorizer(stop_words='english',)
vect_maxDF = CountVectorizer(stop_words='english', max_df=0.9)
vect_minDF = CountVectorizer(stop_words='english', min_df=2)
vect_2gram = CountVectorizer(stop_words='english', ngram_range=(1, 2))
vect_3gram = CountVectorizer(stop_words='english', ngram_range=(1, 3))
vect_combined = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2)
vect_gridsearch1 = CountVectorizer(max_df=0.75, stop_words='english', ngram_range=(1, 1), min_df=3)
vect_gridsearch2 = CountVectorizer(max_df=1.0, stop_words='english', ngram_range=(1, 1), min_df=4)
vect_gridsearch3 = CountVectorizer(max_df=0.5, stop_words='english', ngram_range=(1, 2), min_df=4)

In [8]:
vectoriser = [
    vect_vanilla,
    vect_minDF,
    vect_2gram,
    vect_3gram,
    vect_combined,
    vect_maxDF,
    vect_gridsearch1,
    vect_gridsearch2,
    vect_gridsearch3
]

In [9]:
for vect in vectoriser:
    pipe = Pipeline(steps=[('vectoriser', vect),
                           ('tfidf', TfidfTransformer(use_idf=True)),
                          ('classifier', SGDClassifier(penalty='elasticnet'))])
    pipe.fit(x_train,y_train)
    print(vect)
    print("model score: %.3f" % pipe.score(x_test, y_test))

CountVectorizer(stop_words='english')
model score: 0.776
CountVectorizer(min_df=2, stop_words='english')
model score: 0.774
CountVectorizer(ngram_range=(1, 2), stop_words='english')
model score: 0.769
CountVectorizer(ngram_range=(1, 3), stop_words='english')
model score: 0.760
CountVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english')
model score: 0.774
CountVectorizer(max_df=0.9, stop_words='english')
model score: 0.774
CountVectorizer(max_df=0.75, min_df=3, stop_words='english')
model score: 0.773
CountVectorizer(min_df=4, stop_words='english')
model score: 0.773
CountVectorizer(max_df=0.5, min_df=4, ngram_range=(1, 2), stop_words='english')
model score: 0.773


In [10]:
vect = CountVectorizer(stop_words='english', min_df=2, max_df=0.5)
tf_idf = TfidfTransformer()

In [11]:
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)
nb = SGDClassifier(penalty='l1', alpha=2e-05)
obj_nb = %timeit -o nb.fit(x_train_dtm, y_train)
y_pred_class = nb.predict(x_test_dtm)
nb_acc = metrics.accuracy_score(y_test, y_pred_class)
metrics.confusion_matrix(y_test, y_pred_class)

106 ms ± 2.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


array([[1709,  503],
       [ 470,  628]], dtype=int64)

In [22]:
x_train_dtm

<9930x7115 sparse matrix of type '<class 'numpy.int64'>'
	with 82989 stored elements in Compressed Sparse Row format>

In [12]:
nb_acc

0.7060422960725076

In [13]:
precision_score(y_test, y_pred_class)

0.5552608311229

In [14]:
recall_score(y_test, y_pred_class)

0.5719489981785064

In [18]:
f1_score(y_test, y_pred_class, average='macro')

0.670945883331857

In [16]:
x_test[(y_pred_class==0) & (y_test==0)]

11561          @USER We the People   Not you the poitician
7659                    @USER Just announced on ESPN he is
9151         @USER 😉😍😘You are welcome anytime my dear.😉😍😘😘
3953     He can shout out to me any time he wishes.... ...
4732     @USER I have him in 2 leagues and he is probab...
                               ...                        
3472     @USER “Because you are alone” dang Mama Chu is...
5231     @USER You 100% believe AB would be the player ...
571      VOTING IS NOT OPTIONAL!!! CERTAINLY NOT THIS N...
6181     @USER So you are advocating for a 35-person de...
4050     @USER Lol same. Mom: why don’t you do art of r...
Name: tweet, Length: 1709, dtype: object

In [17]:
x_test[4050]

'@USER Lol same. Mom: why don’t you do art of real people? Me: he is a real people.'