# Paper 2: Thesis paper using SVM + BOW and Bigrams

In [21]:
import json
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

In [22]:
def printScores(scores):
    """Print the fold-averaged macro and micro test metrics.

    `scores` is indexed with the keys ``test_precision_*``, ``test_recall_*``
    and ``test_f1_*`` (``*`` being ``micro`` and ``macro``); every value must
    be a non-empty sequence of numbers accepted by ``statistics.mean``.
    Nothing is returned; the six averages are written to stdout.
    """
    from statistics import mean

    # Average everything up front so that a missing key fails before any
    # line has been printed.
    wanted = ('test_precision_micro', 'test_precision_macro',
              'test_recall_micro', 'test_recall_macro',
              'test_f1_micro', 'test_f1_macro')
    averages = {}
    for key in wanted:
        averages[key] = mean(scores[key])

    # (heading, ((line template, key of the average to show), ...))
    report = (
        ("Macro", (("average precision is %f", 'test_precision_macro'),
                   ("average recall is %f", 'test_recall_macro'),
                   ("average f1 is %f", 'test_f1_macro'))),
        ("Micro", (("average precision is %f", 'test_precision_micro'),
                   ("average recall is %f", 'test_recall_micro'),
                   ("average f1 is %f", 'test_f1_micro'))),
    )
    for heading, rows in report:
        print(heading)
        for template, key in rows:
            print(template % averages[key])

### Load data

In [23]:
json_samples = []  # not populated by this cell
X = []  # text of every kept sample
y = []  # 0 when "no_hs" is found in the sample's label, 1 otherwise

# Positions (within the JSON array) of the samples assigned to each class.
indices_1 = []
indices_0 = []

# NOTE(review): the record at this position is skipped on purpose, but the
# notebook does not say why -- TODO document the reason.
SKIPPED_INDEX = 1197

#with open('../twitter-hatespeech/data/data_manual.json') as json_file:
# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# decoding does not depend on the platform's locale default.
with open('data_manual_axel_plus_turk.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Everything is in memory now, so the labelling loop runs after the file has
# been closed.
for i, p in enumerate(data):
    if i == SKIPPED_INDEX:
        continue

    text = p['text']
    X.append(text)
    if "no_hs" in p["label"]:
        y.append(0)
        indices_0.append(i)
    else:
        y.append(1)
        indices_1.append(i)
            
    

In [24]:
# Sanity check, one number per line: label count, sample count, size of
# class 0, size of class 1, then the lengths of the two index lists.
for count in (len(y), len(X), y.count(0), y.count(1),
              len(indices_0), len(indices_1)):
    print(count)

658
658
289
369
289
369


### Extract features

In [25]:
# Baseline text classifier.  The manual CountVectorizer / TfidfTransformer
# walk-through that used to sit here as a disabled string block is covered by
# the first two pipeline steps below.
baseline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),  # IDF weighting switched off
    ('clf', SGDClassifier(loss='hinge',          # hinge gives an SVM
                          penalty='l2',
                          alpha=1e-3,
                          random_state=42,
                          max_iter=5,
                          tol=None)),
]
text_clf = Pipeline(baseline_steps)

# The pipeline is left unfitted here (no text_clf.fit(X, y) call).

In [26]:
# Hyper-parameter grid explored for the SGD baseline pipeline.
parameters = {}
parameters['vect__ngram_range'] = [(1, 1), (1, 2)]
parameters['tfidf__use_idf'] = (True, False)
parameters['clf__alpha'] = (1e-2, 1e-3)

# Scorers: the four plain binary metrics followed by the micro / macro /
# weighted variants of precision, recall and F1.
scoring = ['accuracy', 'precision', 'recall', 'f1'] + [
    metric + '_' + average
    for metric in ('precision', 'recall', 'f1')
    for average in ('micro', 'macro', 'weighted')
]

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# it has to be dropped when the notebook is moved to a newer release.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters,
                      iid=False, n_jobs=-1)
gs_clf.fit(X, y)  # fit() returns the search object itself

# The search is passed as the estimator of a 10-fold cross-validation, while
# best_score_ below comes from the search fitted on all of X, y above.
scores = cross_validate(gs_clf, X, y, cv=10, scoring=scoring)
print(scores)
print(gs_clf.best_score_)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'train_recall_weighted': array([0.98648649, 0.56081081, 0.89189189, 0.9847973 , 0.99155405,
       1.        , 1.        , 0.56081081, 0.98817568, 0.56060606]), 'train_precision_macro': array([0.98507463, 0.28040541, 0.8938231 , 0.98327138, 0.99087568,
       1.        , 1.        , 0.28040541, 0.98864626, 0.28030303]), 'test_recall_macro': array([0.57688723, 0.5       , 0.47017707, 0.43802423, 0.42823858,
       0.54007456, 0.54473439, 0.5       , 0.54706431, 0.5       ]), 'test_f1': array([0.6       , 0.7184466 , 0.30188679, 0.38709677, 0.33898305,
       0.52307692, 0.63291139, 0.7184466 , 0.6744186 , 0.72      ]), 'test_recall_micro': array([0.57575758, 0.56060606, 0.43939394, 0.42424242, 0.40909091,
       0.53030303, 0.56060606, 0.56060606, 0.57575758, 0.5625    ]), 'train_f1_macro': array([0.98632668, 0.35930736, 0.89164951, 0.98462315, 0.99143774,
       1.        , 1.        , 0.35930736, 0.9879828 , 0.3592233 ]), 'test_f1_macro': array([0.57419355, 0.3592233 , 0.41676618, 0.

In [27]:
# Best score found by the fitted search, then the winning value of every
# tuned parameter in alphabetical order.
print(gs_clf.best_score_)
best_setting = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_setting[param_name]))

0.5683893731838937
clf__alpha: 0.001
tfidf__use_idf: False
vect__ngram_range: (1, 2)


## Experiments:

### BOW + Counts

In [28]:
# BOW + Counts: raw token counts fed straight into a linear-kernel SVC.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', svm.SVC(kernel='linear', C=1, random_state=0)),
])
print(text_clf.get_params().keys())

# Ten candidate values for C, log-spaced from 1e-6 to 1e3.
Cs = np.logspace(-6, 3, 10)
parameters = {}
parameters['vect__ngram_range'] = [(1, 1)]  # single-token features only
parameters["clf__C"] = Cs

# Scorers: the four plain binary metrics followed by the micro / macro /
# weighted variants of precision, recall and F1.
scoring = ['accuracy', 'precision', 'recall', 'f1'] + [
    metric + '_' + average
    for metric in ('precision', 'recall', 'f1')
    for average in ('micro', 'macro', 'weighted')
]

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters,
                      iid=False, n_jobs=-1)
gs_clf.fit(X, y)  # fit() returns the search object itself

# 10-fold cross-validation with the grid search as the estimator.
scores1 = cross_validate(gs_clf, X, y, cv=10, scoring=scoring)
print(scores1)
print(gs_clf.best_score_)

dict_keys(['vect__input', 'clf__probability', 'vect__min_df', 'vect__vocabulary', 'clf__class_weight', 'clf__coef0', 'clf__kernel', 'vect__dtype', 'vect__decode_error', 'vect__max_df', 'clf__degree', 'memory', 'vect__analyzer', 'clf', 'clf__shrinking', 'vect__token_pattern', 'clf__tol', 'clf__max_iter', 'vect__binary', 'clf__verbose', 'vect__tokenizer', 'vect', 'vect__max_features', 'vect__strip_accents', 'clf__cache_size', 'vect__ngram_range', 'clf__C', 'vect__encoding', 'clf__gamma', 'clf__random_state', 'vect__stop_words', 'vect__preprocessor', 'vect__lowercase', 'clf__decision_function_shape', 'steps'])


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'train_recall_weighted': array([1.        , 0.56081081, 1.        , 1.        , 1.        ,
       0.71114865, 1.        , 0.97804054, 0.98141892, 0.56060606]), 'train_precision_macro': array([1.        , 0.28040541, 1.        , 1.        , 1.        ,
       0.78093188, 1.        , 0.98115942, 0.98277309, 0.28030303]), 'test_recall_macro': array([0.56570363, 0.5       , 0.42684063, 0.50186393, 0.49673812,
       0.47809879, 0.56057782, 0.5778192 , 0.60857409, 0.5       ]), 'test_f1': array([0.63157895, 0.7184466 , 0.44117647, 0.52173913, 0.6       ,
       0.64444444, 0.68965517, 0.69767442, 0.72093023, 0.72      ]), 'test_recall_micro': array([0.57575758, 0.56060606, 0.42424242, 0.5       , 0.51515152,
       0.51515152, 0.59090909, 0.60606061, 0.63636364, 0.5625    ]), 'train_f1_macro': array([1.        , 0.35930736, 1.        , 1.        , 1.        ,
       0.66535208, 1.        , 0.97757831, 0.98108203, 0.3592233 ]), 'test_f1_macro': array([0.56578947, 0.3592233 , 0.42371324, 0.

In [29]:
# Best score found by the fitted search, then the winning value of every
# tuned parameter in alphabetical order.
print(gs_clf.best_score_)
best_setting = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_setting[param_name]))

0.5714335132143351
clf__C: 1.0
vect__ngram_range: (1, 1)


In [30]:
# Print the fold-averaged macro and micro test precision / recall / F1 of the
# BOW + Counts cross-validation run.
printScores(scores1)

Macro
average precision is 0.484169
average recall is 0.521622
average f1 is 0.485192
Micro
average precision is 0.548674
average recall is 0.548674
average f1 is 0.548674


### BOW + Frequencies

In [31]:
# BOW + Frequencies: token counts rescaled by TfidfTransformer with IDF
# weighting switched off, then a linear-kernel SVC.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', svm.SVC(kernel='linear', C=1, random_state=0)),
])
print(text_clf.get_params().keys())

# Ten candidate values for C, log-spaced from 1e-6 to 1e3.
Cs = np.logspace(-6, 3, 10)
parameters = {}
parameters['vect__ngram_range'] = [(1, 1)]  # single-token features only
parameters["clf__C"] = Cs

# Scorers: the four plain binary metrics followed by the micro / macro /
# weighted variants of precision, recall and F1.
scoring = ['accuracy', 'precision', 'recall', 'f1'] + [
    metric + '_' + average
    for metric in ('precision', 'recall', 'f1')
    for average in ('micro', 'macro', 'weighted')
]

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters,
                      iid=False, n_jobs=-1)
gs_clf.fit(X, y)  # fit() returns the search object itself

# 10-fold cross-validation with the grid search as the estimator.
scores2 = cross_validate(gs_clf, X, y, cv=10, scoring=scoring)
print(scores2)
print(gs_clf.best_score_)

dict_keys(['vect__input', 'clf__probability', 'vect__min_df', 'vect__vocabulary', 'clf__class_weight', 'clf__coef0', 'clf__kernel', 'vect__dtype', 'clf__gamma', 'vect__decode_error', 'vect__max_df', 'tfidf__sublinear_tf', 'memory', 'vect__analyzer', 'clf', 'clf__shrinking', 'tfidf__norm', 'vect__token_pattern', 'tfidf__smooth_idf', 'clf__max_iter', 'vect__binary', 'clf__verbose', 'vect__tokenizer', 'tfidf__use_idf', 'clf__degree', 'vect', 'vect__max_features', 'vect__strip_accents', 'clf__cache_size', 'tfidf', 'vect__ngram_range', 'clf__C', 'vect__encoding', 'clf__tol', 'clf__random_state', 'vect__stop_words', 'vect__preprocessor', 'vect__lowercase', 'clf__decision_function_shape', 'steps'])


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'train_recall_weighted': array([0.875     , 0.56081081, 0.88513514, 0.86655405, 0.86993243,
       0.86148649, 0.56081081, 0.56081081, 0.56081081, 0.56060606]), 'train_precision_macro': array([0.88378378, 0.28040541, 0.8858734 , 0.86909728, 0.87145796,
       0.86352846, 0.28040541, 0.28040541, 0.28040541, 0.28030303]), 'test_recall_macro': array([0.55452004, 0.5       , 0.44408201, 0.52889096, 0.44874185,
       0.53494874, 0.5       , 0.5       , 0.5       , 0.5       ]), 'test_f1': array([0.65853659, 0.7184466 , 0.44776119, 0.56338028, 0.56790123,
       0.60526316, 0.7184466 , 0.7184466 , 0.7184466 , 0.72      ]), 'test_recall_micro': array([0.57575758, 0.56060606, 0.43939394, 0.53030303, 0.46969697,
       0.54545455, 0.56060606, 0.56060606, 0.56060606, 0.5625    ]), 'train_f1_macro': array([0.87052996, 0.35930736, 0.88265883, 0.86302678, 0.86678473,
       0.85790392, 0.35930736, 0.35930736, 0.35930736, 0.3592233 ]), 'test_f1_macro': array([0.54926829, 0.3592233 , 0.43926521, 0.

In [32]:
# Best score found by the fitted search, then the winning value of every
# tuned parameter in alphabetical order.
print(gs_clf.best_score_)
best_setting = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_setting[param_name]))

0.5607928601079286
clf__C: 1e-06
vect__ngram_range: (1, 1)


In [33]:
# Print the fold-averaged macro and micro test precision / recall / F1 of the
# BOW + Frequencies cross-validation run.
printScores(scores2)

Macro
average precision is 0.391610
average recall is 0.501118
average f1 is 0.428861
Micro
average precision is 0.536553
average recall is 0.536553
average f1 is 0.536553


### Bigrams + Counts

In [34]:
# Bigrams + Counts: raw counts of 1- and 2-token n-grams fed straight into a
# linear-kernel SVC.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', svm.SVC(kernel='linear', C=1, random_state=0)),
])
print(text_clf.get_params().keys())

# Ten candidate values for C, log-spaced from 1e-6 to 1e3.
Cs = np.logspace(-6, 3, 10)
parameters = {}
parameters['vect__ngram_range'] = [(1, 2)]  # n-grams of length 1 and 2
parameters["clf__C"] = Cs

# Scorers: the four plain binary metrics followed by the micro / macro /
# weighted variants of precision, recall and F1.
scoring = ['accuracy', 'precision', 'recall', 'f1'] + [
    metric + '_' + average
    for metric in ('precision', 'recall', 'f1')
    for average in ('micro', 'macro', 'weighted')
]

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters,
                      iid=False, n_jobs=-1)
gs_clf.fit(X, y)  # fit() returns the search object itself

# 10-fold cross-validation with the grid search as the estimator.
scores3 = cross_validate(gs_clf, X, y, cv=10, scoring=scoring)
print(scores3)
print(gs_clf.best_score_)

dict_keys(['vect__input', 'clf__probability', 'vect__min_df', 'vect__vocabulary', 'clf__class_weight', 'clf__coef0', 'clf__kernel', 'vect__dtype', 'vect__decode_error', 'vect__max_df', 'clf__degree', 'memory', 'vect__analyzer', 'clf', 'clf__shrinking', 'vect__token_pattern', 'clf__tol', 'clf__max_iter', 'vect__binary', 'clf__verbose', 'vect__tokenizer', 'vect', 'vect__max_features', 'vect__strip_accents', 'clf__cache_size', 'vect__ngram_range', 'clf__C', 'vect__encoding', 'clf__gamma', 'clf__random_state', 'vect__stop_words', 'vect__preprocessor', 'vect__lowercase', 'clf__decision_function_shape', 'steps'])


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'train_recall_weighted': array([1.        , 0.56081081, 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.56081081, 1.        , 0.56060606]), 'train_precision_macro': array([1.        , 0.28040541, 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.28040541, 1.        , 0.28030303]), 'test_recall_macro': array([0.53727866, 0.5       , 0.4193849 , 0.4193849 , 0.48695247,
       0.51164958, 0.62581547, 0.5       , 0.63560112, 0.5       ]), 'test_f1': array([0.65060241, 0.7184466 , 0.47222222, 0.47222222, 0.57142857,
       0.55555556, 0.72941176, 0.7184466 , 0.75      , 0.72      ]), 'test_recall_micro': array([0.56060606, 0.56060606, 0.42424242, 0.42424242, 0.5       ,
       0.51515152, 0.65151515, 0.56060606, 0.66666667, 0.5625    ]), 'train_f1_macro': array([1.        , 0.35930736, 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.35930736, 1.        , 0.3592233 ]), 'test_f1_macro': array([0.52938284, 0.3592233 , 0.41944444, 0.

In [35]:
# Best score found by the fitted search, then the winning value of every
# tuned parameter in alphabetical order.
print(gs_clf.best_score_)
best_setting = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_setting[param_name]))

0.5866472948664729
clf__C: 1.0
vect__ngram_range: (1, 2)


In [36]:
# Print the fold-averaged macro and micro test precision / recall / F1 of the
# Bigrams + Counts cross-validation run.
printScores(scores3)

Macro
average precision is 0.457039
average recall is 0.513607
average f1 is 0.468857
Micro
average precision is 0.542614
average recall is 0.542614
average f1 is 0.542614


### Bigrams + Frequencies

In [37]:
# Bigrams + Frequencies: counts of 1- and 2-token n-grams rescaled by
# TfidfTransformer with IDF weighting switched off, then a linear-kernel SVC.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', svm.SVC(kernel='linear', C=1, random_state=0)),
])
print(text_clf.get_params().keys())

# Ten candidate values for C, log-spaced from 1e-6 to 1e3.
Cs = np.logspace(-6, 3, 10)
parameters = {}
parameters['vect__ngram_range'] = [(1, 2)]  # n-grams of length 1 and 2
parameters["clf__C"] = Cs

# Scorers: the four plain binary metrics followed by the micro / macro /
# weighted variants of precision, recall and F1.
scoring = ['accuracy', 'precision', 'recall', 'f1'] + [
    metric + '_' + average
    for metric in ('precision', 'recall', 'f1')
    for average in ('micro', 'macro', 'weighted')
]

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters,
                      iid=False, n_jobs=-1)
gs_clf.fit(X, y)  # fit() returns the search object itself

# 10-fold cross-validation with the grid search as the estimator.
scores4 = cross_validate(gs_clf, X, y, cv=10, scoring=scoring)
print(scores4)
print(gs_clf.best_score_)

dict_keys(['vect__input', 'clf__probability', 'vect__min_df', 'vect__vocabulary', 'clf__class_weight', 'clf__coef0', 'clf__kernel', 'vect__dtype', 'clf__gamma', 'vect__decode_error', 'vect__max_df', 'tfidf__sublinear_tf', 'memory', 'vect__analyzer', 'clf', 'clf__shrinking', 'tfidf__norm', 'vect__token_pattern', 'tfidf__smooth_idf', 'clf__max_iter', 'vect__binary', 'clf__verbose', 'vect__tokenizer', 'tfidf__use_idf', 'clf__degree', 'vect', 'vect__max_features', 'vect__strip_accents', 'clf__cache_size', 'tfidf', 'vect__ngram_range', 'clf__C', 'vect__encoding', 'clf__tol', 'clf__random_state', 'vect__stop_words', 'vect__preprocessor', 'vect__lowercase', 'clf__decision_function_shape', 'steps'])


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'train_recall_weighted': array([0.96114865, 0.56081081, 0.96790541, 0.95945946, 0.96452703,
       0.56081081, 1.        , 0.56081081, 0.56081081, 0.56060606]), 'train_precision_macro': array([0.96317047, 0.28040541, 0.96801789, 0.96306765, 0.96457982,
       0.28040541, 1.        , 0.28040541, 0.28040541, 0.28030303]), 'test_recall_macro': array([0.54100652, 0.5       , 0.45153774, 0.46738117, 0.45992544,
       0.5       , 0.5027959 , 0.5       , 0.5       , 0.5       ]), 'test_f1': array([0.64197531, 0.7184466 , 0.41269841, 0.50704225, 0.53333333,
       0.7184466 , 0.63529412, 0.7184466 , 0.7184466 , 0.72      ]), 'test_recall_micro': array([0.56060606, 0.56060606, 0.43939394, 0.46969697, 0.46969697,
       0.56060606, 0.53030303, 0.56060606, 0.56060606, 0.5625    ]), 'train_f1_macro': array([0.96036964, 0.35930736, 0.96738188, 0.95854342, 0.96394839,
       0.35930736, 1.        , 0.35930736, 0.35930736, 0.3592233 ]), 'test_f1_macro': array([0.53667393, 0.3592233 , 0.43823326, 0.

In [38]:
# Best score found by the fitted search, then the winning value of every
# tuned parameter in alphabetical order.
print(gs_clf.best_score_)
best_setting = gs_clf.best_params_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_setting[param_name]))

0.5775356302753563
clf__C: 10.0
vect__ngram_range: (1, 2)


In [39]:
# Print the fold-averaged macro and micro test precision / recall / F1 of the
# Bigrams + Frequencies cross-validation run.
printScores(scores4)

Macro
average precision is 0.382881
average recall is 0.492265
average f1 is 0.418595
Micro
average precision is 0.527462
average recall is 0.527462
average f1 is 0.527462


### Averaging Results

In [40]:
from statistics import mean

# Ten hand-entered per-fold test scores per metric.
# NOTE(review): the run that produced these numbers is not shown in this
# notebook -- TODO record where they came from.

# Micro-averaged precision, recall and F1 use the same ten values.
micro_folds = [0.48148148, 0.48148148, 0.51851852, 0.54074074, 0.56296296,
               0.49253731, 0.56716418, 0.52238806, 0.59701493, 0.58208955]

test_precision_micro = mean(micro_folds)
test_recall_micro = mean(micro_folds)
test_f1_micro = mean(micro_folds)

test_precision_macro = mean([
    0.48070919, 0.48094816, 0.51837588, 0.54644525, 0.56333333,
    0.49247698, 0.56740443, 0.52457814, 0.61754386, 0.59664306,
])
test_recall_macro = mean([
    0.48101405, 0.48112379, 0.51832748, 0.53928885, 0.56255487,
    0.49253731, 0.56716418, 0.52238806, 0.59701493, 0.58208955,
])
test_f1_macro = mean([
    0.47916667, 0.48008363, 0.51809545, 0.52160494, 0.56142283,
    0.49151786, 0.56677815, 0.51150604, 0.57861635, 0.56574074,
])

# One (heading, precision, recall, f1) row per averaging scheme.
summary_rows = (
    ("Macro", test_precision_macro, test_recall_macro, test_f1_macro),
    ("Micro", test_precision_micro, test_recall_micro, test_f1_micro),
)
for heading, avg_precision, avg_recall, avg_f1 in summary_rows:
    print(heading)
    print("average precision is %f" % avg_precision)
    print("average recall is %f" % avg_recall)
    print("average f1 is %f" % avg_f1)

Macro
average precision is 0.538846
average recall is 0.534350
average f1 is 0.527453
Micro
average precision is 0.534638
average recall is 0.534638
average f1 is 0.534638
