# Set up

In [1]:
import pandas as pd
import numpy as np
import json
import pickle

# Load training set

In [2]:
# Load the training DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
train_set = pd.read_pickle('train_set.pkl')
# Bare expression so the notebook renders the frame as this cell's output.
train_set

Unnamed: 0,claim,claimant,date,label,related_articles,id
9389,While arguing over President Reagan’s 1981 tax...,Sarah Sanders,2017-10-31,1,"[34218, 55700, 18736, 39031, 34219, 34220]",10354
1861,"Recently Rick Scott ""closed 30 women’s health ...",Lois Frankel,2014-09-12,0,"[73190, 76997, 38841, 77415, 77303, 9280, 8332...",2053
11035,Says Target installed urinals in a women’s bat...,Facebook posts,2016-04-22,0,"[9619, 22197]",12160
12221,"Says ""combined doses of vaccines"" have never b...",Facebook posts,2019-04-15,0,"[57163, 31528, 40908, 31536, 68904, 44601]",13458
11354,: The AMBER Alert system has been discontinu...,,2013-10-13,0,"[103978, 121475, 121849]",12504
...,...,...,...,...,...,...
2910,Health insurance costs for Floridians are up 3...,Republican Party of Florida,2014-09-23,1,"[9581, 89571, 7836, 7945, 7949, 77360, 83491, ...",3208
6096,"A photograph captures Harriet Tubman as a ""Gun...",,2019-03-25,0,"[125108, 125968, 126005]",6701
10446,"ISIS leader Abu Bakr al Baghdadi was ""released...",Jeanine Pirro,2014-06-14,0,"[80115, 93998, 5968, 175, 91475, 8710, 89881, ...",11514
5414,"""The board of a nonprofit organization on whic...",Tennessee Republican Party,2008-02-25,1,"[96453, 71123, 61, 69968, 96477]",5966


# Load test data

In [3]:
# Load the test DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
test_set = pd.read_pickle('test_set.pkl')

# Train model

In [4]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

Source: https://www.learndatasci.com/tutorials/predicting-reddit-news-sentiment-naive-bayes-text-classifiers/

In [5]:
# The raw claim text is the only feature and `label` is the target, for both splits.
X_train, y_train = train_set['claim'], train_set['label']
X_test, y_test = test_set['claim'], test_set['label']

In [6]:
# Create pipeline for the model: CountVectorizer turns each claim into token counts,
# which are fed to a multinomial Naive Bayes classifier. Both steps use their defaults.
model = make_pipeline(CountVectorizer(), MultinomialNB())

In [7]:
# Train the model: fits the vectorizer and then the classifier on the training
# claims and labels. The fitted pipeline is returned and shown as the cell output.
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [8]:
# Use model to make predictions: the pipeline vectorizes the raw test claims
# and classifies them in one call.
y_pred = model.predict(X_test)

In [9]:
# Check accuracy of model: fraction of test claims whose predicted label
# matches the true label (displayed as the cell output).
accuracy_score(y_test, y_pred)

0.6039858566377371

In [10]:
# Summarise the current predictions: accuracy and macro-averaged F1 (both scaled
# to 0-100) followed by the raw confusion matrix.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
macro_f1_pct = f1_score(y_test, y_pred, average='macro') * 100

print("Accuracy: {:.2f}%".format(accuracy_pct))
print("\nF1 Score: {:.2f}".format(macro_f1_pct))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 60.40%

F1 Score: 44.18

Confusion Matrix:
 [[923 522   9]
 [377 947  10]
 [121 193   9]]


In [11]:
# Class distribution of the training labels (value_counts orders by frequency).
counts = train_set['label'].value_counts()
print(counts)

0    5954
1    5117
2    1373
Name: label, dtype: int64


In [12]:
# Majority-style baseline: share of training labels equal to 0, i.e. what an
# "always predict 0" classifier would score on the training set (counts is built
# from train_set, not test_set).
print("\nPredicting only 0 = {:.2f}% accuracy".format(counts[0] / sum(counts) * 100))


Predicting only 0 = 47.85% accuracy


In [13]:
# Baseline: share of training labels equal to 1, i.e. what an "always predict 1"
# classifier would score on the training set (counts is built from train_set,
# not test_set).
print("\nPredicting only 1 = {:.2f}% accuracy".format(counts[1] / sum(counts) * 100))


Predicting only 1 = 41.12% accuracy


In [14]:
# Baseline: share of training labels equal to 2, i.e. what an "always predict 2"
# classifier would score on the training set (counts is built from train_set,
# not test_set).
print("\nPredicting only 2 = {:.2f}% accuracy".format(counts[2] / sum(counts) * 100))


Predicting only 2 = 11.03% accuracy


In [15]:
# Standalone vectorizer (separate from the pipeline) with default settings.
vect = CountVectorizer()

# Learn the vocabulary from the training claims and build their token-count matrix.
X_train_vect = vect.fit_transform(X_train)

In [16]:
# Pair every vocabulary token with its count in the second training claim (row 1).
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement
# `get_feature_names_out()` and fall back only on versions that predate it.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
list(zip(X_train_vect[1].toarray()[0], feature_names))

[(0, '00'),
 (0, '000'),
 (0, '0000188'),
 (0, '000mw'),
 (0, '001'),
 (0, '004'),
 (0, '008'),
 (0, '01'),
 (0, '014'),
 (0, '018'),
 (0, '01c1537833bf99'),
 (0, '02'),
 (0, '029'),
 (0, '03'),
 (0, '033'),
 (0, '04'),
 (0, '042'),
 (0, '05'),
 (0, '050'),
 (0, '053'),
 (0, '054th'),
 (0, '06'),
 (0, '0607'),
 (0, '06072016_outstanding'),
 (0, '063'),
 (0, '07'),
 (0, '08'),
 (0, '084'),
 (0, '09'),
 (0, '0900'),
 (0, '095'),
 (0, '0cx1cqs9x8'),
 (0, '0hour'),
 (0, '0whllsofrn'),
 (0, '10'),
 (0, '100'),
 (0, '1000'),
 (0, '100011345193387'),
 (0, '10010'),
 (0, '1004'),
 (0, '100k'),
 (0, '100s'),
 (0, '100th'),
 (0, '101'),
 (0, '101st'),
 (0, '102'),
 (0, '10205607353878502'),
 (0, '1024'),
 (0, '103'),
 (0, '104'),
 (0, '1040'),
 (0, '105'),
 (0, '106'),
 (0, '107'),
 (0, '1070'),
 (0, '1073741828'),
 (0, '108'),
 (0, '108th'),
 (0, '109'),
 (0, '1092990525'),
 (0, '10990'),
 (0, '10995'),
 (0, '10997'),
 (0, '10998'),
 (0, '109th'),
 (0, '10am'),
 (0, '10s'),
 (0, '10sad'),
 (0, 

In [17]:
from imblearn.over_sampling import SMOTE

# Oversample the vectorized training data with SMOTE using its default settings.
# NOTE(review): SMOTE is left unseeded, so the synthetic samples differ between runs.
sm = SMOTE()

# `fit_resample` replaces the deprecated `fit_sample` alias, which newer
# imbalanced-learn releases no longer provide.
X_train_res, y_train_res = sm.fit_resample(X_train_vect, y_train)

In [18]:
# Per-class sample counts after oversampling, printed as (label, count) pairs.
# Note: this rebinds `counts`, which previously held the value_counts() Series.
unique, counts = np.unique(y_train_res, return_counts=True)
print([(label, n_samples) for label, n_samples in zip(unique, counts)])

[(0, 5954), (1, 5954), (2, 5954)]


In [19]:
# Fit a fresh multinomial Naive Bayes on the oversampled training matrix
# (fit() returns the estimator itself, so the call can be chained).
nb = MultinomialNB().fit(X_train_res, y_train_res)

# Accuracy on the same resampled data the model was trained on.
nb.score(X_train_res, y_train_res)

0.7272421901242861

In [20]:
# Encode the test claims with the vocabulary already learned by `vect`
# (transform only, no refit).
X_test_vect = vect.transform(X_test)

# Predict with the standalone Naive Bayes model; this overwrites the earlier `y_pred`.
y_pred = nb.predict(X_test_vect)

In [21]:
# Summarise the current predictions: accuracy and macro-averaged F1 (both scaled
# to 0-100) followed by the raw confusion matrix.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
macro_f1_pct = f1_score(y_test, y_pred, average='macro') * 100

print("Accuracy: {:.2f}%".format(accuracy_pct))
print("\nF1 Score: {:.2f}".format(macro_f1_pct))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 58.44%

F1 Score: 48.25

Confusion Matrix:
 [[839 509 106]
 [341 922  71]
 [ 94 172  57]]


In [22]:
from sklearn.model_selection import ShuffleSplit

# Repeated random 80/20 splits of the training set, with SMOTE applied to the
# training portion of each split before fitting Naive Bayes.
vect = CountVectorizer()
nb = MultinomialNB()

X = train_set.claim
y = train_set.label

# NOTE(review): neither the splitter nor SMOTE is seeded, so the reported
# numbers change on every run.
ss = ShuffleSplit(n_splits=10, test_size=0.2)
sm = SMOTE()

# One entry per split.
accs = []
f1s = []
cms = []

# NOTE: the loop rebinds X_train / X_test / y_train / y_test (and the derived
# matrices and y_pred), so after this cell they refer to the last split of the
# training set rather than to the original train/test data.
for train_index, test_index in ss.split(X):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)

    # Oversample the training portion only. `fit_resample` replaces the
    # deprecated `fit_sample` alias that newer imbalanced-learn releases dropped.
    X_train_res, y_train_res = sm.fit_resample(X_train_vect, y_train)

    # Fit Naive Bayes on the vectorized X with y train labels,
    # then predict new y labels using X test
    nb.fit(X_train_res, y_train_res)
    y_pred = nb.predict(X_test_vect)

    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    cms.append(confusion_matrix(y_test, y_pred))

# Averages over all splits; the confusion matrix is averaged element-wise.
print("\nAverage accuracy across folds: {:.2f}%".format(np.mean(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}".format(np.mean(f1s) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(np.mean(cms, axis=0)))


Average accuracy across folds: 57.31%

Average F1 score across folds: 46.21

Average Confusion Matrix across folds: 
 [[680.6 420.5  89.2]
 [254.2 710.2  64.9]
 [ 90.1 143.6  35.7]]


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two stacked bar charts sharing the x axis: per-run accuracy on top,
# per-run macro F1 underneath.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(16, 9))

# Scale both metrics to 0-100 and round them for the bar labels.
acc_scores = [round(acc_value * 100, 1) for acc_value in accs]
f1_scores = [round(f1_value * 100, 2) for f1_value in f1s]

# One bar position per run.
x1 = np.arange(len(acc_scores))
x2 = np.arange(len(f1_scores))

ax1.bar(x1, acc_scores)
ax2.bar(x2, f1_scores, color='#559ebf')

# Place values on top of bars (shifted slightly left and 2 units above the bar).
for run_idx, (acc_label, f1_label) in enumerate(zip(acc_scores, f1_scores)):
    ax1.text(run_idx - 0.25, acc_label + 2, str(acc_label) + '%')
    ax2.text(run_idx - 0.25, f1_label + 2, str(f1_label))

ax1.set(ylabel='Accuracy (%)', title='Naive Bayes', ylim=[0, 100])
ax2.set(ylabel='F1 Score', xlabel='Runs', ylim=[0, 100])

# Strip the axes spines (bottom and left in addition to seaborn's defaults)
# for a cleaner presentation.
sns.despine(bottom=True, left=True)

plt.show()

<Figure size 1600x900 with 2 Axes>

In [24]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Compare several classifiers on the same repeated random 80/20 splits of the
# training set, oversampling the training portion of every split with SMOTE.
X = train_set.claim
y = train_set.label

# NOTE(review): the splitter, SMOTE and the stochastic estimators are all
# unseeded, so the reported numbers change on every run.
cv = ShuffleSplit(n_splits=20, test_size=0.2)

models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    RandomForestClassifier(),
    MLPClassifier()
]

# Build the vectorizer here instead of reusing whatever an earlier cell left
# in `vect`, so this cell does not depend on that hidden state.
vect = CountVectorizer()
sm = SMOTE()

# Init a dictionary for storing results of each run for each model,
# keyed by the estimator's class name.
results = {
    type(clf).__name__: {
        'accuracy': [],
        'f1_score': [],
        'confusion_matrix': []
    } for clf in models
}

# NOTE: the loop rebinds X_train / X_test / y_train / y_test (and the derived
# matrices and y_pred), so afterwards they refer to the last split.
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)

    # `fit_resample` replaces the deprecated `fit_sample` alias that newer
    # imbalanced-learn releases dropped.
    X_train_res, y_train_res = sm.fit_resample(X_train_vect, y_train)

    # The loop variable is `clf`, not `model`, so the fitted pipeline stored in
    # the notebook-level `model` is not overwritten by the last estimator.
    for clf in models:
        clf.fit(X_train_res, y_train_res)
        y_pred = clf.predict(X_test_vect)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        cm = confusion_matrix(y_test, y_pred)

        clf_results = results[type(clf).__name__]
        clf_results['accuracy'].append(acc)
        clf_results['f1_score'].append(f1)
        clf_results['confusion_matrix'].append(cm)





In [25]:
# Print, for every classifier, its metrics averaged over all recorded runs.
# The loop variable is `model_name` rather than `model` so the notebook-level
# `model` is not rebound to a plain string.
for model_name, d in results.items():
    avg_acc = sum(d['accuracy']) / len(d['accuracy']) * 100
    avg_f1 = sum(d['f1_score']) / len(d['f1_score']) * 100
    # Element-wise mean of the per-run confusion matrices.
    avg_cm = sum(d['confusion_matrix']) / len(d['confusion_matrix'])

    # Horizontal rule printed under the model name.
    rule = '-' * 30

    s = f"""{model_name}\n{rule}
        Avg. Accuracy: {avg_acc:.2f}%
        Avg. F1 Score: {avg_f1:.2f}
        Avg. Confusion Matrix: 
        \n{avg_cm}
        """
    print(s)

MultinomialNB
------------------------------
        Avg. Accuracy: 57.34%
        Avg. F1 Score: 46.57
        Avg. Confusion Matrix: 
        
[[686.5  419.45  86.25]
 [259.4  702.    59.5 ]
 [ 88.4  148.9   38.6 ]]
        
BernoulliNB
------------------------------
        Avg. Accuracy: 47.89%
        Avg. F1 Score: 42.40
        Avg. Confusion Matrix: 
        
[[483.8  362.3  346.1 ]
 [196.15 622.2  202.55]
 [ 63.1  126.9   85.9 ]]
        
LogisticRegression
------------------------------
        Avg. Accuracy: 52.49%
        Avg. F1 Score: 44.28
        Avg. Confusion Matrix: 
        
[[688.75 340.6  162.85]
 [327.15 560.25 133.5 ]
 [ 94.   124.4   57.5 ]]
        
SGDClassifier
------------------------------
        Avg. Accuracy: 51.21%
        Avg. F1 Score: 43.28
        Avg. Confusion Matrix: 
        
[[680.9  344.2  167.1 ]
 [338.35 535.85 146.7 ]
 [ 94.1  123.85  57.95]]
        
LinearSVC
------------------------------
        Avg. Accuracy: 50.02%
        Avg. F1 Sc