<a href="https://colab.research.google.com/github/bradleymclellan/stc510/blob/main/Exploratory_Data_Analysis_Proficient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import math
import random
from collections import defaultdict
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE

# Prevent future/deprecation warnings from showing in output
import warnings
warnings.filterwarnings(action='ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Set global styles for plots
sns.set_style(style='white')
sns.set_context(context='notebook', font_scale=1.3, rc={'figure.figsize': (16,9)})

In [2]:
# Read in data
df = pd.read_csv('reddit_ai_headlines.csv', encoding='utf-8')
df.head()

Unnamed: 0,headline,label
0,Double Suncycle Miracle: Watch A Beautiful H.r...,1
1,A.I. Rick & Morty Analyzing Real-Time A.I. Gen...,0
2,StoryRobot - A Twitch.tv livestream where chat...,0
3,A.I. Fighter Jets show an Autonomous Military ...,0
4,I made an AI chatbot app! Come check it out :)...,1


In [3]:
# Remove the neutral (0) headlines labels to focus on only classifying positive or negative
df = df[df.label != 0]
df.label.value_counts()

 1    345
-1    104
Name: label, dtype: int64

In [None]:
# Transform the headlines of words into numbers and apply vectorization
vect = CountVectorizer(max_features=1000, binary=True)
X = vect.fit_transform(df.headline)

X.toarray()

In [5]:
# Split into train and test sets
X = df.headline
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# Fit the vectorizer to the training data
vect = CountVectorizer(max_features=1000, binary=True)

X_train_vect = vect.fit_transform(X_train)

In [None]:
counts = df.label.value_counts()
print(counts)

print("\nPredicting only -1 = {:.2f}% accuracy".format(counts[-1] / sum(counts) * 100))

In [8]:
#Balance the training data
oversample = SMOTE()

X_train_res, y_train_res = oversample.fit_resample(X_train_vect, y_train)

In [None]:
unique, counts = np.unique(y_train_res, return_counts=True)
print(list(zip(unique, counts)))

In [None]:
#Instantiate a Multinomial Naive Bayes classifier
nb = MultinomialNB()

nb.fit(X_train_res, y_train_res)

nb.score(X_train_res, y_train_res)

In [None]:
# Vectorize the test data
X_test_vect = vect.transform(X_test)

y_pred = nb.predict(X_test_vect)

y_pred

In [None]:
# Calculate the accuracy of the classifier
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred) * 100))
print("\nCOnfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Cross Validation
X = df.headline
y = df.label

ss = ShuffleSplit(n_splits=10, test_size=0.2)
oversample = SMOTE()

accs = []
f1s = []
cms = []

for train_index, test_index in ss.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)
    
    # Oversample
    X_train_res, y_train_res = oversample.fit_resample(X_train_vect, y_train)
    
    # Fit Naive Bayes on the vectorized X with y train labels, 
    # then predict new y labels using X test
    nb.fit(X_train_res, y_train_res)
    y_pred = nb.predict(X_test_vect)
    
    # Determine test set accuracy and f1 score on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    
print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))

In [None]:
# Plot the confusion matrix
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(16,9))

acc_scores = [round(a * 100, 1) for a in accs]
f1_scores = [round(f * 100, 2) for f in f1s]

x1 = np.arange(len(acc_scores))
x2 = np.arange(len(f1_scores))

ax1.bar(x1, acc_scores)
ax2.bar(x2, f1_scores, color='#559ebf')

# Place values on top of bars
for i, v in enumerate(list(zip(acc_scores, f1_scores))):
    ax1.text(i - 0.25, v[0] + 2, str(v[0]) + '%')
    ax2.text(i - 0.25, v[1] + 2, str(v[1]))

ax1.set_ylabel('Accuracy (%)')
ax1.set_title('Naive Bayes')
ax1.set_ylim([0, 100])

ax2.set_ylabel('F1 Score')
ax2.set_xlabel('Runs')
ax2.set_ylim([0, 100])

sns.despine(bottom=True, left=True)  # Remove the ticks on axes for cleaner presentation

plt.show()

In [21]:
# Instantiate multiple models
X = df.headline
y = df.label

cv = ShuffleSplit(n_splits=20, test_size=0.2)

models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    RandomForestClassifier(),
    MLPClassifier()
]

oversample = SMOTE()

# Init a dictionary for storing results of each run for each model
results = {
    model.__class__.__name__: {
        'accuracy': [], 
        'f1_score': [],
        'confusion_matrix': []
    } for model in models
}

for train_index, test_index in cv.split(X):
    X_train, X_test  = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_train_vect = vect.fit_transform(X_train)    
    X_test_vect = vect.transform(X_test)
    
    X_train_res, y_train_res = oversample.fit_resample(X_train_vect, y_train)
    
    for model in models:
        model.fit(X_train_res, y_train_res)
        y_pred = model.predict(X_test_vect)
        
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        
        results[model.__class__.__name__]['accuracy'].append(acc)
        results[model.__class__.__name__]['f1_score'].append(f1)
        results[model.__class__.__name__]['confusion_matrix'].append(cm)

In [None]:
for model, d in results.items():
    avg_acc = sum(d['accuracy']) / len(d['accuracy']) * 100
    avg_f1 = sum(d['f1_score']) / len(d['f1_score']) * 100
    avg_cm = sum(d['confusion_matrix']) / len(d['confusion_matrix'])
    
    slashes = '-' * 30
    
    s = f"""{model}\n{slashes}
        Avg. Accuracy: {avg_acc:.2f}%
        Avg. F1 Score: {avg_f1:.2f}
        Avg. Confusion Matrix: 
        \n{avg_cm}
        """
    print(s)

In [None]:
fig, axs = plt.subplots(2, 4, figsize=(20,10))
axs = axs.ravel()

for i, model in enumerate(results.keys()):
    avg_acc = sum(results[model]['accuracy']) / len(results[model]['accuracy']) * 100
    avg_f1 = sum(results[model]['f1_score']) / len(results[model]['f1_score']) * 100
    avg_cm = sum(results[model]['confusion_matrix']) / len(results[model]['confusion_matrix'])

    # Plot the confusion matrix
    im = axs[i].imshow(avg_cm, interpolation='nearest', cmap=plt.cm.Blues)
    axs[i].figure.colorbar(im, ax=axs[i])

    # Add text to each subplot
    axs[i].set(xticks=np.arange(avg_cm.shape[1]),
           yticks=np.arange(avg_cm.shape[0]), 
           xticklabels=['Negative', 'Positive'], 
           yticklabels=['Negative', 'Positive'],
           title=model,
           ylabel='Actual label',
           xlabel='Predicted label')

    # Loop over data dimensions and create text annotations
    for row in range(avg_cm.shape[0]):
        for col in range(avg_cm.shape[1]):
            axs[i].text(col, row, str(int(avg_cm[row, col])),
                    ha="center", va="center", color="white" if avg_cm[row, col] > (avg_cm.max() / 2.) else "black")

# Set layout properties and show plot
plt.tight_layout()
plt.show()

In [25]:
X = df.headline
y = df.label

cv = ShuffleSplit(n_splits=10, test_size=0.2)

models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    RandomForestClassifier(),
    MLPClassifier()
]

m_names = [m.__class__.__name__ for m in models]

models = list(zip(m_names, models))
vc = VotingClassifier(estimators=models)

oversample = SMOTE()

# No need for dictionary now
accs = []
f1s = []
cms = []

for train_index, test_index in cv.split(X):
    X_train, X_test  = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_train_vect = vect.fit_transform(X_train)    
    X_test_vect = vect.transform(X_test)
    
    X_train_res, y_train_res = oversample.fit_resample(X_train_vect, y_train)
    
    vc.fit(X_train_res, y_train_res)
    
    y_pred = vc.predict(X_test_vect)
    
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))

In [None]:
print("Voting Classifier")
print("-" * 30)
print("Avg. Accuracy: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("Avg. F1 Score: {:.2f}".format(sum(f1s) / len(f1s) * 100))
print("Confusion Matrix:\n", sum(cms) / len(cms))

In [None]:
labels = ['Accuracy', 'F1 Score']
values = [sum(accs)/len(accs)*100, sum(f1s)/len(f1s)]

fig, ax = plt.subplots()
ax.bar(labels, values)

ax.set_ylim(0, 100)
ax.set_ylabel('Percentage')
ax.set_title('Performance of Voting Classifier')

plt.show()

print("Confusion Matrix:\n", sum(cms) / len(cms))