# Naive Bayes

In [5]:
# imports
from sklearn.model_selection import train_test_split as split

from src.data_util import load_data
from src.naive_bayes import NaiveBayesClassifier

import numpy as np

In [4]:
# Load the headline dataset (path is relative to the notebook's directory).
headlines = load_data("../data/dataset.conllu")

# Split 70% / 15% / 15% into training, validation, and test sets: first hold
# out 30% of the headlines, then halve that hold-out. The fixed seed keeps the
# partition identical between runs.
SEED = 42
train_headlines, other_headlines = split(headlines, test_size=0.3, random_state=SEED)
val_headlines, test_headlines = split(other_headlines, test_size=0.5, random_state=SEED)

# Adjacent string literals are concatenated at compile time, so the message can
# span several source lines without the indentation leaking into the output
# (a backslash continuation inside the literal would keep that whitespace).
print(
    "Number of headlines for training, validation, "
    f"and test is {len(train_headlines)}, {len(val_headlines)}, "
    f"and {len(test_headlines)} resp."
)

Number of headlines for training, validation,         and test is 20033, 4293,         and 4293 resp.


In [5]:
# Fit the Naive Bayes baseline on the training headlines using 1- to 3-grams.
naive_bayes = NaiveBayesClassifier(ngram_range=(1, 3))
naive_bayes.fit(train_headlines)

# Evaluate on the training set first. The results get their own names so they
# are not silently overwritten by the test-set evaluation below.
# NOTE(review): test() is defined in src.naive_bayes; the two returned values
# are presumably the false positives / false negatives — confirm there.
train_fp, train_fn = naive_bayes.test(train_headlines)

# Evaluate on the held-out test set; `fp` and `fn` still hold the results of
# the test-set run, exactly as before.
# NOTE(review): val_headlines is not used in the cells shown here — consider
# using it for model selection instead of the test set.
fp, fn = naive_bayes.test(test_headlines)

100%|██████████| 20033/20033 [04:37<00:00, 72.17it/s]
100%|██████████| 20033/20033 [01:27<00:00, 228.81it/s]


               precision    recall  f1-score   support

Non-sarcastic       1.00      1.00      1.00     10530
    Sarcastic       1.00      1.00      1.00      9503

     accuracy                           1.00     20033
    macro avg       1.00      1.00      1.00     20033
 weighted avg       1.00      1.00      1.00     20033



100%|██████████| 4293/4293 [00:19<00:00, 224.42it/s]

               precision    recall  f1-score   support

Non-sarcastic       0.84      0.89      0.86      2237
    Sarcastic       0.87      0.81      0.84      2056

     accuracy                           0.85      4293
    macro avg       0.85      0.85      0.85      4293
 weighted avg       0.85      0.85      0.85      4293






In [6]:
# Load the additional tweet dataset (path is relative to the notebook's
# directory). The cells below split it by class and balance the two classes.
tweets = load_data("../data/tweets.conllu")

In [7]:
# Partition the tweets in a single pass, keyed on the 'class' entry in the
# metadata of each tweet's first element: "1" goes to the sarcastic list,
# "0" to the non-sarcastic list; any other label is left out of both.
tweets_sarcastic = []
tweets_non_sarcastic = []
for tweet in tweets:
    label = tweet[0].metadata['class']
    if label == "1":
        tweets_sarcastic.append(tweet)
    elif label == "0":
        tweets_non_sarcastic.append(tweet)

In [17]:
# Number of tweets labelled class "1" (collected in tweets_sarcastic).
len(tweets_sarcastic)

867

In [18]:
# Number of tweets labelled class "0" (collected in tweets_non_sarcastic).
len(tweets_non_sarcastic)

2601

In [15]:
# Inspect the first element of the first tweet to see its tokens and the
# metadata fields (text, class, ...) attached to it.
tweets[0][0]

TokenList<The, only, thing, I, got, from, college, is, a, caffeine, addiction, metadata={text: "The only thing I got from college is a caffeine addiction", headline_id: "1", sent_id: "0", class: "1", link: "a"}>

In [8]:
# Holds the down-sampled non-sarcastic tweets; filled by the sampling cell below.
tweets_non_sarcastic_sample = []

In [9]:
# Down-sample the non-sarcastic tweets to the size of the sarcastic set so both
# classes are equally represented in the evaluation data.
# A generator seeded with SEED makes the sample reproducible across kernel
# restarts, and the list is rebuilt here (not appended to) so the cell is
# idempotent: re-running it no longer grows the sample with extra tweets.
rng = np.random.default_rng(SEED)
sampled_indices = rng.choice(len(tweets_non_sarcastic), size=len(tweets_sarcastic), replace=False)
tweets_non_sarcastic_sample = [tweets_non_sarcastic[idx] for idx in sampled_indices]

In [10]:
# Balanced evaluation set: every sarcastic tweet followed by the sampled
# non-sarcastic ones, combined into a new list.
sampled_tweets = [*tweets_sarcastic, *tweets_non_sarcastic_sample]

In [28]:
# Size of the balanced tweet set (sarcastic plus sampled non-sarcastic tweets).
len(sampled_tweets)

1734

In [29]:
# Evaluate the classifier trained on headlines on the balanced tweet sample,
# i.e. on data from a different source than the training set.
# Note: this rebinds `fp` and `fn`, replacing the values from the earlier
# headline evaluation.
fp, fn = naive_bayes.test(sampled_tweets)

100%|██████████| 1734/1734 [00:10<00:00, 172.94it/s]


               precision    recall  f1-score   support

Non-sarcastic       0.50      0.86      0.63       867
    Sarcastic       0.52      0.15      0.23       867

     accuracy                           0.50      1734
    macro avg       0.51      0.50      0.43      1734
 weighted avg       0.51      0.50      0.43      1734



The Naive Bayes baseline achieved satisfactory performance on the news headlines; however, on the new tweet dataset the metrics are significantly worse (accuracy of about 0.50 on the balanced sample, i.e. chance level). This might suggest that the model is not actually learning to detect sarcasm, but rather to distinguish the Onion writing style from the HuffPost writing style.