In [1]:
# imports
from src.data_util import load_data
from src.naive_bayes import NaiveBayesClassifier

In [2]:
# load the data
headlines = load_data("../data/dataset.conllu")

# split into training and test sets: the first TRAIN_SIZE headlines are used
# for training and everything after them for testing (no shuffling happens
# in this cell). Naming the split index once keeps both slices in sync.
TRAIN_SIZE = 25000
train_headlines, test_headlines = headlines[:TRAIN_SIZE], headlines[TRAIN_SIZE:]
print(len(train_headlines))
print(len(test_headlines))

25000
3619


In [None]:
# Fit the Naive Bayes bag-of-words model to the training data.
# NOTE(review): the (1, 1) argument is presumably an n-gram range (unigrams
# only) -- confirm against NaiveBayesClassifier in src/naive_bayes.
nb = NaiveBayesClassifier((1, 1))
nb.fit(train_headlines)

100%|██████████| 25000/25000 [00:13<00:00, 1842.46it/s]


In [12]:
# Evaluate on the training data itself (the same headlines the model was
# fitted on). The returned pair is unpacked into `fp` and `fn` -- presumably
# false positives and false negatives; both names are reassigned by the
# test-set evaluation that follows.
fp, fn = nb.test(train_headlines)

100%|██████████| 25000/25000 [00:05<00:00, 4649.64it/s]

               precision    recall  f1-score   support

Non-sarcastic       0.92      0.93      0.92     13089
    Sarcastic       0.92      0.91      0.91     11911

     accuracy                           0.92     25000
    macro avg       0.92      0.92      0.92     25000
 weighted avg       0.92      0.92      0.92     25000






In [13]:
# Evaluate on the held-out test data and keep the misclassified headlines:
# `fp` holds the false positives and `fn` the false negatives, which the
# following cell prints for a manual error analysis.
fp, fn = nb.test(test_headlines)

100%|██████████| 3619/3619 [00:00<00:00, 4466.26it/s]


               precision    recall  f1-score   support

Non-sarcastic       0.84      0.88      0.86      1896
    Sarcastic       0.86      0.82      0.84      1723

     accuracy                           0.85      3619
    macro avg       0.85      0.85      0.85      3619
 weighted avg       0.85      0.85      0.85      3619



In [14]:
# Show the raw text of the first N misclassified headlines of each kind.
N = 10
error_groups = (
    ("\n--- some false negatives ---", fn),
    ("\n--- some false positives ---", fp),
)
for banner, misclassified in error_groups:
    print(banner)
    for headline in misclassified[:N]:
        # the original text is stored in the metadata of the first sentence
        print(headline[0].metadata["text"])


--- some false negatives ---
new 'cut off your genitals' challenge gains popularity among teens online
breaking: we might be doing a bad job
well known gresham, or musicians form gresham, or supergroup
federal judge pencils blocking trump's unconstitutional executive orders into monthly schedule
sight of 400 war elephants on horizon marks hillary clinton's arrival in swing state
afro-disney plans scrapped
donald trump jr. divorce leaves confused, heartbroken nation wondering why bad things happen to good people
new honda commercial openly says your kids will die in a car crash if you buy a different brand
custom fireplace store totally jumps gentrification gun
diorama of rome built in a day

--- some false positives ---
gutters and castles
bts proves k-pop's power with spot on time magazine's most influential list
meryl streep and tom hanks have too much fun playing each other's characters
video shows e-cigarette suddenly explode in new jersey woman's handbag
john kerry attempts to bu

In [153]:
def show_word_weights(headline, threshold=1):
    """Print a per-token table of the model's weights for one headline.

    One row is printed for every token of every sentence in ``headline``:
    the token's surface form, the weight read from row 1 of
    ``bow.feature_log_prob_`` (column "sarcastic"), the weight read from
    row 0 (column "non-sarcastic") and their difference (row 1 - row 0).
    Tokens whose absolute difference is at least ``threshold`` are marked
    with a leading ``*``.

    This function relies on three notebook-level names that must exist
    before it is called: ``vocabulary`` (membership test on lemmas),
    ``vectorizer`` (maps a lemma to its feature column) and ``bow`` (the
    fitted model exposing ``feature_log_prob_``).
    NOTE(review): ``bow`` / ``vectorizer`` are presumably a scikit-learn
    naive Bayes model and its vectorizer, and rows 0 / 1 presumably map to
    the non-sarcastic / sarcastic labels -- confirm where they are fitted.

    Args:
        headline: iterable of sentences, each an iterable of tokens that
            support ``token["lemma"]`` and ``token["form"]``.
        threshold: minimum absolute weight difference for a token to be
            highlighted with ``*``. Defaults to 1.

    Returns:
        None. The table is written to standard output.
    """
    print(('{:>14}'*4).format('word', 'sarcastic', 'non-sarcastic', 'diff'))
    print('='*56)
    for sentence in headline:
        for token in sentence:
            lemma = token["lemma"]
            if lemma in vocabulary:
                # Resolve the lemma's feature column once and reuse it for
                # both class rows instead of vectorizing the lemma twice.
                column = vectorizer.transform([[lemma]]).nonzero()[1][0]
                neg_weight = bow.feature_log_prob_[0][column]
                pos_weight = bow.feature_log_prob_[1][column]
                diff = pos_weight - neg_weight
            else:
                # Lemma unknown to the model: print placeholder weights
                # (-1 is a marker, not a real log-probability) and a zero
                # difference so the token is never highlighted.
                pos_weight, neg_weight, diff = -1, -1, 0
            p_token = token["form"] if abs(diff) < threshold else f"*{token['form']}"
            print(f'{p_token:>14}{pos_weight:>14.2f}{neg_weight:>14.2f}{diff:>14.2f}')
    print()

In [155]:
# Inspect the per-word weights of one false positive and one false negative.
# Distinct names are used for the selected headlines so that the `fp` / `fn`
# lists returned by `nb.test` are not overwritten with a single headline
# (which would break re-running the cell that prints `fn[:N]` / `fp[:N]`).
# NOTE(review): `false_positives` and `false_negatives` are not defined in any
# visible cell -- they must be created before this cell can run on a fresh
# kernel.
fp_example = false_positives[8]
show_word_weights(fp_example)
fn_example = false_negatives[6]
show_word_weights(fn_example)

          word     sarcastic non-sarcastic          diff
         *mitt        -10.76        -11.88          1.12
       *romney         -9.40        -11.29          1.90
             :         -1.00         -1.00          0.00
             '         -7.24         -6.71         -0.53
           *we         -9.07         -7.57         -1.50
           've         -6.78         -7.14          0.35
           got         -7.33         -7.50          0.17
            ta         -5.09         -5.31          0.23
       rethink        -12.09        -12.11          0.02
      campaign         -8.75         -9.03          0.27
       finance        -11.86        -11.55         -0.32
             '         -7.24         -6.71         -0.53

          word     sarcastic non-sarcastic          diff
        *sight        -10.83        -11.88          1.05
            of         -5.36         -5.82          0.46
           400        -10.91        -11.88          0.97
           war         -9.03  

We can see that the word "trump" (and also "donald") has a much larger weight for the non-sarcastic label.
This is consistent with our analysis of the most common lemmas in milestone 1: the word "trump" is far more frequent in non-sarcastic headlines.
As a result, the Bag of Words model has a hard time detecting sarcastic headlines that contain "trump".