In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [2]:
pd.set_option('max_colwidth', 100)

In [3]:
# Load dataset
df =  pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
df.rename(columns={'headline': 'text'}, inplace=True)
df.head()

Unnamed: 0,is_sarcastic,text,article_link
0,1,thirtysomething scientists unveil doomsday clock of hair loss,https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205
1,0,"dem rep. totally nails why congress is falling short on gender, racial equality",https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html
3,1,inclement weather prevents liar from getting to work,https://local.theonion.com/inclement-weather-prevents-liar-from-getting-to-work-1819576031
4,1,mother comes pretty close to using word 'streaming' correctly,https://www.theonion.com/mother-comes-pretty-close-to-using-word-streaming-cor-1819575546


In [4]:
df.shape

(28619, 3)

In [5]:
df['is_sarcastic'].value_counts()

0    14985
1    13634
Name: is_sarcastic, dtype: int64

In [6]:
df_unsacarstic = df[df['is_sarcastic']==0].head(10)
df_unsacarstic[['text']]

Unnamed: 0,text
1,"dem rep. totally nails why congress is falling short on gender, racial equality"
2,eat your veggies: 9 deliciously different recipes
5,my white inheritance
6,5 ways to file your taxes with less stress
9,lots of parents know this scenario
10,this lesbian is considered a father in indiana (and an amazing one at that)
11,amanda peet told her daughter sex is 'a special hug'
12,what to know regarding current treatments for ebola
13,chris christie suggests hillary clinton was to blame for boko haram's kidnapping of hundreds of ...
15,uber ceo travis kalanick stepping down from trump economic advisory council


In [7]:
df_sarcastic = df[df['is_sarcastic']==1].head(10)
df_sarcastic[['text']]

Unnamed: 0,text
0,thirtysomething scientists unveil doomsday clock of hair loss
3,inclement weather prevents liar from getting to work
4,mother comes pretty close to using word 'streaming' correctly
7,richard branson's global-warming donation nearly as much as cost of failed balloon trips
8,shadow government getting too large to meet in marriott conference room b
14,ford develops new suv that runs purely on gasoline
16,area boy enters jumping-and-touching-tops-of-doorways phase
17,area man does most of his traveling by gurney
21,guard in video game under strict orders to repeatedly pace same stretch of hallway
25,secret service agent not so secret about being david alan grier fan


In [8]:
# Feature endineering
df['text'] = df['text'].replace('!', ' exclamation ')
df['text'] = df['text'].replace('?', ' question ')
df['text'] = df['text'].replace('\'', ' quotation ')
df['text'] = df['text'].replace('\"', ' quotation ')

In [9]:
# Create a bag of words
count_vec = CountVectorizer()
bow = count_vec.fit_transform(df['text'])
bow = np.array(bow.todense())

In [10]:
X = bow
y = df['is_sarcastic']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    stratify=y)

In [13]:
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [14]:
# Checking accuracy of our model
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average="macro"))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

Accuracy: 0.8490566037735849
F1 score: 0.8484836155777133
ROC AUC: 0.8480449342637629


In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4496
           1       0.85      0.83      0.84      4090

    accuracy                           0.85      8586
   macro avg       0.85      0.85      0.85      8586
weighted avg       0.85      0.85      0.85      8586



Now we can check where our model went wrong

In [16]:
results = pd.DataFrame(data={'predicted': y_pred, 'actual': y_test})
predictions = results.join(df)

In [17]:
def is_correct(predicted, actual):
    if predicted == actual:
        return True
    else:
        return False

In [18]:
predictions['correct'] = predictions.apply(lambda x: is_correct(x.predicted, x.actual), axis=1)
predictions = predictions[['text','predicted','actual','correct']]

In [19]:
predictions[predictions['correct']==False].sample(10)

Unnamed: 0,text,predicted,actual,correct
22329,florida passes strict ban on being unarmed,0,1,False
11147,new epa chief proposes 30% cut in all carbon-based organisms,0,1,False
20395,random online photo leads to navy veteran's rescue from flooded house,1,0,False
1743,"'syrians' lives are worthless,' obama tells daughters before kissing them goodnight",0,1,False
20499,the world's oldest living cat has died,1,0,False
3553,house conservatives introduce resolution to impale rod rosenstein,0,1,False
24039,light beer healthiest food option at stadium,0,1,False
5096,"12-year-old says she's desperate to stop the hitting, screaming, fighting with mom",1,0,False
4251,company encourages women who have been sexually harassed to come forward with resignation letter,0,1,False
4688,roy moore on pedophilia accusers: 'these women are only discrediting me now because shifting soc...,0,1,False


In [20]:
# Saving our model 
import pickle

In [21]:
pickle.dump(model, open('model.pkl','wb'))

In [22]:
pickle.dump(count_vec, open('cv_transform.pkl','wb'))