In [1]:
# Author: Akshya Kumar Shrestha
# Date: February 10, 2019
# Description: A Naive Bayes classifier is used to detect sarcasm in news headlines.

In [2]:
import pandas as pd

In [3]:
# Load the headlines dataset. `lines=True` tells pandas to parse the file as
# JSON Lines, i.e. one JSON record per line, rather than a single JSON document.
raw = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
# Preview the first three rows to check the columns that were loaded.
raw.head(3)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1


In [4]:
# Build the modelling frame as a NEW DataFrame derived from `raw`:
#   * drop `article_link`, which is not used as a feature below;
#   * drop rows containing missing values.
# `drop(...)` and `dropna()` both return new frames, so `raw` is left intact and
# this cell can be re-run safely. (Previously `df` was only an alias of `raw`,
# `pop` mutated it in place, and the result of `dropna()` was thrown away, so
# rows with missing values were never actually removed.)
df = raw.drop(columns=['article_link']).dropna()
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
from sklearn.model_selection import train_test_split

# Features are the headline text; the target is the binary sarcasm label.
X = df['headline']
y = df['is_sarcastic']

# Hold out 20% of the rows for evaluation. The fixed `random_state` makes the
# shuffle deterministic, so the split (and every metric computed from it) is
# reproducible across kernel restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer

# Tokens to ignore when counting words: NLTK's English stop-word list followed
# by every individual ASCII punctuation character.
# NOTE(review): CountVectorizer's default token pattern already discards
# punctuation, so the punctuation entries are probably redundant - confirm.
stop_words = [*stopwords.words('english'), *string.punctuation]

# Bag-of-words vectorizer: lower-cases the text and skips the stop words above.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    lowercase=True,
)

In [7]:
# Learn the vocabulary from the training headlines and convert them into a
# document-term count matrix in a single step.
# NOTE: `X_train` is rebound here from the raw text to the vectorized matrix, so
# this cell is not idempotent - re-run the train/test split cell before running
# it again, otherwise the already-vectorized matrix is fed back into the
# vectorizer.
X_train = vectorizer.fit_transform(X_train)

In [8]:
from sklearn import naive_bayes

# Train a multinomial Naive Bayes classifier (default hyper-parameters) on the
# vectorized training headlines. `fit` returns the estimator itself, so the
# construction and training can be chained.
model = naive_bayes.MultinomialNB().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
from sklearn.metrics import accuracy_score

# Vectorize the held-out headlines with the already-fitted vectorizer
# (`transform` only - the vocabulary is not refitted on the test data).
test_data = vectorizer.transform(X_test)
# Predict a sarcasm label for every held-out headline.
y_predict = model.predict(test_data)

In [10]:
# Fraction of held-out headlines whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.8113066267315612


In [31]:
# Hand-written sentences used as an informal smoke test of the trained model.
sample_data = [
    'today is sunday',
    'youre tall as a giant dwarf',
    'former versace store clerk sues over secret ',
    'youre very nice little pumpkin !',
    'you are chepal',
    'you are rozeen',
    'i am akshya and i am tall',
    'you are akshya and you are tall',
    'i am shristi and i am sharma',
    'timi haru ayena vane ma last risauxu hai guys',
    'I am Chepal and im a guitarist',
    'i am akshya and i am a freaking bollywood star',
    'I am little shristi and brand new miss nepal',
    'i promise me and chepal will finish fyp today.. yayy',
    'i am rozeen and i am best programmer in the world yayy yayy yayy',
    'i am rozeen and i am better and richer than bill gates',
    'amazon is my father business',
    'i am god',
    'i am chepal and i am better than rozeen',
    'i am rozeen and better than chepal',
    'i am chepal and i promise i will finish my fyp till 15th of april',
]

# Reuse the fitted vectorizer so the samples are encoded with the training
# vocabulary, then classify them.
predict_sample_data = vectorizer.transform(sample_data)
predicted = model.predict(predict_sample_data)

# Print each sentence next to its predicted class (label 1 means sarcastic).
for headline, label in zip(sample_data, predicted):
    verdict = "-> Sarcastic" if label == 1 else "-> Non Sarcastic"
    print(headline, verdict)

today is sunday -> Non Sarcastic
youre tall as a giant dwarf -> Sarcastic
former versace store clerk sues over secret  -> Non Sarcastic
youre very nice little pumpkin ! -> Sarcastic
you are chepal -> Non Sarcastic
you are rozeen -> Non Sarcastic
i am akshya and i am tall -> Sarcastic
you are akshya and you are tall -> Sarcastic
i am shristi and i am sharma -> Non Sarcastic
timi haru ayena vane ma last risauxu hai guys -> Sarcastic
I am Chepal and im a guitarist -> Non Sarcastic
i am akshya and i am a freaking bollywood star -> Non Sarcastic
I am little shristi and brand new miss nepal -> Non Sarcastic
i promise me and chepal will finish fyp today.. yayy -> Sarcastic
i am rozeen and i am best programmer in the world yayy yayy yayy -> Non Sarcastic
i am rozeen and i am better and richer than bill gates -> Non Sarcastic
amazon is my father business -> Non Sarcastic
i am god -> Sarcastic
i am chepal and i am better than rozeen -> Non Sarcastic
i am rozeen and better than chepal -> Non Sarc