In [1]:
import pandas as pd
import math
import json
import nltk

# LOADING THE DATASET

In [2]:
def read_data(filename):
    f = open(filename, 'r')
    
    df = pd.read_json(f, lines=True)
    
    df.drop(columns = ['article_link'])                    #drop the column 'article_link', as we only need the headline body
    del df['article_link']
    
    return df

In [3]:
fileName = 'Sarcasm_Headlines_Dataset.json'

data = read_data(fileName) # Read Data file

data

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [4]:
print('Total sentences in dataset:', len(data))

Total sentences in dataset: 26709


In [5]:
print('Total Sentences in dataset with sarcastic tag:', len(data[data['is_sarcastic'] == 0]))
print('Total Sentences in dataset with non-sarcastic tag:', len(data[data['is_sarcastic'] == 1]))

Total Sentences in dataset with sarcastic tag: 14985
Total Sentences in dataset with non-sarcastic tag: 11724


# 

#  Process data 
Removing all the stop words from the text and finding the lemma(perform lemmatization) of words.

In [6]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
#Method to Remove all the punctuation symbols from a sentence 

import re

def eliminate_punct(sent):
    sent = re.sub("[']", '', sent)
    sent = re.sub("[^\w]", ' ', sent)
    
    return sent

In [8]:
#Apply the lemmatization to each word in the dataset

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 


In [9]:
def remove_stop_words(sentence):
    words = sentence.split(' ')
    new_sent = ""

    for word in words:
        if word not in stop_words:
            word = lemmatizer.lemmatize(word)
            new_sent += word
            new_sent += " "
    
    return new_sent

In [10]:
def process(data):
    for i in range(0, len(data)):
        sentence = eliminate_punct(data.iloc[i,0])
        #sentence = remove_stop_words(sentence)
       
        data.iloc[i,0] = sentence.lower()
    return data
    
    

In [11]:
data = process(data)

In [12]:
print('The Data After PreProcessing')
data

The Data After PreProcessing


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret bl...,0
1,the roseanne revival catches up to our thorny ...,0
2,mom starting to fear sons web series closest t...,1
3,boehner just wants wife to listen not come up...,1
4,j k rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free fall,0
26705,americas best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


# 

# Building Tf-Vector (Term Frequency Vector)
The TF vector will be used for the VSM model as well as for eliminating the words with low frequency.

In [13]:
#This method would return the word with frequency greater than 5

def high_freq_words(tf_vector):
    final_tf = {}
    
    for word in tf_vector:
        if tf_vector[word] >= 5:
            final_tf[word] = tf_vector[word]
    
    return final_tf

In [14]:
def calc_tf(data):
    tf_words = {}
    
    for i in range(len(data['headline'])):
        line = data['headline'][i].split()
        
        for word in line:
            if word in tf_words:
                tf_words[word] += 1
            else:
                tf_words[word] = 1
    
    return high_freq_words(tf_words)

In [15]:
#Remove Low Freq Words
words_tf = calc_tf(data)

In [16]:
words_tf

{'former': 106,
 'store': 55,
 'clerk': 11,
 'sues': 17,
 'over': 483,
 'secret': 92,
 'black': 267,
 'code': 19,
 'for': 3321,
 'minority': 15,
 'the': 5415,
 'revival': 11,
 'catches': 14,
 'up': 842,
 'to': 8267,
 'our': 177,
 'political': 83,
 'mood': 13,
 'better': 124,
 'and': 1935,
 'worse': 40,
 'mom': 198,
 'starting': 45,
 'fear': 46,
 'sons': 36,
 'web': 19,
 'series': 62,
 'closest': 7,
 'thing': 144,
 'she': 199,
 'will': 558,
 'have': 494,
 'boehner': 27,
 'just': 579,
 'wants': 139,
 'wife': 84,
 'listen': 24,
 'not': 739,
 'come': 105,
 'with': 1805,
 'alternative': 13,
 'debt': 29,
 'ideas': 43,
 'j': 49,
 'k': 53,
 'rowling': 7,
 'wishes': 42,
 'happy': 64,
 'birthday': 64,
 'in': 4238,
 'most': 261,
 'magical': 32,
 'way': 266,
 'worlds': 93,
 'women': 329,
 'case': 94,
 'eating': 76,
 'lab': 8,
 'grown': 18,
 'meat': 28,
 'this': 920,
 'ceo': 73,
 'send': 35,
 'your': 653,
 'kids': 197,
 'school': 247,
 'if': 301,
 'you': 1004,
 'work': 187,
 'his': 633,
 'company':

# 

# TRAINING PHASE

# 

# Naive Bayes

In [17]:
def Probabilities(data):
    probabities = {}
    
    probabities[1] = data['is_sarcastic'].value_counts()[1]
    probabities[0] = data['is_sarcastic'].value_counts()[0]
    
    return probabities

In [18]:
def uniqueWords(train, vocab):
    words = {}
    
    for line in train['headline']:
        line = line.split()
        
        for word in line:
            if word in vocab:
                words[word] = 1
            
    return list(words.keys())            

In [19]:
def countBoth(X, words):                           #Method to count the total sarcastic sentence and non-sarcastic sentences
                                               #X is the trained dataset
    sarcastic_sent = 0
    non_sarc_sent = 0
    
    for i in range(0, len(X)):
        line = X.iloc[i]
        tag = line[1]
        line = line[0].split()
        
        for word in line:
            if tag == 1:
                sarcastic_sent += 1
            else:
                non_sarc_sent += 1
    
    return sarcastic_sent, non_sarc_sent
    

In [20]:
def wordsOccurences(train, words):                     #Method would calculate the number of times a word appeared in sarcastic as well as non-sarcastic sentence

    count_words = {}
    
    for i in range(0, len(train)):
        line = train.iloc[i]
        tag = line[1]
        line = line[0].split()
        
        for word in line:
            if word in words and word not in count_words:
                count_words[word] = {}
                count_words[word][0] = 1
                count_words[word][1] = 1
 
            elif tag == 1 and word in count_words:
                count_words[word][1] += 1
            
            elif tag == 0 and word in count_words:
                count_words[word][0] += 1
                
    
    return count_words     
    

In [21]:
def find_probs(train, sarc, non_sarc, word_count, total_unique):
    prob = {}
    p = Probabilities(train)
    prob['pr_0'] = p[0]
    prob['pr_1'] = p[1]

    prob['total_w'] = total_unique
    prob['tag0'] = non_sarc
    prob['tag1'] = sarc
    
    for word in word_count:
        prob[word] = {}
        prob[word][0] = ((word_count[word][0] + 1) / (total_unique + non_sarc))
        prob[word][1] = ((word_count[word][1] + 1) / (total_unique + sarc))
    
    return prob

In [22]:
def NaiveBayes(train, vocab):
    words = uniqueWords(train, vocab)
    
    total_words = len(words)
    
    sarc, non_sarc = countBoth(train, words)
    
    word_count = wordsOccurences(train, words)
    
    prob = find_probs(train, sarc, non_sarc, word_count, total_words)
    
    return prob
        

In [23]:

from sklearn.model_selection import train_test_split


In [24]:
X, Y = train_test_split(data, test_size = 0.3,shuffle=True)      #70% training and 30% testing

X.reset_index(inplace=True, drop=True)

Y.reset_index(inplace=True, drop=True)


In [25]:
print('Training Dataset')
X

Training Dataset


Unnamed: 0,headline,is_sarcastic
0,once homeless student who worked 4 jobs to sup...,0
1,james comeys book pre sold almost 200 000 copi...,0
2,syrian archbishop on christians threatened by ...,0
3,whoops franklin grahams new bank is lgbt frie...,0
4,chris christie neglects new jersey woes while ...,0
...,...,...
18691,grieving couple finds different ways to use st...,1
18692,kim kardashian celebrates 42 million insta fol...,0
18693,someone filming b roll at pike place market ri...,1
18694,local man knows he moved to minneapolis for so...,1


In [26]:
print('Testing DataSet')
Y

Testing DataSet


Unnamed: 0,headline,is_sarcastic
0,watch this cat lose its mind after faced with ...,0
1,president trumps war on children,0
2,the case for holistic education in the wake of...,0
3,khloe kardashian thanks fans for their patienc...,0
4,new therapist obsessed with old therapist,1
...,...,...
8008,new report finds americans most interested in ...,1
8009,a sadder pride because of washington inaction,0
8010,rob kardashian apparently tweeted kylie jenner...,0
8011,man humiliated by wi fis poor behavior in fron...,1


In [27]:
print("Length of Training Dataset: ", len(X))
print("Length of Testing dataset", len(Y))

Length of Training Dataset:  18696
Length of Testing dataset 8013


In [28]:
train_dataset = NaiveBayes(X, words_tf.keys())

# VSM Model

In [29]:
def calc_idf(document_frequency):
    idf = {}
    for i in document_frequency:
        
        for j in document_frequency[i]:
            if i not in idf:
                idf[i] = 0
            idf[i] += 1 
        
        if(idf[i] > 0):
            idf[i] = 1/idf[i]
    return idf


In [30]:
def preProcess():
    doc_vector = [dict() for x in range(51)]
    document_frequency = {}

    for i in range(1,51):
        for word in X.iloc[i,0].split():       
            #for word in line.split():
                if(True):
                    #word = clean(word)
                    #word = lemmatizer.lemmatize(word, pos =  "v")
                    if word not in document_frequency:
                        document_frequency[word] = []
                   
                    if word in document_frequency:
                        if i not in document_frequency[word]:
                            document_frequency[word].append(i)
                    
                    if word not in doc_vector[i]:
                        doc_vector[i][word] = 0
                    
                    if word in doc_vector[i]:
                        doc_vector[i][word] += 1
    

    idf = calc_idf(document_frequency)
    
    for i in range(1,51):
        for x in doc_vector[i]:
            doc_vector[i][x] = doc_vector[i][x] * idf[x]
        
    
    return idf, doc_vector, document_frequency

In [31]:

idf, doc_vector, document_frequency = preProcess()


In [32]:
def get_mod(vector):
    sum = 0
    for i in vector:
        sum += (vector[i] * vector[i])
    return math.sqrt(sum)


In [33]:

def dot_prod(v1, v2):
    cp = 0
    for i in v1:
        if i in v2:
            cp += v1[i] * v2[i]
    return cp

In [34]:
def is_sarcastic_vsm(sentence):
    score = -1
    index =0 
    count = 0
    
    for i in doc_vector:
        s1 = dot_prod(sentence, i)
        #s = score/(get_mod(sentence) * get_mod(i))
        if s1 >= score:
            score = s1
            index = count
        count += 1
    
    return X.iloc[index, 1]


# NGRAM

In [35]:

def n_gram_apply():
    ss= len(X)
    check = False
    for i in range(0,ss, 10):
        if(X.iloc[i,1] == 1 or check == True):
            if(X.iloc[i,1] == 1):
                check = True
            else:
                check = False
            #comment=eliminate_punct(data.iloc[i,0])
            ngram_vocab = ngrams(X.iloc[i,0].split(), 4)
            #print(data.iloc[i,0])
            new_df = {}
            for j in ngram_vocab:
                sentence = ""
                for k in j:
                    sentence = sentence + " " + str(k.lower())
                sentence = eliminate_punct(sentence)
                new_df['headline'] = sentence
                new_df['is_sarcastic'] = X.iloc[i,1]
                X=X.append(new_df, ignore_index = True)
                new_df.clear()


# 

# TESTING PHASE

In [36]:
def test_NaiveBayes(trained,test):
    result_list = []
    tf = 1

    for i in range(0, len(test)):
        predicted = ""
        true_prob = 1
        false_prob = 1
        line = test.iloc[i]
        line = line[0].split()
        
        for w in line:
            if w in trained:
                true_prob *= trained[w][1]
                false_prob *= trained[w][0]
            else:
                true_prob *= (1 + tf) / ((trained['total_w'] + trained['tag1']))
                false_prob *= (1 + tf) / ((trained['total_w'] + trained['tag0']))
        
        true_prob = (true_prob * trained['pr_1'])
        false_prob = (false_prob * trained['pr_0'])
        
        if true_prob > false_prob:
            predicted = 1
        else:
            predicted = 0
        
        result_list.append(predicted)
    
    return result_list

In [37]:
def TestVSM():
    sentence = {}                                  #Testing the VSM with the sentence "report"
    for i in sent.split():
        sentence[i] = 1
    #print(cross_prod(sentence, X.iloc[0,0]))
    print("Ans: ")
    print(is_sarcastic_vsm(sentence))


In [38]:
predicted = test_NaiveBayes(train_dataset,Y)

In [39]:
#TestVSM()

# 

# Finding the Accuracy of model

In [40]:
def Accuracy(actual, predicted):
    true_predicted = 0
    
    for i in range(0, len(predicted)):
        if(i < len(actual) and i < len(predicted) and actual[i] == predicted[i]):
            true_predicted += 1
    
    accuracy = (true_predicted / len(Y)) * 100
    
    return accuracy
    

In [41]:
vsm_predicted = []

def calc_vsm_accuracy():
    for i in range(0, 8013):
        sent = {}
        for w in Y.iloc[i,0].split():
            if w not in sent:
                sent[w] = 0
            sent[w] +=1
        vsm_predicted.append(is_sarcastic_vsm(sent))
        sent.clear()


In [42]:
actual = list(Y['is_sarcastic'])
naiveBayes_acc = Accuracy(actual, predicted)


In [43]:

calc_vsm_accuracy()
vsm_acc = Accuracy(actual, vsm_predicted)
print("VSM Accuracy is : ", vsm_acc)
print("VSM Rounded-Accurcy",math.ceil(vsm_acc))

VSM Accuracy is :  53.36328466242356
VSM Rounded-Accurcy 54


In [44]:
print("Naive Bayes Accuracy is : ", naiveBayes_acc)
print("Naive Bayes Rounded-Accurcy", math.ceil(naiveBayes_acc))

Naive Bayes Accuracy is :  84.89953825034318
Naive Bayes Rounded-Accurcy 85


In [45]:
predict_output = Y.copy()
predict_output = predict_output.drop(['is_sarcastic'],axis=1)
predict_output['is_sarcastic'] = predicted

In [46]:
print('The Predicted Output on test data')
predict_output

The Predicted Output on test data


Unnamed: 0,headline,is_sarcastic
0,watch this cat lose its mind after faced with ...,0
1,president trumps war on children,0
2,the case for holistic education in the wake of...,0
3,khloe kardashian thanks fans for their patienc...,0
4,new therapist obsessed with old therapist,1
...,...,...
8008,new report finds americans most interested in ...,1
8009,a sadder pride because of washington inaction,0
8010,rob kardashian apparently tweeted kylie jenner...,0
8011,man humiliated by wi fis poor behavior in fron...,1


# 

# Comparison Of Accuracies of both models

In [47]:
print('Naive Bayes Accuracy:', naiveBayes_acc)
print('VSM Accuracy:', vsm_acc)

Naive Bayes Accuracy: 84.89953825034318
VSM Accuracy: 53.36328466242356


###### The Naive Bayes Gained the accuracy of 85%, whereas The VSM gained the accuracy of 58%. The Naive Bayes model worked very well on the dataset.

# 

# Classify Based on user input

In [48]:
def isSarcastic_NaiveBayes(trained, sent):
    line=sent.split()
    true_prob=1
    false_prob=1
    tf=1
    
    for w in line:
        if w in trained:
            true_prob *= trained[w][1]
            false_prob *= trained[w][0]
        else:
            true_prob *= (1+tf) / ((trained['total_w'] + trained['tag1']))
            false_prob *= (1+tf) / ((trained['total_w'] + trained['tag0']))
    
    
    true_prob = (true_prob * trained['pr_1'])
    false_prob = (false_prob * trained['pr_0'])
    
    if(true_prob > false_prob):
        return "Sarcastic"
    else:
        return "Not Sarcastic"
        

In [None]:
sent_input=input("Enter The Sentence : ")

In [None]:
sent_input = eliminate_punct(sent_input)
sent_input = remove_stop_words(sent_input)

In [None]:
ans = isSarcastic_NaiveBayes(train_dataset,sent_input)

In [None]:
print(ans)