# Using NLP to detect sarcasm in text

In this notebook we will train a deep learning model to detect if there is a sarcasm into an input sentence or not 

In [130]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import tensorflow as tf
import numpy as np

we will read the dataset from a json file (sarcasm.json) , here is an exemple of an item from this file : 

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 
'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", '
is_sarcastic': 0}



In [None]:
with open('../data/sarcasm.json','r') as f: 
    datasources = json.load(f)
    
sentenses = []
labels = []
urls = []

for item in datasources :
    sentenses.append(item["headline"])
    labels.append(item["is_sarcastic"])
    urls.append(item["article_link"])
    

After reading our data we will now using a Tokenize to tokenize our words and give each one a unique Id (int)

In [80]:
tokenizer = Tokenizer(oov_token="Other")
tokenizer.fit_on_texts(sentenses)
word_index = tokenizer.word_index


The word_index is a dict which contains all the words and their ids {"word" : 1, "word2" : 2,...}

after that we are going to convert our sentences into a sequences of integers each one represent a word

In [82]:
sequences = tokenizer.texts_to_sequences(sentenses)
padd = pad_sequences(sequences,padding="post")

# Split the data 

In [38]:
test_size = int(len(sentenses) * 0.2) +1
train_size = int(len(sentenses)*0.8) 

In [89]:
train_sentenses = sentenses[0:train_size+1]
test_sentenses  = sentenses[train_size+1:]
train_labels = labels[0:train_size+1]
test_labels= labels[train_size+1:]

In [94]:
OOV_TRAIN = "ohters_train"
MAX_LEN = len(max(sentenses))

In [97]:
MAX_LEN

96

In [41]:
train_tokenizer = Tokenizer(oov_token = OOV_TRAIN)

In [42]:
train_tokenizer.fit_on_texts(train_sentenses)


In [54]:
train_sentenses

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages",
 'airline passengers tackle man who rushes cockpit in bomb threat',
 'facebook reportedly working on healthcare features and apps',
 "north korea praises trump and urges us voters to reject 'dull hillary'",
 "actually, cnn's jeffrey lord has been 'indefensible' for a while",
 'barcelona holds huge protest in su

In [43]:
train_tokenizer.word_index

{'ohters_train': 1,
 'to': 2,
 'of': 3,
 'the': 4,
 'in': 5,
 'for': 6,
 'a': 7,
 'on': 8,
 'and': 9,
 'with': 10,
 'is': 11,
 'new': 12,
 'trump': 13,
 'man': 14,
 'from': 15,
 'at': 16,
 'about': 17,
 'you': 18,
 'by': 19,
 'this': 20,
 'after': 21,
 'up': 22,
 'out': 23,
 'be': 24,
 'how': 25,
 'that': 26,
 'it': 27,
 'as': 28,
 'not': 29,
 'are': 30,
 'your': 31,
 'what': 32,
 'his': 33,
 'all': 34,
 'he': 35,
 'who': 36,
 'just': 37,
 'has': 38,
 'will': 39,
 'more': 40,
 'into': 41,
 'one': 42,
 'year': 43,
 'report': 44,
 'have': 45,
 'over': 46,
 'area': 47,
 'why': 48,
 'donald': 49,
 'u': 50,
 'day': 51,
 'can': 52,
 'says': 53,
 's': 54,
 'first': 55,
 'woman': 56,
 'time': 57,
 'like': 58,
 'get': 59,
 'her': 60,
 'old': 61,
 "trump's": 62,
 'no': 63,
 'now': 64,
 'off': 65,
 'an': 66,
 'life': 67,
 'obama': 68,
 'people': 69,
 'than': 70,
 'women': 71,
 "'": 72,
 'house': 73,
 'back': 74,
 'was': 75,
 'still': 76,
 'white': 77,
 'make': 78,
 'down': 79,
 'clinton': 80,
 'm

In [98]:
train_sequences = train_tokenizer.texts_to_sequences(train_sentenses)
test_sequences = train_tokenizer.texts_to_sequences(test_sentenses)

In [101]:
MAX_LEN = max (max([len(x) for x in train_sequences]),max([len(x) for x in test_sequences]))
MAX_LEN

40

In [102]:
padd_train = pad_sequences(train_sequences, padding= "post",maxlen=MAX_LEN)
padd_test = pad_sequences(test_sequences,padding="post",maxlen=MAX_LEN)

In [107]:
values = []
for item in train_tokenizer.word_index.items():
    values.append(item[1])

In [108]:
print(max(values)+1)

26510


# Creating and Training the model

In [109]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max(values)+1,2,input_length=MAX_LEN),
    tf.keras.layers.GlobalAveragePooling1D(), 
    tf.keras.layers.Dense(24,activation="relu"), 
    tf.keras.layers.Dense(1,activation="sigmoid")
])

In [110]:
model.compile(loss="binary_crossentropy",optimizer = "adam",metrics=["accuracy"])

In [111]:
model.fit(padd_train,np.array(train_labels),epochs= 60,validation_data=(padd_test,np.array(test_labels)),verbose=2)

Epoch 1/60
668/668 - 1s - loss: 0.6601 - accuracy: 0.5925 - val_loss: 0.5754 - val_accuracy: 0.8042
Epoch 2/60
668/668 - 1s - loss: 0.4309 - accuracy: 0.8440 - val_loss: 0.3836 - val_accuracy: 0.8472
Epoch 3/60
668/668 - 1s - loss: 0.2983 - accuracy: 0.8883 - val_loss: 0.3481 - val_accuracy: 0.8575
Epoch 4/60
668/668 - 1s - loss: 0.2336 - accuracy: 0.9145 - val_loss: 0.3374 - val_accuracy: 0.8611
Epoch 5/60
668/668 - 1s - loss: 0.1873 - accuracy: 0.9326 - val_loss: 0.3392 - val_accuracy: 0.8592
Epoch 6/60
668/668 - 1s - loss: 0.1517 - accuracy: 0.9470 - val_loss: 0.3473 - val_accuracy: 0.8603
Epoch 7/60
668/668 - 1s - loss: 0.1213 - accuracy: 0.9603 - val_loss: 0.3626 - val_accuracy: 0.8575
Epoch 8/60
668/668 - 1s - loss: 0.0978 - accuracy: 0.9690 - val_loss: 0.3823 - val_accuracy: 0.8530
Epoch 9/60
668/668 - 1s - loss: 0.0793 - accuracy: 0.9766 - val_loss: 0.4078 - val_accuracy: 0.8506
Epoch 10/60
668/668 - 1s - loss: 0.0640 - accuracy: 0.9813 - val_loss: 0.4355 - val_accuracy: 0.8512

<tensorflow.python.keras.callbacks.History at 0x22fe63822e0>

In [113]:
model.evaluate(padd_test)



[0.0, 0.0]

In [114]:
len(np.array(train_labels))

21368

In [115]:
len(sentenses)

26709

In [116]:
len(sentenses)

26709

In [117]:
values = []
for item in train_tokenizer.word_index.items():
    values.append(item[1])

In [118]:
max(values)

26509

In [156]:
def predict(model,sentence): 
    sequence = train_tokenizer.texts_to_sequences([sentence])
    padd = pad_sequences(sequence,maxlen=MAX_LEN)
    prediction = model.predict(padd)
    # we are checking if the predictions is close to 0 or to 1
    print(prediction)
    if prediction <0.5: 
        print("The sentence does not contains Sarcasm ")
    else: 
        print("The sentence contains Sarcasm ")
        

In [157]:
new_sentence = "grany starting to fear the spiders in the garden might be real"
predict(model,new_sentence)

[[0.05485606]]
The sentence does not contains Sarcasm 


In [158]:
predict(model,"No ,you didn´t expect him to get  kiiled.")

[[0.00059581]]
The sentence does not contains Sarcasm 


In [161]:
predict(model,"they weren´t there because they had any say about it")

[[1.]]
The sentence contains Sarcasm 
