<a href="https://colab.research.google.com/github/ahmedSalah-11/SureStart2021/blob/main/Day5/Sarcasm_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
import json
import tensorflow as tf
import pandas as pd
import numpy as np


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# --- Text vectorisation settings -------------------------------------------
vocab_size = 10_000       # passed as num_words to the Tokenizer and as the Embedding input size
oov_tok = "<OOV>"         # passed as oov_token to the Tokenizer
max_length = 100          # maxlen for pad_sequences and input_length of the Embedding layer
padding_type = "post"     # `padding` argument of pad_sequences
trunc_type = "post"       # `truncating` argument of pad_sequences

# --- Model / data-split settings -------------------------------------------
embedding_dim = 16        # width of each embedding vector
training_size = 20_000    # rows [0, training_size) are used for training, the rest for testing

In [67]:
# Read the headlines file as newline-delimited JSON (one record per line),
# which is what lines=True tells pandas to expect.
dataset_path = "/tmp/Sarcasm_Headlines_Dataset.json"
datastore = pd.read_json(dataset_path, lines=True)

# Preview the first few rows.
datastore.head()



Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [68]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
datastore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [69]:
# Extract the headline text and its label column as NumPy arrays.
# NOTE: the name `sentcences` (sic) is kept because the split cell refers to it.
sentcences, labels = (
    datastore[column].to_numpy() for column in ("headline", "is_sarcastic")
)

### Split data into training and testing portions

In [70]:

# Positional hold-out split: the first `training_size` rows train the model,
# everything after them is kept for testing. No shuffling is applied.
training_sentences, testing_sentences = sentcences[:training_size], sentcences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]


### Tokenization process

In [71]:
# Build the vocabulary from the training headlines only; the test headlines
# are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# One set of padding options shared by both splits, so they are padded and
# truncated the same way.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Text -> integer id sequences.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Integer id sequences -> padded arrays.
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

### Building the model

In [72]:
# Layer stack: embed each token id, average the embeddings over the sequence
# axis, then a small dense head ending in a single sigmoid unit.
layer_stack = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

# Binary cross-entropy matches the single sigmoid output.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [41]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


### Training and testing 

In [42]:
num_epochs = 30

# Fit on the training split. The held-out split is passed as validation data,
# so val_loss / val_accuracy are reported alongside the training metrics.
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 2s - loss: 0.6692 - accuracy: 0.5724 - val_loss: 0.6036 - val_accuracy: 0.6982
Epoch 2/30
625/625 - 2s - loss: 0.4511 - accuracy: 0.8195 - val_loss: 0.3906 - val_accuracy: 0.8398
Epoch 3/30
625/625 - 2s - loss: 0.3186 - accuracy: 0.8745 - val_loss: 0.3556 - val_accuracy: 0.8533
Epoch 4/30
625/625 - 2s - loss: 0.2640 - accuracy: 0.8977 - val_loss: 0.3467 - val_accuracy: 0.8545
Epoch 5/30
625/625 - 2s - loss: 0.2288 - accuracy: 0.9122 - val_loss: 0.3466 - val_accuracy: 0.8556
Epoch 6/30
625/625 - 2s - loss: 0.2035 - accuracy: 0.9241 - val_loss: 0.3583 - val_accuracy: 0.8518
Epoch 7/30
625/625 - 2s - loss: 0.1793 - accuracy: 0.9322 - val_loss: 0.3682 - val_accuracy: 0.8478
Epoch 8/30
625/625 - 2s - loss: 0.1625 - accuracy: 0.9391 - val_loss: 0.3760 - val_accuracy: 0.8509
Epoch 9/30
625/625 - 2s - loss: 0.1446 - accuracy: 0.9491 - val_loss: 0.3910 - val_accuracy: 0.8515
Epoch 10/30
625/625 - 2s - loss: 0.1321 - accuracy: 0.9534 - val_loss: 0.4182 - val_accuracy: 0.8453