In [1]:
import pandas as pd
# Reading json files
df = pd.read_json("/content/drive/MyDrive/Projects/sarcasm_detector/Sarcasm_Headlines_Dataset.json", lines=True)
dfv2 = pd.read_json("/content/drive/MyDrive/Projects/sarcasm_detector/Sarcasm_Headlines_Dataset_v2.json", lines=True)
# Combining both datasets
train_df=pd.concat([df,dfv2])
train_df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [2]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled['headline'].astype(str)
train_df_shuffled.head()

Unnamed: 0,article_link,headline,is_sarcastic
25962,https://www.huffingtonpost.com/entry/stephen-c...,stephen colbert reveals the back-up slogans fo...,0
273,https://politics.theonion.com/hooded-members-o...,hooded members of congress drown another love ...,1
6087,https://local.theonion.com/man-knows-he-must-r...,man knows he must ride unexpected urge to clea...,1
24057,https://www.theonion.com/area-family-awakes-to...,area family awakes to find michelle obama tend...,1
26448,https://www.huffingtonpost.com/entry/james-cor...,james corden and harry styles kiss for holiday...,0


In [3]:
from sklearn.model_selection import train_test_split

X = train_df_shuffled.headline.values
y = train_df_shuffled.is_sarcastic.values

training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(X, y, test_size=0.2)

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

filename = 'tokenizer.pickle'
pickle.dump(tokenizer, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [5]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                544       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 160,577
Trainable params: 160,577
Non-trainable params: 0
_________________________________________________________________


In [8]:
num_epochs = 30
history = model.fit(training_padded, training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [9]:
from tensorflow.keras.models import save_model
save_model(model,'model.h5')


In [10]:
import tensorflow as tf
from tensorflow import keras

model = keras.models.load_model('model.h5')


In [11]:
sentence = 'Earthquake damage is caused by shaking'

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

tk = Tokenizer()
with open('/content/drive/MyDrive/Projects/sarcasm_detector/tokenizer.pickle', 'rb') as handle:
    tk = pickle.load(handle)

In [13]:
X = tk.texts_to_sequences([sentence])
X = pad_sequences(X, maxlen=100, padding='post', value=0)
pred = model.predict(X)

In [14]:
pred[0]

array([0.9998349], dtype=float32)