### **1. Import Libraries**

In [0]:
import pandas as pd 
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Sequential

### **Import Dataset**

In [0]:
df_v1 = pd.read_json('/content/Sarcasm_Headlines_Dataset.json',lines=True)
df_v2 = pd.read_json('/content/Sarcasm_Headlines_Dataset_v2.json',lines=True)

In [0]:
df = df_v1.append(df_v2, sort = False)

### **2.Preprocessing**
I will first delete the article_link column since it won't be needed for this project, and then check to see the proportion of sarcastic and not_sarcastic headlines.

In [0]:
df = df.iloc[:,[1,2]]
df[:5]

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [0]:
overview = df.groupby(by = "is_sarcastic")
overview.size()

is_sarcastic
0    29970
1    25358
dtype: int64

Even if all headlines seem to have already lowercased. I will do so just to make sure. I will also delete all punctuation. 

In [0]:
import numpy as np
tokenizer = Tokenizer(num_words = 5000, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
tokenizer.fit_on_texts(list(df['headline']))
X = tokenizer.texts_to_sequences(df['headline'])
print(X[2])
X = pad_sequences(X)
print(X[2])
Y = pd.get_dummies(df['is_sarcastic']).values
print(np.argmax(Y[2]))

[144, 764, 1, 929, 1818, 2194, 603, 4800, 222, 130, 38, 44, 1]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0  144  764    1  929 1818
 2194  603 4800  222  130   38   44    1]
1


In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.3, random_state=1)

### **3. Machine Learning Model**

Model used: bidirectional - RNN - LSTM

In [0]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=12))
model.add(Bidirectional(LSTM(16, return_sequences=True, recurrent_dropout=0.2, dropout=0.2)))
model.add(Bidirectional(LSTM(32, recurrent_dropout=0.2, dropout=0.2)))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer = 'rmsprop',
                  loss = 'binary_crossentropy',
                  metrics = ['acc'])

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 12)          120000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 32)          3712      
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                16640     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 140,482
Trainable params: 140,482
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
history = model.fit(X_train, y_train, epochs = 5, verbose = 2, validation_split = 0.2)
model.save('new_model.h5')

Train on 30983 samples, validate on 7746 samples
Epoch 1/5
 - 587s - loss: 0.4221 - acc: 0.7967 - val_loss: 0.3280 - val_acc: 0.8612
Epoch 2/5
 - 558s - loss: 0.3019 - acc: 0.8735 - val_loss: 0.3086 - val_acc: 0.8673
Epoch 3/5
 - 581s - loss: 0.2767 - acc: 0.8868 - val_loss: 0.3079 - val_acc: 0.8700
Epoch 4/5
 - 558s - loss: 0.2714 - acc: 0.8896 - val_loss: 0.3042 - val_acc: 0.8765
Epoch 5/5
 - 553s - loss: 0.2606 - acc: 0.8939 - val_loss: 0.2974 - val_acc: 0.8774


In [0]:
import numpy as np
true_result = []
for i in range(len(y_valid)):
  true_result.append(y_valid[i][1])
true_result

true_pos = 0
true_neg = 0
false_pos = 0
false_neg = 0

for i in range(len(y_valid)):
  pred = model.predict(X_valid[i].reshape(1,X_valid.shape[1]),batch_size=1,verbose = 2)
  pred = np.argmax(pred)
  
  if pred == 1: #positive
    if pred == true_result[i]: #true positive
      true_pos += 1
    else: #false positive
      false_pos += 1
  elif pred == 0: #negative
    if pred == true_result[i]: #true negative
      true_neg += 1
    else: #false negative
      false_neg+= 1
      
print(true_pos + true_neg + false_pos + false_neg)
print(len(y_valid))

precision = true_pos/(true_pos + false_pos)
recall = true_pos/(true_pos + false_neg)
accuracy = (true_pos + true_neg)/len(y_valid)
F1 = 2*((accuracy*recall)/(accuracy + recall))
print("precision: ")
print(precision)
print("recall: ")
print(recall)
print("accuracy: ")
print(accuracy)
print("F1: ")
print(F1)

KeyboardInterrupt: ignored