![alt text](https://drive.google.com/uc?export=view&id=1UXScsVx_Wni_JuDdB8LeTnM6jsPfIwkW)

Proprietary content. © Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited.

### Package Version:
- tensorflow==2.2.0
- pandas==1.0.5
- numpy==1.18.5
- google==2.0.3

# Sarcasm Detection

### Dataset

#### Acknowledgement
Misra, Rishabh, and Prahal Arora. "Sarcasm Detection using Hybrid Neural Network." arXiv preprint arXiv:1908.07414 (2019).

**Required Files given in below link.**

https://drive.google.com/drive/folders/1xUnF35naPGU63xwRDVGc-DkZ3M8V5mMk

### Load Data (3 Marks)

In [None]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU
import tensorflow as tf


In [None]:
df = pd.read_json("../gdrive/My Drive/Project/Sarcasm Detection/Data/Sarcasm_Headlines_Dataset.json", lines=True)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


### Drop `article_link` from dataset (3 Marks)

In [None]:
df.isna().sum()

article_link    0
headline        0
is_sarcastic    0
dtype: int64

In [None]:
del df['article_link']


In [None]:
df.head

<bound method NDFrame.head of                                                 headline  is_sarcastic
0      former versace store clerk sues over secret 'b...             0
1      the 'roseanne' revival catches up to our thorn...             0
2      mom starting to fear son's web series closest ...             1
3      boehner just wants wife to listen, not come up...             1
4      j.k. rowling wishes snape happy birthday in th...             0
...                                                  ...           ...
26704               american politics in moral free-fall             0
26705                            america's best 20 hikes             0
26706                              reparations and obama             0
26707  israeli ban targeting boycott supporters raise...             0
26708                  gourmet gifts for the foodie 2014             0

[26709 rows x 2 columns]>

### Get length of each headline and add a column for that (3 Marks)

In [None]:
df["Headline Length"]=df["headline"].str.len()

In [None]:
df

Unnamed: 0,headline,is_sarcastic,Headline Length
0,former versace store clerk sues over secret 'b...,0,78
1,the 'roseanne' revival catches up to our thorn...,0,84
2,mom starting to fear son's web series closest ...,1,79
3,"boehner just wants wife to listen, not come up...",1,84
4,j.k. rowling wishes snape happy birthday in th...,0,64
...,...,...,...
26704,american politics in moral free-fall,0,36
26705,america's best 20 hikes,0,23
26706,reparations and obama,0,21
26707,israeli ban targeting boycott supporters raise...,0,60


### Initialize parameter values
- Set values for max_features, maxlen, & embedding_size
- max_features: Number of words to take from tokenizer(most frequent words)
- maxlen: Maximum length of each sentence to be limited to 25
- embedding_size: size of embedding vector

In [None]:
max_features = 10000
maxlen = 25
embedding_size = 200

### Apply `tensorflow.keras` Tokenizer and get indices for words (3 Marks)
- Initialize Tokenizer object with number of words as 10000
- Fit the tokenizer object on headline column
- Convert the text to sequence


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['headline'])
sequences = tokenizer.texts_to_sequences(df['headline'])



### Pad sequences (3 Marks)
- Pad each example with a maximum length
- Convert target column into numpy array

In [None]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
X = pad_sequences(sequences, maxlen=maxlen)
Y = df['is_sarcastic'].values
Y = np.vstack(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3, random_state = 42)

### Vocab mapping
- There is no word for 0th index

In [None]:
tokenizer.word_index
print(tokenizer.word_index)
val=len(tokenizer.word_index)
print(val)

29656


### Set number of words
- Since the above 0th index doesn't have a word, add 1 to the length of the vocabulary

In [None]:
num_words = len(tokenizer.word_index) + 1
print(num_words)

29657


### Load Glove Word Embeddings (3 Marks)

### Create embedding matrix

In [None]:
EMBEDDING_FILE = '../gdrive/My Drive/Project/Sarcasm Detection/Data/glove.6B.200d.txt'

In [None]:
EMBEDDING_FILE = '../gdrive/My Drive/Project/Sarcasm Detection/Data/glove.6B.200d.txt'

embeddings = {}
for o in open(EMBEDDING_FILE):
    word = o.split(" ")[0]
    #print(word)
    embd = o.split(" ")[1:]
    embd = np.asarray(embd, dtype='float32')
    #print(embd)
    embeddings[word] = embd

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((num_words, 200))

for word, i in tokenizer.word_index.items():
	embedding_vector = embeddings.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -1.5387e-01  8.5432e-02 -5.3695e-01  4.6054e-01 -5.5538e-01 -5.9139e-01
 -1.5517e-01 -2.8654e-01  4.1996e-01 -2.6805e-01  2.1583e-01  3.8380e-01
 -8.3423e-02 -2.3261e-01  1.4032e-01 -1.1815e+00  4.0826e-01 -5.2669e-01
  1.4970e-01 -6.4766e-03 -2.9932e-02 -9.4087e-01  9.2404e-03 -4.9584e-01
 -1.7578e-01  1.6995e-01 -5.8417e-02 -1.8995e-01 -2.3456e-02  4.8558e-01
 -1.8241e-01 -6.8930e-01  9.8291e-02  5.4780e-02 -3.3840e-01 -4.1022e-01
  2.9928e-01 -9.5813e-01 -3.1496e-01  5.7035e-02 -3.5640e-01  1.0647e+00
 -1.0382e-01  5.5537e-01]
layout
[-0.097726   0.67313    0.5728     0.21874    0.58021   -0.10658
 -0.87721   -0.98366    0.36056   -1.2625     0.60296   -0.089173
  0.0070461  0.29342    0.21021   -0.46345    0.39215   -0.37537
  0.42455    0.015697  -0.23133    1.5437     0.51877   -0.14651
  0.72707    0.021268   0.35409   -0.53086   -0.50287   -0.51295
 -0.59423    0.22163    0.21011   -0.27612    0.11577   -0.031852

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -1.1287e-01  8.5896e-02  2.4258e-03 -2.6241e-01 -1.0850e-01  4.6628e-01
 -1.1001e+00  3.7631e-01  1.2846e+00 -2.4567e-01 -1.5709e-01  3.6508e-01
 -2.7061e-01  1.7334e-01 -1.6547e-01  4.9802e-01 -3.3223e-01  2.0194e-01
  5.8037e-01 -4.8517e-01 -3.0753e-01 -4.6274e-01 -8.6334e-02  7.6703e-01
 -4.2603e-01  3.2898e-01 -1.1632e-01 -2.5088e-01  3.2618e-01  3.3916e-01
  6.7178e-01 -5.4603e-01 -8.9994e-02  8.4794e-01  4.1088e-02 -8.9179e-02
 -9.8843e-01 -2.2392e-01  2.3460e-01  3.8945e-01 -4.6112e-01 -8.4757e-01
 -5.5709e-01 -3.0529e-01 -3.0192e-01  3.5590e-01 -2.6058e-01  4.0933e-01
 -1.5966e-01 -7.1914e-03]
microscope
[-3.6878e-01 -4.4945e-01 -1.4459e-01 -1.6215e-03  5.1812e-01 -2.8997e-01
 -7.7140e-02  1.6159e-02  5.9800e-01 -2.0593e-01  4.7015e-01  4.7996e-01
  5.1843e-01 -1.1352e-01 -2.1708e-01  2.7951e-01 -5.7146e-02  1.6563e-01
  1.9735e-01  3.3992e-01 -7.5751e-01  6.2289e-01  5.2424e-01 -4.4953e-01
  7.2955e-01 -3.4933e-

Define model (5 Marks)
- Hint: Use Sequential model instance and then add Embedding layer, Bidirectional(LSTM) layer, then dense and dropout layers as required. 
In the end add a final dense layer with sigmoid activation for binary classification.

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GlobalMaxPool1D,Dense,Dropout,Conv1D,MaxPooling1D,Bidirectional,LSTM

model = Sequential()
model.add(Embedding(num_words,embedding_size, weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(units=128 , return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(40, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


In [38]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 200)         5931400   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 256)         336896    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 40)                10280     
_________________________________________________________________
dropout (Dropout)            (None, 40)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                820       
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)               

### Compile the model (3 Marks)

In [39]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

### Fit the model (4 Marks)

In [41]:
batch_size=100
epochs=10
history=model.fit(X,Y,batch_size=batch_size,epochs=epochs,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
#Reducing the layers

model1=Sequential()
model1.add(Embedding(num_words,embedding_size,weights=[embedding_matrix]))
model1.add(Bidirectional(LSTM(units=128,recurrent_dropout=0.5,dropout=0.5)))
model1.add(Dense(40,activation='relu'))
model1.add(Dropout(0.5))
model1.add(Dense(1,activation='sigmoid'))



In [43]:
model1.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])

In [44]:
batch_size=100
epochs=10
history=model.fit(X,Y,batch_size=batch_size,epochs=epochs,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
import matplotlib.pyplot as plt
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(15,5))
fig.suptitle("Performances of th embeddings (FAST TEXT")
ax1.plot(history.history['acc'])
ax1.plot(history.history['val_acc'])
vline_cut = np.where(history.history['val_acc'] == np.max(history.history['val_acc']))[0][0]
ax1.axvline(x=vline_cut,color='k',linestyle='--')
ax1.set_title("Model Accuracy")
ax1.legend(['train','test'])

ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
vline_cut = np.where(history.history['val_loss'] == np.max(history.history['val_loss']))[0][0]
ax2.axvline(x=vline_cut,color='k',linestyle='--')
ax2.set_title("Model Loss")
ax2.legend(['train','test'])
plt.show()