# Import Libraries

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import re
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, confusion_matrix, auc, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import pickle
import cv2
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import *
from tensorflow.keras import Model, Input, Sequential
from datetime import datetime
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import *
from tensorflow.keras.utils import plot_model
from tqdm import tqdm

In [2]:
# Display the TensorFlow, OpenCV and TensorFlow Hub versions in use.
tf.__version__, cv2.__version__, hub.__version__

('2.2.0', '4.5.4-dev', '0.12.0')

# Load data

In [3]:
# Read the line-delimited JSON file (one record per line) into `test` and
# preview the first rows. `test` is later used both as validation data for
# model.fit and as the evaluation set.
# NOTE(review): if this file shares headlines with the v2 file loaded into
# `train`, the reported accuracy is inflated — confirm the two are disjoint.
test = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
test.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
# Column dtypes and non-null counts of `test`.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [None]:
# Read the v2 headlines file (line-delimited JSON) into `train` and preview it.
# NOTE(review): this is an absolute /kaggle/input path, unlike the relative
# path used for `test`; presumably it only resolves on Kaggle — verify.
train = pd.read_json("/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json", lines=True)
train.head()

In [None]:
# Column dtypes and non-null counts of `train`.
train.info()

In [None]:
# Bar chart of how many training rows fall in each `is_sarcastic` class.
fig, ax = plt.subplots()
sns.countplot(data = train, x = "is_sarcastic", ax = ax)
ax.set_title("Class distribution")
plt.show()

In [None]:
def length(phrase):
  """Return the number of whitespace-separated tokens in `phrase`."""
  words = phrase.split()
  return len(words)

In [None]:
# Store each headline's word count in a new `length` column and preview it.
train["length"] = train["headline"].map(length)
train.head()

In [None]:
# Histogram (with KDE overlay) of the number of words per headline.
# sns.displot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: that would only emit an extra empty figure.
sns.displot(data = train, x = "length", kde = True)
plt.title("distribution of number of words in headlines")
plt.show()

In [None]:
# Print headline-length percentiles at three zoom levels: every 10th
# percentile, then 90-100 in steps of 1, then the 99-99.9 tail in steps of 0.1.
# A blank line separates the groups.
percentile_groups = [
  range(0, 101, 10),
  range(90, 101),
  [99, 99.10, 99.20, 99.30, 99.40, 99.50, 99.60, 99.70, 99.80, 99.90],
]

for group in percentile_groups:
  for i in group:
    print("{0}th percentile is {1}".format(i, np.percentile(train["length"], i)))
  print()

In [None]:
# Reference: https://stackoverflow.com/a/47091490/6645883

def decontracted(phrase):
    """Lowercase `phrase` and expand common English contractions.

    Parameters
    ----------
    phrase : str
        Text to normalise.

    Returns
    -------
    str
        The lowercased text with contractions such as "won't", "can't",
        "'re", "'ll" replaced by their expanded forms. Note that every "'s"
        becomes " is", including possessives.
    """
    # Lowercase first: every pattern below is lowercase-only, so matching on
    # the raw text would mis-expand "Won't" to "wo not" and skip "CAN'T".
    phrase = phrase.lower()

    # Order matters: the irregular forms come before the generic "n't" rule,
    # and "n't" comes before the bare "'t" rule.
    replacements = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in replacements:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:
# Apply the same contraction expansion to the headlines of both frames.
for frame in (train, test):
  frame["headline"] = frame["headline"].map(decontracted)

In [None]:
# Reference: https://www.geeksforgeeks.org/generating-word-cloud-python/

def wordcloud_plot(df, column = "headline"):
  """Draw a word cloud of the text in one column of `df`.

  Parameters
  ----------
  df : DataFrame
      Frame holding the text to visualise.
  column : str, optional
      Name of the text column; defaults to "headline".

  Each value is cast to str, split on whitespace and lowercased; the
  wordcloud package's STOPWORDS are excluded from the cloud.
  """
  stopwords = set(STOPWORDS)

  # Build the whole corpus with one join instead of growing a string inside
  # the loop. Every entry keeps a trailing space so entries stay separated.
  comment_words = "".join(
    " ".join(token.lower() for token in str(val).split()) + " "
    for val in df[column]
  )

  wordcloud = WordCloud(width = 800, height = 800,
          background_color = "white",
          stopwords = stopwords,
          min_font_size = 10).generate(comment_words)

  # plot the WordCloud image
  plt.figure(figsize = (8, 8), facecolor = None)
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.tight_layout(pad = 0)
  plt.show()

In [None]:
# Word cloud of the headlines in `train`.
wordcloud_plot(train)

In [None]:
# Word cloud of the headlines in `test`.
wordcloud_plot(test)

# Deep learning

In [None]:
# Target labels for each split, taken from the `is_sarcastic` column.
y_train, y_test = train["is_sarcastic"], test["is_sarcastic"]

In [None]:
# Download the `glove_vectors` file into the working directory; it is
# unpickled in a later cell.
!wget https://github.com/nagi1995/sarcastic-comment-detection/raw/main/glove_vectors

In [None]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# This file is fetched from a remote URL, so only run this against a source
# you trust.
with open("./glove_vectors", "rb") as fi:
  glove_model = pickle.load(fi)

# Keys of the loaded mapping, kept as a set for membership tests.
glove_words = set(glove_model.keys())

In [None]:
# Fit the tokenizer on the training headlines only, then reuse it for both
# splits.
t = Tokenizer()
t.fit_on_texts(train["headline"])

# Every sequence is padded or truncated (both at the end) to this many tokens.
max_length = 25

encoded_train, encoded_test = (
  t.texts_to_sequences(frame["headline"]) for frame in (train, test)
)

padded_train, padded_test = (
  pad_sequences(sequences,
                maxlen = max_length,
                padding = "post",
                truncating = "post")
  for sequences in (encoded_train, encoded_test)
)

print(padded_train.shape, padded_test.shape, type(padded_train))

# One more than the number of indexed words; used as the embedding input size.
vocab_size = len(t.word_index) + 1
vocab_size

In [None]:
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words with no GloVe entry keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 300)) # vector len of each word is 300

known = ((i, word) for word, i in t.word_index.items() if word in glove_words)
for i, word in known:
  embedding_matrix[i] = glove_model[word]

embedding_matrix.shape

### callbacks

In [None]:
# Load the `tensorboard` IPython extension.
%load_ext tensorboard

In [None]:
def checkpoint_path():
  """Return the filepath template used for model checkpoints.

  The `{epoch}` and `{val_accuracy}` placeholders are returned unformatted;
  presumably the ModelCheckpoint callback this is passed to fills them in.
  """
  template = "./model/weights.{epoch:02d}-{val_accuracy:.4f}.hdf5"
  return template

def log_dir():
  """Return a timestamped log directory path under ./logs/fit/.

  The timestamp is taken at call time, so each call gives a distinct
  directory (at one-second resolution). It contains only digits and hyphens:
  ":" is not a legal character in Windows file names, so a "%H:%M:%S" style
  stamp could not be created as a directory there.
  """
  return "./logs/fit/" + datetime.now().strftime("%Y-%m-%d-%H%M%S")

# Stop training once validation accuracy has not improved for 7 epochs and
# roll the model back to its best weights.
earlystop = EarlyStopping(monitor = "val_accuracy", 
                          patience = 7, 
                          verbose = 1,  
                          restore_best_weights = True, 
                          mode = 'max')

# Multiply the learning rate by `factor` when validation accuracy has not
# improved by at least min_delta for 3 epochs.
# NOTE(review): 0.4642 is approximately 10 ** (-1/3), so three reductions
# shrink the rate about tenfold — presumably the intent; confirm.
reduce_lr = ReduceLROnPlateau(monitor = "val_accuracy", 
                              factor = .4642,
                              patience = 3,
                              verbose = 1, 
                              min_delta = 0.001,
                              mode = 'max')


### model building

In [None]:
# Build and compile the classifier: frozen GloVe embedding -> LSTM -> small
# dense head -> 2-way softmax.
tf.keras.backend.clear_session()
# NOTE(review): `input` shadows the Python builtin of the same name for the
# rest of the notebook.
input = Input(shape = (max_length, ), name = "input")

# Embedding weights come from embedding_matrix and are not updated in training.
embedding = Embedding(input_dim = vocab_size, 
                      output_dim = 300, # glove vector size
                      weights = [embedding_matrix], 
                      trainable = False)(input)

lstm = LSTM(32)(embedding)
# NOTE(review): LSTM(32) without return_sequences should already emit one
# vector per sample, so this Flatten looks redundant — confirm before removing.
flatten = Flatten()(lstm)

# Linear dense layer; dropout is applied before the ReLU activation below.
dense = Dense(16, activation = None, 
              kernel_initializer = "he_uniform")(flatten)

dropout = Dropout(.25)(dense)
activation = Activation("relu")(dropout)
# Two output units (one per class) with softmax, paired with the sparse
# categorical cross-entropy loss on the integer `is_sarcastic` labels.
output = Dense(2, activation = "softmax", name = "output")(activation)
model = Model(inputs = input, outputs = output)

model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

# Write the architecture diagram to ./model.png; a later cell reads it back.
plot_model(model, to_file = "./model.png", show_shapes = True)

model.summary()

In [None]:
# Show the architecture diagram written by plot_model.
image = cv2.imread("./model.png")
if image is None:
  # cv2.imread reports a missing or unreadable file by returning None rather
  # than raising, which would otherwise surface as an opaque cvtColor error.
  raise FileNotFoundError("could not read ./model.png; check that plot_model wrote it")

# OpenCV loads images as BGR, while matplotlib expects RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# No cmap is passed: matplotlib ignores it for RGB image data.
plt.figure(figsize = (10, 20))
plt.imshow(image)
plt.show()

### training model

In [None]:
# TensorBoard logging for this run, written to the directory from log_dir().
tensorboard_callback = TensorBoard(log_dir = log_dir(), 
                                   histogram_freq = 1, 
                                   write_images = True)

# Save a checkpoint only when validation accuracy improves.
checkpoint = ModelCheckpoint(filepath = checkpoint_path(), 
                             monitor='val_accuracy', 
                             verbose = 1, 
                             save_best_only = True, 
                             mode = "max")

# tensorboard_callback must be in this list for any logs to be written.
callbacks_list = [checkpoint, earlystop, reduce_lr, tensorboard_callback]

# NOTE(review): the test split doubles as validation data, so early stopping,
# LR scheduling and checkpoint selection are tuned on the same data that the
# final evaluation uses.
history = model.fit(padded_train, y_train, 
                    validation_data = (padded_test, y_test), 
                    epochs = 30, 
                    batch_size = 32, 
                    callbacks = callbacks_list)

In [None]:
# Plot the training and validation loss/accuracy curves against the epoch
# number (1-based).
train_log = history.history
L = len(train_log["loss"]) + 1
epochs = range(1, L)

# (history key, matplotlib format string) for each curve, in drawing order.
curves = [
  ("loss", "bo-"),
  ("accuracy", "g*-"),
  ("val_loss", "y^-"),
  ("val_accuracy", "ro-"),
]

plt.figure()
for key, fmt in curves:
  plt.plot(epochs, train_log[key], fmt, label = key)
plt.legend()
plt.xlabel("epoch")
plt.grid()
plt.show()

### testing model

In [None]:
# Turn the softmax output into hard labels: predict class 0 when the first
# column is at least 0.5, otherwise class 1.
y_pred_softmax = model.predict(padded_test)
y_pred = np.where(y_pred_softmax[:, 0] >= 0.5, 0, 1).tolist()

print("Accuracy:", 100*accuracy_score(y_test, y_pred))

# Confusion matrix with actual labels on the rows and predictions on the
# columns.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot = True, fmt = "d")
ax.set_xlabel("predicted label")
ax.set_ylabel("actual label")
ax.set_title("test confusion matrix")
plt.show()