In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install contractions


**Contractions** are shortened versions of words or syllables. They exist in both written and spoken English and are created by removing specific letters and sounds.

# ***Importing the libraries***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly import graph_objs as go
import plotly.figure_factory as ff


#Text Preprocessing libraries
import nltk
nltk.download('stopwords')
import re 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.tokenize import word_tokenize
import contractions
from nltk.stem import SnowballStemmer


from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


import xgboost as xgb
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Model

nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Colour palette (hex codes); primary_red and primary_blue colour the two
# class traces in the headline-length figure.
primary_bgcolor = "#f4f0ea"
primary_black = "#202022"
primary_grey = "#c6ccd8"
primary_blue3 = "#3f4d63"
primary_red = "#eb345b"
primary_blue = "#496595"

# ***Loading the news-headlines-dataset***

In [None]:
# lines=True: the file is read as one JSON object per line.
headlines_path = "../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(headlines_path, lines=True)
df.head()

In [None]:
# Summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Print a banner, then show the (rows, columns) tuple as the cell output.
print("-------Shape of data--------")
df.shape

In [None]:
# Keep an independent copy of the frame as loaded, before columns are added or dropped.
df_copy = df.copy()

In [None]:
# Class balance: number of rows for each is_sarcastic value.
df['is_sarcastic'].value_counts()

In [None]:
# The third run of word characters in the link is kept as the source name
# (for "https://www.example.com/..." that is "example").
df['source_of_text'] = df['article_link'].map(lambda link: re.findall(r'\w+', link)[2])
df.head()

In [None]:
# The raw link is no longer needed once source_of_text has been derived from it.
df = df.drop(columns=['article_link'])

In [None]:
# Preview the frame after dropping article_link.
df.head()

In [None]:
# px.pie expects `labels` to be a dict of column name -> display name, so a
# list of slice names has no effect and the slices show raw 0/1. Map the target
# to readable class names and use that column for the slices instead.
pie_df = df.assign(label=df['is_sarcastic'].map({1: 'Sarcastic', 0: 'Acclaim'}))
px.pie(pie_df, names='label', title='Sarcasm Vs Acclaim', template='plotly_dark')

In [None]:
# Switch matplotlib to the built-in 'ggplot' style for the figures that follow.
plt.style.use('ggplot')

In [None]:
# Number of headlines per source site. The x axis shows the source names and
# the y axis their row counts, so the axis labels say exactly that.
target_counts = df['source_of_text'].value_counts()
plt.figure(figsize=(15, 7))
sns.barplot(y=target_counts, x=target_counts.index)
plt.title("Counting the values in Source column", fontsize=24)
plt.ylabel('Number of headlines')
plt.xlabel('Source')

In [None]:
# Word count per headline, splitting on single space characters.
df['message_len'] = df['headline'].str.split(' ').str.len()
df.head()

In [None]:
# How many headlines of each class have a given length (in words).
sarcastic_df = df[df['is_sarcastic'] == 1]['message_len'].value_counts().sort_index()
not_sarcastic_df = df[df['is_sarcastic'] == 0]['message_len'].value_counts().sort_index()

# One filled area trace per class, drawn on the same axes for comparison.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=sarcastic_df.index,
    y=sarcastic_df.values,
    name='Sarcastic',
    fill='tozeroy',
    marker_color=primary_red,
))
fig.add_trace(go.Scatter(
    x=not_sarcastic_df.index,
    y=not_sarcastic_df.values,
    name='Acclaim',
    fill='tozeroy',
    marker_color=primary_blue,
))
# The title now describes what is plotted (the old text belonged to another chart).
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Headline Length (Words) by Class</span>',
    xaxis_title='Words per headline',
    yaxis_title='Number of headlines',
)
fig.update_xaxes(range=[0, 70])
fig.show()

# ***Preprocessing the data***

In [None]:
def _clean_headline(text):
    """Normalise one headline to lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case;
      2. strip links, @handles and #hashtags -- this has to happen while the
         '@' and '#' markers are still present, i.e. before punctuation is removed;
      3. keep only runs of word characters;
      4. drop stand-alone single letters, replacing them with a space so the
         neighbouring words are not fused together;
      5. collapse whitespace.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)    # links
    text = re.sub(r'@\S+', '', text)       # twitter handles
    text = re.sub(r'\B#\S+', '', text)     # hashtags
    text = ' '.join(re.findall(r'\w+', text))
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    return ' '.join(text.split())


df['headline'] = df['headline'].apply(_clean_headline)

In [None]:
# Stop-word vocabulary: NLTK's English list plus a few extra short tokens.
# Stored as a set because remove_stopwords() tests membership once per word.
more_stopwords = ['u', 'im', 'c']
stop_words = set(stopwords.words('english')) | set(more_stopwords)

def remove_stopwords(text):
    """Return `text` without the space-separated words that appear in `stop_words`."""
    kept = [token for token in text.split(' ') if token not in stop_words]
    return ' '.join(kept)
    
# headline_clean holds the normalised headline with stop words removed.
df['headline_clean'] = df['headline'].map(remove_stopwords)
df.head()

In [None]:
def tokenization(text):
    """Split `text` into a list of word tokens (maximal runs of word characters).

    Empty input, or text that starts or ends with punctuation, yields no
    empty-string tokens, so downstream word counts are not polluted by ''.
    """
    # Raw-string pattern: '\w' in a normal string literal is an invalid escape.
    return re.findall(r'\w+', text)

# Lower-case each cleaned headline and split it into a list of tokens.
df['tokenized'] = df['headline_clean'].map(lambda headline: tokenization(headline.lower()))
df.head()

In [None]:
# is_sarcastic == 1 marks the sarcastic headlines (the filter used to select 0).
sarcastic = df[df['is_sarcastic']==1]['headline_clean']
sarcastic[:10]

In [None]:
# is_sarcastic == 0 marks the non-sarcastic headlines (the filter used to select 1).
not_sarcastic = df[df['is_sarcastic']==0]['headline_clean']
not_sarcastic[:10]

In [None]:
# English Snowball stemmer shared by stemm_text().
stemmer = SnowballStemmer("english")

def stemm_text(text):
    """Stem every space-separated word of `text` and re-join them with spaces."""
    stems = [stemmer.stem(token) for token in text.split(' ')]
    return ' '.join(stems)

# Replace headline_clean with its stemmed form (overwrites the column in place).
df['headline_clean'] = df['headline_clean'].map(stemm_text)
df.head()

# ***Word Cloud***

In [None]:
# Word cloud built from the non-sarcastic headlines (is_sarcastic == 0).
plt.figure(figsize=(20, 20))
not_sarcastic_text = " ".join(df.loc[df['is_sarcastic'] == 0, 'headline_clean'])
wc = WordCloud(max_words=2000, width=1600, height=800).generate(not_sarcastic_text)
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Word cloud built from the sarcastic headlines (is_sarcastic == 1).
plt.figure(figsize=(20, 20))
sarcastic_text = " ".join(df.loc[df['is_sarcastic'] == 1, 'headline_clean'])
wc = WordCloud(max_words=2000, width=1600, height=800).generate(sarcastic_text)
plt.imshow(wc, interpolation='bilinear')

# THE TOP 100 MOST FREQUENTLY OCCURRING WORDS IN THE NEWS HEADLINE DATA

In [None]:
# Flatten the per-headline token lists into one Series of words. Iterating over
# the column's values avoids the label-based lookup df['tokenized'][i], which
# only works while the frame has its default 0..n-1 index, and avoids reusing
# the loop variable `i` for both the row number and the token.
all_words = pd.Series([word for tokens in df['tokenized'] for word in tokens])

# The 100 most frequent tokens as a two-column frame for the treemap.
common_words = all_words.value_counts().head(100).rename_axis('Common Words').reset_index(name='count')

fig = px.treemap(common_words, path=['Common Words'], values='count', template="plotly_dark", title='100 Most Common Words In Headline')
fig.show()

In [None]:
# Features: the cleaned, stemmed headline text. Target: the 0/1 sarcasm label.
X, y = df['headline_clean'], df['is_sarcastic']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(len(features), len(labels))

# XGB CLASSIFIER

In [None]:
from sklearn import metrics

# Bag-of-words counts -> TF-IDF weighting -> XGBoost classifier.
pipeline_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc')),
]
pipe = Pipeline(pipeline_steps)

# Fit the pipeline on the training text
MODEL = pipe.fit(x_train, y_train)

# Predictions on both splits, to compare train and test accuracy.
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('Train: {}'.format(train_accuracy))
print('Test: {}'.format(test_accuracy))

In [None]:
# Confusion matrix of the XGBoost test predictions, labelled with class names.
class_labels = ['Not Sarcastic', 'Sarcastic']
cm = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred_class),
    index=class_labels,
    columns=class_labels,
)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, cmap="seismic_r", linecolor='black', linewidth=1, annot=True, fmt='',
            xticklabels=class_labels, yticklabels=class_labels)

# ***Model Building with keras and Tensorflow***

In [None]:
# Settings for the Keras text model.
vocab_size, embedding_dim = 3000, 16   # tokenizer vocabulary cap / embedding width
max_len = 500                          # padded sequence length
oov_tok = "<OOV>"
padding_type = trunc_type = "post"
training_size = 20000

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# The vocabulary is learned from the training text only. Passing oov_token maps
# words outside the kept vocabulary to a dedicated index instead of silently
# dropping them from the sequence (oov_tok was configured but never used).
tok = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tok.fit_on_texts(x_train)

# Integer-encode the training headlines and pad them all to max_len.
sequences = tok.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
# Small feed-forward text classifier: embedding -> average pooling -> dense head.
from tensorflow import keras
from keras import layers
import tensorflow as tf

model = keras.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.3))
model.add(layers.Dense(16, activation="relu"))
# Single sigmoid unit: the output is used with binary cross-entropy below.
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()
keras.utils.plot_model(model)

In [None]:
# Callbacks driven by validation loss: early stopping and learning-rate reduction.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
# NOTE(review): both callbacks use patience=2, so the learning-rate drop looks
# like it can only fire on the epoch where training stops -- confirm intended.
rlrp = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.01,
    patience=2,
    min_lr=0.00001,
)

In [None]:
# Mini-batch size and the upper bound on training epochs.
batch_size, epochs = 16, 20

In [None]:
# The network consumes padded integer sequences, so the held-out text has to go
# through the same fitted tokenizer and the same padding as the training text
# before it can be used for validation (raw strings cannot be fed to the model).
test_sequences_matrix = sequence.pad_sequences(tok.texts_to_sequences(x_test), maxlen=max_len)

# validation_split is not passed: Keras ignores it whenever validation_data is
# supplied, so keeping both was misleading.
# NOTE(review): the test split doubles as the validation set for early stopping
# here -- confirm that is acceptable for the reported metrics.
history_model = model.fit(
    x=sequences_matrix,
    y=y_train,
    validation_data=(test_sequences_matrix, y_test),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    verbose=1,
    callbacks=[early_stopping, rlrp]
)

In [None]:
# Training-curve helper: one chart for the losses, one for the accuracies.
def plot_loss_nd_accuracy(history):
    """Plot loss/val_loss and accuracy/val_accuracy from a history mapping.

    `history` is turned into a DataFrame, so it must provide the four metric
    names above as columns.
    """
    history_df = pd.DataFrame(history)
    for metric_pair in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
        history_df.loc[0:, metric_pair].plot()

In [None]:
# Draw the loss and accuracy curves recorded while fitting the Keras model.
plot_loss_nd_accuracy(history_model.history)

# **--------------------Up-Vote-----------------------**