# Importing Required Libraries

In [15]:
import pandas as pd
import numpy as np,re
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

# Loading the Dataset

In [16]:
# Source file is in JSON Lines format (one JSON record per line), hence lines=True.
URL = 'https://raw.githubusercontent.com/bhavya-jain99/NLP/main/Sarcasm_Headlines_Dataset.json'
df = pd.read_json(path_or_buf=URL, lines=True)

# Display the loaded frame (pandas truncates the middle rows automatically).
df

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


# Checking Null Values

In [17]:
# Count missing values per column; a non-zero entry would need handling before cleaning.
df.isna().sum()

article_link    0
headline        0
is_sarcastic    0
dtype: int64

# Insights About the Data

In [18]:
# Summary statistics for the numeric columns. In the recorded output only
# `is_sarcastic` appears; its mean (~0.44) is the fraction of headlines labelled 1.
df.describe()

Unnamed: 0,is_sarcastic
count,26709.0
mean,0.438953
std,0.496269
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Cleaning the Data

In [19]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# apostrophes, ...) with a single space so that only alphabetic tokens remain.
# The vectorised .str accessor replaces the per-row Python lambda and passes
# missing values through instead of raising TypeError.
# regex=True is required: pandas treats the pattern as a literal string by default.
df['headline'] = df['headline'].str.replace('[^a-zA-Z]', ' ', regex=True)

# Getting features and labels

In [20]:
# The article link is not used as a predictor, so only the headline text
# (input) and the sarcasm flag (target) are kept.
feature, label = df['headline'], df['is_sarcastic']

# Stemming Using the Porter Stemmer

In [21]:
# Stemming reduces each word to its word stem.
PS = PorterStemmer()

# Tokenise each headline on whitespace (this also drops the runs of spaces left
# by the cleaning step), stem every token, and re-join into one string.
feature = feature.apply(
    lambda headline: ' '.join(PS.stem(token) for token in headline.split())
)

# Vectorization of features using TF-IDF Vectorizer

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only the 5000 highest-ranked terms to bound the width of the feature matrix.
tfidf = TfidfVectorizer(max_features=5000)

# NOTE(review): the vectorizer is fitted on *all* headlines here, before the
# train/test split in the next section, so the vocabulary and IDF weights are
# learned from rows that later end up in the test set. Consider splitting first
# and fitting on the training rows only.
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array.
feature = tfidf.fit_transform(list(feature)).toarray()

# Training and Testing data

In [23]:
# Hold out 5% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    feature,
    label,
    test_size=0.05,
    random_state=0,
)

# Model 1: Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default hyper-parameters.
lr = LogisticRegression().fit(train_x, train_y)

# Accuracy on the training split, then on the held-out test split.
print(lr.score(train_x, train_y))
print(lr.score(test_x, test_y))

0.8816458440074094
0.8308383233532934


# Model 2: Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 trees; random_state pins the randomness so results are repeatable.
rfc = RandomForestClassifier(n_estimators=10, random_state=0).fit(train_x, train_y)

# Accuracy on the training split, then on the held-out test split.
print(rfc.score(train_x, train_y))
print(rfc.score(test_x, test_y))

0.9882946439128207
0.7971556886227545


# Model 3: Gaussian Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on the dense TF-IDF matrix.
gnb = GaussianNB().fit(train_x, train_y)

# Accuracy on the training split, then on the held-out test split.
print(gnb.score(train_x, train_y))
print(gnb.score(test_x, test_y))

0.7886335868836952
0.7380239520958084


# Test accuracy (%) of each model:
Logistic Regression: 83.08 
---
Random Forest Classifier: 79.72
---
Gaussian Naive Bayes: 73.80
---

## Logistic Regression has the highest test accuracy