# Sarcasm Detector

## Get and Load Data

In [1]:
# Setup step, disabled by default: uncomment to (re)install gdown, which the
# download cell below uses to fetch the dataset.
# !pip install --upgrade --no-cache-dir gdown

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


In [2]:
# Disabled by default: uncomment to download SarcasmDetect.json from Google Drive.
# NOTE(review): newer gdown releases deprecate the `--id` flag in favour of a
# positional file id -- confirm against the installed gdown version.
# !gdown --id 1ytPDo88FEC2ArOjdqErAiarAZBNJzEJz

Downloading...
From: https://drive.google.com/uc?id=1ytPDo88FEC2ArOjdqErAiarAZBNJzEJz
To: /content/SarcasmDetect.json
100% 6.06M/6.06M [00:00<00:00, 35.3MB/s]


In [63]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata

def strip_html_tags(text):
  """Return the visible text of an HTML fragment.

  <iframe> and <script> elements are removed together with their contents
  before the text is extracted, and every run of carriage returns / line
  feeds in the result is collapsed into a single newline.

  Args:
    text: Markup (or plain text) to clean.

  Returns:
    The extracted text as a str.
  """
  soup = BeautifulSoup(text, "html.parser")
  # Drop embedded frames and scripts so their contents never reach get_text().
  for tag in soup(['iframe', 'script']):
    tag.extract()
  stripped_text = soup.get_text()
  # Character class of CR/LF only. The earlier pattern r'[\r|\n|\r\n]+' also
  # listed '|' inside the brackets, so literal pipe characters matched and
  # were silently turned into newlines.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  """Reduce text to plain ASCII.

  The string is NFKD-normalised, which splits accented letters into a base
  letter plus combining marks; every character that cannot be encoded as
  ASCII (the marks and any other non-ASCII character) is then dropped.

  Args:
    text: The string to convert.

  Returns:
    The ASCII-only version of text.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(doc):
  """Normalise one raw headline into a cleaned, lower-cased string.

  Steps, in order:
    1. strip HTML markup (strip_html_tags);
    2. turn newlines, tabs and carriage returns into spaces;
    3. lower-case;
    4. drop accents / non-ASCII characters (remove_accented_chars);
    5. expand contractions (e.g. we'll => we will);
    6. remove double quotes, apostrophes, colons and commas, turn hyphens
       into spaces, and collapse whitespace runs to a single space;
    7. trim leading and trailing whitespace.

  Other punctuation (for example periods) is left in place.

  Args:
    doc: The raw headline text.

  Returns:
    The cleaned headline.
  """
  cleaned = strip_html_tags(doc)
  cleaned = cleaned.translate(str.maketrans("\n\t\r", "   "))
  cleaned = remove_accented_chars(cleaned.lower())
  cleaned = contractions.fix(cleaned)

  # (pattern, replacement) pairs, applied strictly in this order.
  substitutions = (
    (' +', ' '),    # squeeze repeated spaces
    ('"', ''),      # drop double quotes
    ('\'', ''),     # drop apostrophes / single quotes
    ('-', ' '),     # hyphenated words become separate words
    (':', ''),      # drop colons
    (',', ''),      # drop commas
    (r"\s+", " "),  # final whitespace squeeze after the removals above
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  return cleaned.strip()

In [64]:
import pandas as pd

# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json('./SarcasmDetect.json', lines=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


## Remove all records with no headline text

In [65]:
# Keep only rows that actually carry headline text. A plain `!= ''` comparison
# lets missing values (NaN) and whitespace-only strings through, so test for
# both explicitly; missing values would otherwise break the string cleaning
# applied later.
df = df[
    df['headline'].notna()
    & df['headline'].astype(str).str.strip().ne('')
]
# Summarise the remaining rows, columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 894.3+ KB


In [66]:
# The article URL column is discarded here; only the label and the headline
# text are kept.
df = df.drop("article_link", axis="columns")

In [67]:
# Class balance check: number of headlines per is_sarcastic label.
df["is_sarcastic"].value_counts()

0    14985
1    13634
Name: is_sarcastic, dtype: int64

The two classes are roughly balanced: 14,985 non-sarcastic (0) vs. 13,634 sarcastic (1) headlines.

In [68]:
# Preview 20 random rows; the fixed random_state makes the selection
# repeatable, so the same rows can be inspected again after cleaning.
df.sample(20, random_state=0)

Unnamed: 0,is_sarcastic,headline
151,1,exasperated huckabee sanders reminds press cor...
17228,0,if only all tampon ads were this honest
4725,1,moviegoer manages to sneak candy past teenage ...
4258,0,noaa predicts we'll see more hurricanes this y...
12250,1,new lawn-care product makes neighbor's lawn le...
12447,1,pence relaxes onstage by imagining entire deba...
25787,0,robert e. lee was not an 'honorable man.' he w...
2080,0,mark halperin says he is 'profoundly sorry' af...
14509,1,white castle bathroom stall celebrates 5th con...
26498,1,asshole even shoots pool like an asshole


In [9]:
# Disabled by default: uncomment to install the `contractions` package.
# NOTE(review): `contractions` is imported by the preprocessing cell near the
# top of the notebook, so on a fresh environment this install has to be run
# before that cell.
# !pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [69]:
# Run every headline through pre_process_corpus and store the result next to
# the raw text so the two can be compared side by side.
df["headline_clean"] = df["headline"].map(pre_process_corpus)

  soup = BeautifulSoup(text, "html.parser")


In [70]:
# Show the frame (pandas truncates the rendering) to compare headline with
# headline_clean.
df

Unnamed: 0,is_sarcastic,headline,headline_clean
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes,eat your veggies 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close to using word stream...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...,mars probe destroyed by orbiting spielberg gat...


In [71]:
# Same fixed-seed sample as before the cleaning step, now including the
# headline_clean column for a side-by-side check.
df.sample(20, random_state=0)

Unnamed: 0,is_sarcastic,headline,headline_clean
151,1,exasperated huckabee sanders reminds press cor...,exasperated huckabee sanders reminds press cor...
17228,0,if only all tampon ads were this honest,if only all tampon ads were this honest
4725,1,moviegoer manages to sneak candy past teenage ...,moviegoer manages to sneak candy past teenage ...
4258,0,noaa predicts we'll see more hurricanes this y...,noaa predicts we will see more hurricanes this...
12250,1,new lawn-care product makes neighbor's lawn le...,new lawn care product makes neighbors lawn les...
12447,1,pence relaxes onstage by imagining entire deba...,pence relaxes onstage by imagining entire deba...
25787,0,robert e. lee was not an 'honorable man.' he w...,robert e. lee was not an honorable man. he was...
2080,0,mark halperin says he is 'profoundly sorry' af...,mark halperin says he is profoundly sorry afte...
14509,1,white castle bathroom stall celebrates 5th con...,white castle bathroom stall celebrates 5th con...
26498,1,asshole even shoots pool like an asshole,asshole even shoots pool like an asshole


In [72]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence of each.
df = df[~df.duplicated()]

In [73]:
# Show the frame again after the duplicate rows were dropped.
df

Unnamed: 0,is_sarcastic,headline,headline_clean
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes,eat your veggies 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close to using word stream...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...,mars probe destroyed by orbiting spielberg gat...


In [76]:
# Save the cleaned frame to CSV in the working directory; index=False leaves
# the row index out of the file.
df.to_csv("sarcasm_data_clean_v1.csv", index=False)