# Data Preparation

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import contractions
import emoji
import pandas
import re
import string

# Read Data

In [2]:
data = pandas.read_csv("../data/train-balanced-sarcasm.csv", index_col=False)

In [3]:
display(data)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


# Process data

### Drop unused columns

In [4]:
data.drop(columns=["author", "subreddit", "score", "ups", "downs", "date", "created_utc"], inplace=True)

### Clean & Reformat Data

In [5]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    
    text = contractions.fix(str(text))
    text = str(text).lower()
    
    text = text.replace('.', ' PERIOD ')
    text = text.replace(',', ' COMMA ')
    text = text.replace('"', ' QUOTATIONMARK ')
    text = text.replace(';', ' SEMICOLON ')
    text = text.replace('!', ' EXCLAMATIONMARK ')
    text = text.replace('?', ' QUESTION_MARK ')
    text = text.replace('(', ' LEFTPAREN ')
    text = text.replace(')', ' RIGHT_PAREN ')
    text = text.replace('--', ' HYPHENS ')
    text = text.replace('?', ' QUESTIONMARK ')
    text = text.replace('\n', ' NEWLINE ')
    text = text.replace(':', ' COLON ')
    
    text = emoji.demojize(text)
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokens = word_tokenize(text)
    text = " ".join([token for token in tokens if token not in stop_words])
    
    # Drop row if the comment is empty after cleaning
    if text.strip() == "":
        return None

    return text

In [6]:
data['comment'] = data['comment'].apply(clean_text)
print("[COMMENTS CLEANED]")

data['parent_comment'] = data['parent_comment'].apply(clean_text)
print("[PARENT COMMENTS CLEANED]")

[COMMENTS CLEANED]
[PARENT COMMENTS CLEANED]


### Drop NA Rows

In [7]:
data = data.dropna(subset=['label'])
data = data.dropna(subset=['comment'])
data = data.dropna(subset=['parent_comment'])

data = data.reset_index(drop=True)

# Result

In [8]:
print(display(data))

Unnamed: 0,label,comment,parent_comment
0,0,nc nh PERIOD,yeah COMMA get argument PERIOD point COMMA wou...
1,0,know west teams play west teams east teams rig...,blazers mavericks LEFTPAREN wests 5 6 seed RIG...
2,0,underdogs earlier today COMMA since gronks ann...,favored win PERIOD
3,0,meme funny none QUOTATIONMARK new york nigga Q...,deadass kill buzz
4,0,could use one tools PERIOD,yep confirm saw tool use PERIOD made boy easpo...
...,...,...,...
1006661,1,sure iran n PERIOD korea technology create pig...,one calling engineered pathogen COMMA reports ...
1006662,1,whatever COMMA vote green EXCLAMATIONMARK,move typical recent donothing approach co2 emi...
1006663,1,perhaps atheist conspiracy make christians loo...,screw disabled HYPHENS got get church time
1006664,1,slavs got country called kosovo,always unsettled PERIOD hear lot jewish people...


None


In [9]:
print(data["label"].value_counts())

1    504230
0    502436
Name: label, dtype: int64


# Save 

In [11]:
data.to_csv("../data/processed_sarcasm-dataset.csv", index=False)