# Data Preparation

In [1]:
from nltk.tokenize import word_tokenize

import contractions
import emoji
import pandas
import re
import string

# Read Data

In [2]:
# Load the raw sarcasm training set; index_col=False keeps pandas from
# promoting the first CSV column to the index.
data = pandas.read_csv(
    "../data/train-balanced-sarcasm.csv",
    index_col=False,
)

In [3]:
# Discard every row that has a missing value, then pin the dtypes of the
# columns that are kept later on. `data` is rebound to the result instead of
# being mutated in place, so the whole step is a single chained expression.
data = data.dropna().astype({
    'label': 'int64',
    'comment': 'string',
    'parent_comment': 'string',
})

In [4]:
# Preview the loaded frame; the notebook renders a truncated head/tail view.
display(data)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


# Process Data

### Drop unused columns

In [5]:
# Keep only label, comment and parent_comment. The result is rebound rather
# than dropped in place, and errors="ignore" skips columns that are already
# gone, so re-running this cell does not raise a KeyError.
data = data.drop(
    columns=["author", "subreddit", "score", "ups", "downs", "date", "created_utc"],
    errors="ignore",
)

In [6]:
# Show the remaining columns together with their dtypes.
print(data.dtypes)

label              int64
comment           string
parent_comment    string
dtype: object


### Clean & Reformat Data

In [7]:
# Maps emoticons and punctuation characters to space-padded, upper-case
# placeholder words. Iteration order matters to callers that substitute key
# by key: the two-character emoticons are listed first so they are handled
# before the single characters they are made of (':', '(', ')', '/').
punc_inplace = {
    # Emoticons
    ':)' : ' SMILEY ',
    ':D' : ' GRINNINGFACE ',
    ':(' : ' SADFACE ',
    ':/' : ' CONFUSEDFACE ',
    # Whitespace and punctuation
    '\n' : ' NEWLINE ',
    '.' : ' PERIOD ',
    ',' : ' COMMA ',
    '"' : ' QUOTATIONMARK ',
    ';' : ' SEMICOLON ',
    '!' : ' EXCLAMATIONMARK ',
    # '?' used to be listed twice; only the later value (' QUESTIONMARK ')
    # ever took effect, so that is the single entry kept here.
    '?' : ' QUESTIONMARK ',
    '(' : ' LEFTPAREN ',
    # NOTE(review): unlike the other placeholders this one contains an
    # underscore; left unchanged so the emitted tokens stay the same.
    ')' : ' RIGHT_PAREN ',
    '/' : ' SLASH ',
    '-' : ' MINUSSIGN ',
    ':' : ' COLON ',
    '#' : ' HASHTAG ',
    '%' : ' PERCENT ',
    '&' : ' AMPERSAND ',
    '=' : ' EQUALS ',
    '$' : ' DOLLARSIGN '
}

In [8]:
def _substitute_punctuation(text):
    """Lowercase ``text`` and swap every ``punc_inplace`` key for its placeholder.

    Keys are matched against the original-case text, so case-sensitive
    emoticons such as ``:D`` are recognised; only the spans between matches
    are lowercased, which leaves the upper-case placeholders untouched.
    """
    # Longest keys first (stable sort keeps dict order among equal lengths),
    # so a two-character emoticon wins over the single characters inside it.
    keys = sorted(punc_inplace, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(text[cursor:match.start()].lower())
        pieces.append(punc_inplace[match.group()])
        cursor = match.end()
    pieces.append(text[cursor:].lower())
    return "".join(pieces)


def clean_text(text, max_length=500):
    """Normalise one comment string, or return None if it should be dropped.

    Steps, in order: expand contractions, lowercase the text while replacing
    the emoticons/punctuation listed in ``punc_inplace`` with their
    placeholder words, turn emoji into their text names, strip any remaining
    non-word characters, and re-join the NLTK tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw comment text. Missing values (None / NA) are accepted.
    max_length : int, default 500
        Maximum allowed length in characters, checked both before and after
        cleaning.

    Returns
    -------
    str or None
        The cleaned text, or None when the input is missing, longer than
        ``max_length`` before or after cleaning, or empty after cleaning.
    """
    # A missing value cannot be cleaned; signal it like any other dropped row.
    if pandas.isna(text):
        return None

    # Drop if text is too long
    if len(text) > max_length:
        return None

    text = contractions.fix(text)
    text = _substitute_punctuation(text)

    text = emoji.demojize(text)

    # Remove whatever punctuation is left; underscores survive because \w
    # matches them.
    text = re.sub(r'[^\w\s]', '', text)

    text = " ".join(word_tokenize(text))

    # Drop row if the comment is empty or too long after cleaning
    if not text.strip() or len(text) > max_length:
        return None

    return text

In [9]:
# Clean both text columns with clean_text, announcing each one as it finishes.
for column, banner in (
    ('comment', "[COMMENTS CLEANED]"),
    ('parent_comment', "[PARENT COMMENTS CLEANED]"),
):
    data[column] = data[column].apply(clean_text)
    print(banner)

[COMMENTS CLEANED]
[PARENT COMMENTS CLEANED]


### Drop NA Rows

In [10]:
# Remove any row that is missing its label, comment or parent comment, then
# renumber the index from zero without keeping the old one.
data = (
    data
    .dropna(subset=['label', 'comment', 'parent_comment'])
    .reset_index(drop=True)
)

# Result

In [11]:
# display() renders the frame itself and returns None, so it is called
# directly; printing its return value would only add a stray "None".
display(data)

Unnamed: 0,label,comment,parent_comment
0,0,nc and nh PERIOD,yeah COMMA i get that argument PERIOD at this ...
1,0,you do know west teams play against west teams...,the blazers and mavericks LEFTPAREN the wests ...
2,0,they were underdogs earlier today COMMA but si...,they are favored to win PERIOD
3,0,this meme is not funny none of the QUOTATIONMA...,deadass do not kill my buzz
4,0,i could use one of those tools PERIOD,yep can confirm i saw the tool they use for th...
...,...,...,...
959545,1,i am sure that iran and n PERIOD korea have th...,no one is calling this an engineered pathogen ...
959546,1,whatever you do COMMA do not vote green EXCLAM...,in a move typical of their recent do MINUSSIGN...
959547,1,perhaps this is an atheist conspiracy to make ...,screw the disabled MINUSSIGN MINUSSIGN i have ...
959548,1,the slavs got their own country MINUSSIGN it i...,i have always been unsettled by that PERIOD i ...


None


In [12]:
# Count how many rows remain for each label value after cleaning.
print(data["label"].value_counts())

1    481037
0    478513
Name: label, dtype: int64


# Save

In [13]:
# Persist the cleaned frame; index=False leaves the row index out of the file.
data.to_csv(
    "../data/processed_sarcasm-dataset.csv",
    index=False,
)