# Data Cleaning

### Cleaning the quotes

Replace special quotes with normal ones and remove single quotes to prevent parsing errors.

In [1]:
from src.utils.clean_dataset import cleanup_dataset
import pandas as pd
import string

In [2]:
# Run the project's quote clean-up helper on the raw train CSV.
# Presumably reads the first path and writes the cleaned result to the second —
# confirm in src.utils.clean_dataset.
cleanup_dataset("data/train.csv", "data/train_cleaned.csv")

In [3]:
# Run the project's quote clean-up helper on the raw dev CSV.
# Presumably reads the first path and writes the cleaned result to the second —
# confirm in src.utils.clean_dataset.
cleanup_dataset("data/dev.csv", "data/dev_cleaned.csv")

### Cleaning the labels

#### Train dataset

In [4]:
# header=None: every line of the file is read as data and the column labels
# come from `names`.
# NOTE(review): the dev file is loaded with header=0 instead — confirm that the
# two cleaned files really differ in whether they start with a header row.
train = pd.read_csv(
    "data/train_cleaned.csv",
    header=None,
    names=[
        "image_name",
        "Image_URL",
        "OCR_extracted_text",
        "Corrected_text",
        "Humour",
        "Sarcasm",
        "Offense",
        "Motivation",
        "Overall_sentiment",
        "Basis_of_classification",
    ],
)

In [5]:
def text_cleaning(row):
    """Return a normalised copy of the row's ``Corrected_text`` value.

    Every character listed in ``string.punctuation`` is removed, the text is
    lower-cased, and runs of whitespace are collapsed to a single space.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with ``"Corrected_text"`` (for example a row passed
        by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when the value is missing (``None``,
        NaN, ``pd.NA``) or falsy (such as an empty string).
    """
    x = row["Corrected_text"]
    # pd.isna handles None, float NaN and pd.NA without needing the `math`
    # module, which is not imported in this notebook's import cell. It is
    # tested first because `not pd.NA` would raise.
    if pd.isna(x) or not x:
        return None
    without_punctuation = x.translate(str.maketrans('', '', string.punctuation))
    # split() with no argument drops leading/trailing whitespace and treats any
    # whitespace run as a single separator.
    return " ".join(without_punctuation.lower().split())

In [6]:
train.Humour.unique()

array(['hilarious', 'not_funny', 'very_funny', 'funny'], dtype=object)

In [7]:
train.Sarcasm.unique()

array(['general', 'not_sarcastic', 'twisted_meaning', 'very_twisted'],
      dtype=object)

In [8]:
train.Offense.unique()

array(['not_offensive', 'very_offensive', 'slight', 'hateful_offensive'],
      dtype=object)

In [9]:
train.Motivation.unique()

array(['not_motivational', 'motivational'], dtype=object)

In [10]:
train.Overall_sentiment.unique()

array(['very_positive', 'positive', 'neutral', 'negative',
       'very_negative',
       'positivechandler_Friday-Mood-AF.-meme-Friends-ChandlerBing.jpg'],
      dtype=object)

In [11]:
train.loc[train['Overall_sentiment'] == "positivechandler_Friday-Mood-AF.-meme-Friends-ChandlerBing.jpg"]

Unnamed: 0,image_name,Image_URL,OCR_extracted_text,Corrected_text,Humour,Sarcasm,Offense,Motivation,Overall_sentiment,Basis_of_classification
726,chandler_f50efbd3af8d0a93a2ecdead0dc5044a.jpg,https://i.pinimg.com/originals/f5/0e/fb/f50efb...,Chandler Trolling on someone else's account BE...,Chandler Trolling on someone else's account BE...,funny,not_sarcastic,not_offensive,not_motivational,positivechandler_Friday-Mood-AF.-meme-Friends-...,


In [12]:
# One train row carries a sentiment label with an image file name glued onto it;
# map that exact value back to "positive".
# The replacement is limited to the Overall_sentiment column so that no other
# column can be altered by it.
# NOTE(review): the glued file name suggests two CSV records were merged into
# one, in which case the second meme's row is missing from `train` — confirm
# against data/train.csv.
train = train.assign(
    Overall_sentiment=train["Overall_sentiment"].replace(
        "positivechandler_Friday-Mood-AF.-meme-Friends-ChandlerBing.jpg",
        "positive",
    )
)

In [13]:
train.Overall_sentiment.unique()

array(['very_positive', 'positive', 'neutral', 'negative',
       'very_negative'], dtype=object)

In [14]:
train.Basis_of_classification.unique()

array([nan])

#### Dev dataset

In [15]:
# header=0 together with `names`: the first line of the file is treated as a
# header row and its labels are replaced by the ones listed here.
# NOTE(review): the train file is loaded with header=None instead — confirm that
# the two cleaned files really differ in whether they start with a header row.
dev = pd.read_csv(
    "data/dev_cleaned.csv",
    header=0,
    names=[
        "image_name",
        "Image_URL",
        "OCR_extracted_text",
        "Corrected_text",
        "Humour",
        "Sarcasm",
        "Offense",
        "Motivation",
        "Overall_sentiment",
        "Basis_of_classification",
    ],
)

In [16]:
dev.Humour.unique()

array(['very_funny', 'funny', 'not_funny', 'hilarious'], dtype=object)

In [17]:
dev.Sarcasm.unique()

array(['general', 'not_sarcastic', 'twisted_meaning', 'very_twisted'],
      dtype=object)

In [18]:
dev.Offense.unique()

array(['hateful_offensive', 'not_offensive', 'slight', 'very_offensive'],
      dtype=object)

In [19]:
dev.Motivation.unique()

array(['not_motivational', 'motivational'], dtype=object)

In [20]:
dev.Overall_sentiment.unique()

array(['very_positive', 'negative', 'neutral', 'positive',
       'very_negative'], dtype=object)

In [21]:
dev.Basis_of_classification.unique()

array(['image_and_text ', 'image', 'text', 'image_and_text  '],
      dtype=object)

### Checking for missing values

#### Train dataset

In [22]:
# Leave Basis_of_classification and OCR_extracted_text out of the check, then
# show every train row that has a null in any of the remaining columns.
to_check = train.drop(columns=["Basis_of_classification", "OCR_extracted_text"])
to_check.loc[to_check.isnull().any(axis=1)]

Unnamed: 0,image_name,Image_URL,Corrected_text,Humour,Sarcasm,Offense,Motivation,Overall_sentiment
4214,trump_1ciwua.jpg,https://i.imgflip.com/1ciwua.jpg,,very_funny,general,slight,motivational,positive
4230,trump_85486890.jpg,https://cdn.ebaumsworld.com/mediaFiles/picture...,,very_funny,general,slight,not_motivational,neutral
4231,trump_1486350110-meme-5.png,https://sportsdaydfw.imgix.net/1486350110-meme...,,funny,twisted_meaning,very_offensive,motivational,negative
4261,trump_qeqrech7dx3z.jpg,https://i.redd.it/qeqrech7dx3z.jpg,,not_funny,general,slight,motivational,very_positive
4807,trump_d88.jpg,https://i.kym-cdn.com/photos/images/facebook/0...,,very_funny,general,slight,motivational,neutral
5285,minion_itm-about-as-ok-with-libs-fuckin-minion...,https://pics.conservativememes.com/itm-about-a...,,very_funny,general,slight,motivational,neutral
6789,trump_1485530548-donald-trump-and-hillary-clin...,https://im.indiatimes.in/content/itimes/photo/...,,very_funny,twisted_meaning,not_offensive,not_motivational,positive
6792,trump_amusing-memes.jpg,http://worldwideinterweb.com/wp-content/upload...,,hilarious,general,not_offensive,not_motivational,positive
6794,trump_clinton-vs-trump-memes.jpg,http://worldwideinterweb.com/wp-content/upload...,,not_funny,not_sarcastic,very_offensive,motivational,positive


In [23]:
train[train["Corrected_text"].isnull()]

Unnamed: 0,image_name,Image_URL,OCR_extracted_text,Corrected_text,Humour,Sarcasm,Offense,Motivation,Overall_sentiment,Basis_of_classification
4214,trump_1ciwua.jpg,https://i.imgflip.com/1ciwua.jpg,,,very_funny,general,slight,motivational,positive,
4230,trump_85486890.jpg,https://cdn.ebaumsworld.com/mediaFiles/picture...,,,very_funny,general,slight,not_motivational,neutral,
4231,trump_1486350110-meme-5.png,https://sportsdaydfw.imgix.net/1486350110-meme...,,,funny,twisted_meaning,very_offensive,motivational,negative,
4261,trump_qeqrech7dx3z.jpg,https://i.redd.it/qeqrech7dx3z.jpg,,,not_funny,general,slight,motivational,very_positive,
4807,trump_d88.jpg,https://i.kym-cdn.com/photos/images/facebook/0...,,,very_funny,general,slight,motivational,neutral,
5285,minion_itm-about-as-ok-with-libs-fuckin-minion...,https://pics.conservativememes.com/itm-about-a...,,,very_funny,general,slight,motivational,neutral,
6789,trump_1485530548-donald-trump-and-hillary-clin...,https://im.indiatimes.in/content/itimes/photo/...,,,very_funny,twisted_meaning,not_offensive,not_motivational,positive,
6792,trump_amusing-memes.jpg,http://worldwideinterweb.com/wp-content/upload...,,,hilarious,general,not_offensive,not_motivational,positive,
6794,trump_clinton-vs-trump-memes.jpg,http://worldwideinterweb.com/wp-content/upload...,,,not_funny,not_sarcastic,very_offensive,motivational,positive,


In [24]:
# Hand-written Corrected_text values, keyed by the train row index label they
# are assigned to.
manual_corrected_text = {
    4214: "It's rigged ! it's rigged ! it's a left wing-- no, a right wing-- no, a media conspiracy",
    4230: "Stage #1 - Action complete Stage #2 make it stick!",
    4231: "Breaking: Trump signs executive order giving Tom Brady and the new england patriots 41 points",
    4261: "When you login to your neighbors router using 1234",
    4807: "I'm gonna build some fancy walls even though I have millions of extra dollars in gonna make the mexicans pay for it",
    5285: "I'm about as OK with libs burnin' the flag as I am with how much it burns when I piss Fukin minion memes. ~L.",
    6789: "If Donald and Hillary are together on a boat in the middle of the ocean and it sinks. who survives ? AMERICA",
    6792: "Bruh why this tub of margarine look like Donald Trump ?",
    6794: "2016 election Trump vs Hillary still a better love story than Twilight",
}
# Dicts preserve insertion order, so labels and texts stay aligned.
train.loc[list(manual_corrected_text), "Corrected_text"] = list(manual_corrected_text.values())

In [25]:
# Save the train frame as it stands at this point; index=False keeps the row
# index out of the written CSV.
train.to_csv("data/train_cleaned_missing.csv", index=False)

#### Dev dataset

In [26]:
def replace_missing_text(x):
    """Return the row's corrected text, falling back to the OCR text if blank.

    Parameters
    ----------
    x : mapping-like
        Anything indexable with ``"Corrected_text"`` and
        ``"OCR_extracted_text"`` (for example a row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    object
        ``x["OCR_extracted_text"]`` when ``Corrected_text`` is missing
        (``None``, NaN, ``pd.NA``) or is a string made only of whitespace
        (including the empty string); otherwise ``x["Corrected_text"]``
        unchanged.
    """
    corrected = x["Corrected_text"]
    # A single space is the placeholder this check was written for; NaN, empty
    # and other whitespace-only values are treated as blank in the same way.
    is_blank = pd.isna(corrected) or (
        isinstance(corrected, str) and not corrected.strip()
    )
    if is_blank:
        return x["OCR_extracted_text"]
    return corrected

In [27]:
# Row-wise pass (axis=1): each row's Corrected_text becomes whatever
# replace_missing_text returns for that row.
dev["Corrected_text"] = dev.apply(replace_missing_text, axis=1)

In [28]:
# Leave Basis_of_classification and OCR_extracted_text out of the check, then
# show every dev row that has a null in any of the remaining columns.
to_check = dev.drop(columns=["Basis_of_classification", "OCR_extracted_text"])
to_check.loc[to_check.isnull().any(axis=1)]

Unnamed: 0,image_name,Image_URL,Corrected_text,Humour,Sarcasm,Offense,Motivation,Overall_sentiment


### Check for duplicates

#### Train dataset

In [29]:
train[train.duplicated()]

Unnamed: 0,image_name,Image_URL,OCR_extracted_text,Corrected_text,Humour,Sarcasm,Offense,Motivation,Overall_sentiment,Basis_of_classification
1971,hillary_c3fd01300e5bee2ba12a45ee2f160ed11b8926...,http://www.quickmeme.com/img/c3/c3fd01300e5bee...,WANTS TO BAN VIOLENT VIDEO GAMES DOESN'T TRY T...,WANTS TO BAN VIOLENT VIDEO GAMES DOESN'T TRY T...,very_funny,general,slight,motivational,positive,


In [30]:
# Keep the first occurrence of each fully identical row and drop the repeats.
# The result is bound back to `train` rather than mutating the frame with
# inplace=True, so the step reads as an explicit reassignment.
train = train.drop_duplicates(keep="first")

In [31]:
# Save the train frame as it stands at this point; index=False keeps the row
# index out of the written CSV.
train.to_csv("data/train_cleaned_final.csv", index=False)

#### Dev dataset

In [32]:
dev[dev.duplicated()]

Unnamed: 0,image_name,Image_URL,OCR_extracted_text,Corrected_text,Humour,Sarcasm,Offense,Motivation,Overall_sentiment,Basis_of_classification


In [33]:
# Save the dev frame as it stands at this point; index=False keeps the row
# index out of the written CSV.
dev.to_csv("data/dev_cleaned_final.csv", index=False)