## Load data

In [None]:
import pandas as pd
# Load the training CSV (read relative to the notebook's working directory) into `df`,
# the DataFrame every later cell in this notebook reads and updates.
df = pd.read_csv('train_E6oV3lV.csv')

In [None]:
# Decode HTML character references (e.g. &amp;, &#128553;) in the original tweets
import html
def convert_to_unicode(text):
    """Return ``text`` with HTML character references decoded.

    Named and numeric references such as ``&amp;`` or ``&#128553;`` are replaced
    by the characters they stand for, using ``html.unescape``.

    Args:
        text: The string to decode.

    Returns:
        The decoded string.
    """
    decoded = html.unescape(text)
    return decoded

# Decode the `tweet` column in place; the still-encoded text is not kept in `df` afterwards.
df['tweet'] = df['tweet'].apply(convert_to_unicode)

In [None]:
# Start the working column from `tweet`; the preprocessing cells below rewrite only
# `cleaned_tweet`, so `tweet` stays available for comparison.
df['cleaned_tweet'] = df['tweet']
df.head()

# Preprocessing

## Remove URLs

In [None]:
# Remove urls
import re
def remove_urls(text):
    """Return ``text`` with every ``http://`` or ``https://`` URL deleted.

    Only the URL itself is removed; whitespace around it is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The string without URLs.
    """
    # Raw string: the previous non-raw literal contained `\(`, an invalid string escape
    # (DeprecationWarning, SyntaxWarning on Python 3.12+). The pattern itself is unchanged.
    # `[$-_@.&+]` is a character *range* from `$` to `_`; that range is what lets `/`, `:`,
    # `?` and `=` inside a URL match.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_regex, '', text)
# Example: show the effect on one sample tweet before touching the DataFrame.
text = "Beautiful color combination of pink, orange, yellow & white. A Coll http://t.co/H0dYEBvnZB"
print("Example")
print("before: ", text)
print("after : ", remove_urls(text))

# Strip URLs from every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_urls)
# df.head(5) # tips: uncomment this line to visualize result

## Remove mentions (@User)

In [None]:
# Remove mentions
import re
def remove_mentions(text):
    """Return ``text`` with every ``@handle`` mention deleted.

    A mention is ``@`` followed by one or more word characters or hyphens.
    Punctuation after the handle (for example the ``:`` in ``@user:``) is kept,
    and a lone ``@`` is left alone.

    Args:
        text: The string to clean.

    Returns:
        The string without mentions.
    """
    # Raw string: `\w` and `\-` are invalid escapes in a normal string literal.
    mention_regex = r'@[\w\-]+'
    return re.sub(mention_regex, '', text)
# Example:
text = "RT @C_G_Anderson: @viva_based she look like a tranny"
# Print the sample tweet before and after mention removal.
print("Example")
print("before: ", text)
print("after : ", remove_mentions(text))

# Strip @mentions from every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_mentions)
# df.head(5) # tips: uncomment this line to visualize result

## Remove RT (retweet)

In [None]:
# Remove 'RT'
import re
def remove_rt(text):
    """Return ``text`` with the standalone retweet marker ``RT`` deleted.

    The match is case-sensitive and limited to ``RT`` as a whole token, so
    capitalised words that merely contain the letters (``SMART``, ``START``)
    are left intact.

    Args:
        text: The string to clean.

    Returns:
        The string without standalone ``RT`` tokens; surrounding whitespace
        and punctuation are kept.
    """
    # Word boundaries on both sides: a bare 'RT' pattern also cut the letters out of
    # longer upper-case words.
    return re.sub(r'\bRT\b', '', text)

# Drop the 'RT' marker from every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_rt)
# df.head(5) # tips: uncomment this line to visualize result

## Remove emoticons


In [None]:
!pip install emot

In [None]:
# Remove emoticons
import html
import emot # library to extract emojis and emoticons, installation: pip install emot
import re
def remove_emoticons(text):
    """Return ``text`` with the emoticons reported by ``emot`` replaced by spaces.

    The text is first passed through ``html.unescape``. If ``emot`` reports
    emoticons, each reported value is replaced by a single space and runs of
    whitespace are collapsed to one space. If it reports none, the unescaped
    text is returned as is, with its whitespace untouched.

    Args:
        text: The string to clean.

    Returns:
        The string without the detected emoticons.
    """
    text = html.unescape(text)  # decode HTML entities before detection
    # NOTE(review): relies on the module-level `emot.emoticons()` API returning a mapping
    # with a "value" list. Confirm against the installed emot version.
    emoticons = emot.emoticons(text)
    # Results without a "value" entry (or empty results) mean nothing to remove.
    if len(emoticons) > 0 and "value" in emoticons:
        for emoticon in emoticons['value']:
            text = text.replace(emoticon, " ")
            # Collapse inside the loop, as before, so later replacements see the
            # already-normalised text. Raw string avoids the invalid `\s` escape.
            text = re.sub(r'\s+', ' ', text)
    return text
# Example: one sample containing several emoticons and an HTML-encoded character.
text = "I love python :-):-(:/ yaya ho.. cute avi &#128553;"
text = html.unescape(text) # decode the HTML character reference so "before" prints the real character
print("Example")
print("before: ", text)
print("after : ", remove_emoticons(text))

# Remove emoticons from every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_emoticons)
# df.head(5) # tips: uncomment this line to visualize result

## Remove emojis

In [None]:
!pip install emoji

In [None]:
# Remove emojis
import html
import emoji
def remove_emojis(text):
    """Return ``text`` with all emoji characters deleted.

    The text is first passed through ``html.unescape`` so that emoji written as
    HTML character references (for example ``&#128553;``) are removed too.

    Args:
        text: The string to clean.

    Returns:
        The string without emoji.
    """
    text = html.unescape(text)
    # `emoji.get_emoji_regexp()` no longer exists in emoji 2.x; `replace_emoji` is its
    # replacement. Keep the old call as a fallback for installs that predate it.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(u'', text)
# Example: the trailing HTML character reference decodes to an emoji.
text = "I love python :-):-(:/ yaya ho.. cute avi tho RT @ViVaLa_Ari I had no idea she was sleep &#128553;"
text = html.unescape(text) # decode the HTML character reference so "before" prints the real character
print("Example")
print("before: ", text)
print("after : ", remove_emojis(text))

# Remove emojis from every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_emojis)
# df.head(5) # tips: uncomment this line to visualize result

## Convert informal contractions to formal writing
e.g. isn't -> is not

In [None]:
!pip install contractions

In [None]:
# Convert informal contraction to formal writing 
# e.g. isn't -> is not, mayn't -> may not, she'd -> she would, etc.
import contractions # library for deal with informal contractions, installation: pip install contractions
def convert_contraction(text):
    """Return ``text`` with contractions expanded by ``contractions.fix``.

    Args:
        text: The string to expand.

    Returns:
        Whatever ``contractions.fix`` returns for ``text``.
    """
    expanded = contractions.fix(text)
    return expanded
# Example: a mix of contractions and informal abbreviations.
text = "isn't mayn't she'd yall asap i'm!"
print("Example")
print("before: ", text)
print("after : ", convert_contraction(text))

# Expand contractions in every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(convert_contraction)
# df.head(5) # tips: uncomment this line to visualize result

## Lowercase letters, remove non-ASCII characters

In [None]:
# Lowercase the working column with the vectorised string accessor.
# This runs after the 'RT' removal above, which matches upper-case 'RT' only.
df['cleaned_tweet'] = df['cleaned_tweet'].str.lower()
# df.head(5) # tips: uncomment this line to visualize result

In [None]:
def clean_ascii(text):
    """Return ``text`` without any character whose code point is 128 or above.

    Args:
        text: The string to filter.

    Returns:
        The ASCII-only remainder, with the surviving characters in their
        original order.
    """
    kept = []
    for ch in text:
        if ord(ch) >= 128:
            continue  # non-ASCII character: drop it
        kept.append(ch)
    return ''.join(kept)
# Drop non-ASCII characters from every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(clean_ascii)

def add_space_between_hashtags(text):
    """Return ``text`` with a space inserted before every ``#``.

    Args:
        text: The string to process.

    Returns:
        The string in which each ``#`` is preceded by an added space, so
        hashtags written back to back become separate tokens.
    """
    pieces = text.split("#")
    return " #".join(pieces)
# Separate adjacent hashtags in every row; the extra spaces this introduces are
# collapsed by the whitespace step that follows.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(add_space_between_hashtags)

In [None]:
# Collapse every run of whitespace ('\s+') into a single space
import re
def remove_whitespaces(text):
    """Return ``text`` with every run of whitespace replaced by one space.

    Tabs, newlines and repeated spaces all become a single space. Leading and
    trailing whitespace is collapsed to one space, not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    # Raw string: `\s` is an invalid escape in a normal string literal.
    space_regex = r'\s+'
    return re.sub(space_regex, ' ', text)

# Normalise whitespace in every row of the working column.
df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_whitespaces)

# Save the processed DataFrame to CSV

In [None]:
# Raise pandas' per-cell display width to 350 characters so long tweet text is not
# truncated in the outputs below. This changes a global pandas display option.
pd.set_option('display.max_colwidth', 350)

In [None]:
# Spot-check the result: show the last 10 rows whose `label` equals 1.
df[df.label == 1].tail(10)

In [None]:
# Write `df`, including the new `cleaned_tweet` column, to a CSV file.
# NOTE(review): `to_csv` defaults to index=True, so the row index is written as an extra
# unnamed first column. Pass index=False if downstream readers should not see it; confirm
# with the consumers of this file before changing the output format.
df.to_csv('train_E6oV3lV_cleaned.csv')