# Data Preprocessing

## The Goal
- Clean, transform, and prepare the raw data into a format that improves the performance and effectiveness of the downstream algorithms.

In [1]:
import os
import pandas as pd

In [2]:
# The relative "data" / "assets" paths used below are resolved against the
# working directory, so step up one level from the notebook folder.
# Guarded: only move when the raw-data folder is not already visible, so that
# re-running this cell does not climb one directory higher each time.
if not os.path.isdir(os.path.join("data", "raw")):
    os.chdir("..")

In [3]:
# Directory layout, expressed relative to the current working directory.
data_dir = "data"
raw_dir = os.path.join(data_dir, "raw")              # untouched input files
processed_dir = os.path.join(data_dir, "processed")  # cleaned outputs
assets_dir = "assets"                                # auxiliary lookup files

## Load & Prepare Data

In [4]:
# Load the raw tweet-topic CSV. index_col=False tells pandas not to use the
# first column of the file as the DataFrame index.
data = pd.read_csv(os.path.join(raw_dir, "tweet_topic_multi.csv"), index_col=False)

In [5]:
# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,text,date,label,label_name,id
0,Game day for {{USERNAME}} U18’s against {{USER...,2019-09-08,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['sports'],1170606779568463874
1,⚽️ This afternoon we have our first League gam...,2019-09-08,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['sports'],1170607436924952576
2,Might watch the {@Tennessee Titans@} game only...,2019-09-08,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['sports'],1170666727845376000
3,. {@Asante Kotoko SC@} should have moved for t...,2019-09-08,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...","['fitness_&_health', 'news_&_social_concern', ...",1170728379290673154
4,Massive WELL DONE to BSLFC Reserves today in t...,2019-09-08,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['sports'],1170788768690970624


In [6]:
# Inspect column dtypes. Note that `date`, `label` and `label_name` come back
# as `object`, i.e. read_csv did not parse them into dates or lists.
data.dtypes

text          object
date          object
label         object
label_name    object
id             int64
dtype: object

## Check for Missing Values

In [7]:
# Count missing values per column.
data.isna().sum()

text          0
date          0
label         0
label_name    0
id            0
dtype: int64

In [8]:
# Note: DataFrame.isnull() is an alias of DataFrame.isna(), so this yields the
# same per-column counts as the previous cell.
data.isnull().sum()

text          0
date          0
label         0
label_name    0
id            0
dtype: int64

## Text Preprocessing

In [9]:
import nltk
from nltk.corpus import stopwords
import re

In [10]:
# Make sure the NLTK stop-word corpus is available locally, then load the
# English stop-word list.
nltk.download("stopwords")
# NOTE(review): STOPWORDS is not referenced by any preprocessing step visible
# in this notebook — confirm whether it is needed.
STOPWORDS = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. Remove Links

In [11]:
def remove_links(text: str) -> str:
    """Remove URLs from ``text``.

    A link is any run of non-whitespace characters that starts with ``http``
    (which also covers ``https`` and truncated links), or a scheme-less
    ``www.<something>`` token. Matching is case-insensitive, so ``HTTP://...``
    is removed as well.

    Args:
        text: Input string.

    Returns:
        ``text`` with every link replaced by the empty string. Surrounding
        whitespace is left untouched, so a removed link may leave two
        consecutive spaces behind.
    """
    # \b before "www" keeps interjections such as "awww.so" from being treated
    # as links; \w after the dot requires an actual host name to follow.
    return re.sub(r'http\S+|\bwww\.\w\S*', "", text, flags=re.IGNORECASE)

In [12]:
# Quick check: the URL is dropped, while the spaces on either side of it stay.
remove_links("this is a link http://tr.im/kwk9 that gonna be removed")

'this is a link  that gonna be removed'

### 2. Emoji Handling  

In [13]:
import pickle
import emot

In [14]:
# The pickled table maps ":name:" -> emoji; flip it so that an emoji character
# can be looked up directly to obtain its name.
# NOTE: pickle.load can execute arbitrary code while loading — only use an
# Emoji_Dict.p file that comes from a trusted source.
with open(os.path.join(assets_dir, "Emoji_Dict.p"), 'rb') as fp:
    emoji_dict = {emoji: name for name, emoji in pickle.load(fp).items()}

In [15]:
# Spot-check a few lookups in the inverted (emoji -> name) table.
print(emoji_dict["😭"])
print(emoji_dict["😋"])
print(emoji_dict["🐪"])

:loudly_crying_face:
:face_savouring_delicious_food:
:camel:


In [16]:
def convert_emojis(text):
    """Replace every emoji in ``text`` with its textual name.

    Names come from the module-level ``emoji_dict`` (emoji -> ":name:").
    Commas and colons are stripped from the name and any whitespace in it is
    turned into underscores, so each emoji becomes one token.

    Args:
        text: Input string.

    Returns:
        ``text`` with all known emojis replaced by their names.
    """
    # Plain string replacement is used instead of a regex: some emojis contain
    # regex metacharacters (e.g. the keycap "*" sequence) and would otherwise
    # break or mis-match the pattern.
    # Longest keys go first so that a multi-code-point emoji (skin-tone or ZWJ
    # sequence) is replaced as a whole rather than piece by piece.
    for symbol in sorted(emoji_dict, key=len, reverse=True):
        if symbol and symbol in text:
            name = "_".join(emoji_dict[symbol].replace(",", "").replace(":", "").split())
            text = text.replace(symbol, name)
    return text

In [17]:
# Quick check: the emoji is replaced by its name.
convert_emojis("i feel sad 😢")

'i feel sad crying_face'

In [18]:
def remove_emojis(text):
    """Delete every emoji listed in the module-level ``emoji_dict`` from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` without the known emojis. Whitespace around a removed emoji
        is left as it was.
    """
    # Plain string replacement avoids regex errors for emojis that contain
    # metacharacters (e.g. the keycap "*" sequence).
    # Longest keys go first so that multi-code-point sequences are removed as a
    # whole instead of leaving joiner or modifier characters behind.
    for symbol in sorted(emoji_dict, key=len, reverse=True):
        if symbol:
            text = text.replace(symbol, "")
    return text

In [19]:
# Quick check: the emoji is dropped, the space before it stays.
remove_emojis("i feel sad 😢")

'i feel sad '

In [20]:
# Lookup table from the emot package: text emoticon (e.g. ":-)") -> short
# description of what it expresses.
EMOTICONS = emot.emo_unicode.EMOTICONS_EMO

In [21]:
def convert_emoticons(text):
    """Replace text emoticons (e.g. ``:-)``) with their description.

    Descriptions come from the module-level ``EMOTICONS`` mapping. Commas are
    stripped and the words are joined with underscores, so each emoticon
    becomes a single token.

    Args:
        text: Input string.

    Returns:
        ``text`` with all known emoticons replaced.
    """
    # Longest emoticons first, so that e.g. ":-))" is not consumed as ":-)"
    # followed by a stray ")".
    emoticons = sorted((k for k in EMOTICONS if k), key=len, reverse=True)
    if not emoticons:
        return text
    # One alternation applied in a single pass: text that was just inserted as
    # a replacement is never scanned again for further emoticons.
    pattern = re.compile("|".join(re.escape(k) for k in emoticons))
    return pattern.sub(
        lambda match: "_".join(EMOTICONS[match.group(0)].replace(",", "").split()),
        text,
    )

In [22]:
# Quick check: both emoticons are replaced by their description.
convert_emoticons("Hello :-) :-)")

'Hello Happy_face_smiley Happy_face_smiley'

In [23]:
def remove_emoticons(text):
    """Delete every text emoticon listed in ``EMOTICONS`` from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` without the known emoticons. Whitespace around a removed
        emoticon is left as it was.
    """
    # Longest emoticons first, so that e.g. ":-))" is removed completely
    # instead of leaving a stray ")" behind.
    emoticons = sorted((k for k in EMOTICONS if k), key=len, reverse=True)
    if not emoticons:
        return text
    # Single pass: characters that become adjacent after a removal are not
    # re-interpreted as a new emoticon.
    pattern = re.compile("|".join(re.escape(k) for k in emoticons))
    return pattern.sub("", text)

In [24]:
# Quick check: both emoticons are dropped, the spaces around them stay.
remove_emoticons("Hello :-) :-)")

'Hello  '

### 3. Remove Punctuation

In [25]:
def remove_punctuations(text):
    """Strip punctuation from ``text`` while keeping a small allow-list.

    Word characters and the symbols ``@ # ! { } . ,`` are preserved; every
    other character is deleted. The text is processed token by token
    (whitespace-separated) and the tokens are re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The cleaned string. A token made up solely of removed characters
        becomes empty, which can leave two consecutive spaces in the result.
    """
    # Matches any single character that is NOT on the allow-list.
    disallowed = re.compile(r'[^\w\s@#\!\}\{\.\,]')
    return " ".join(disallowed.sub("", word) for word in text.split())

In [26]:
def remove_extra_spaces(text):
    """Normalise whitespace in ``text``.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed into a single space.

    Args:
        text: Input string.

    Returns:
        The whitespace-normalised string.
    """
    # split() with no arguments already ignores leading/trailing whitespace.
    words = text.split()
    return " ".join(words)

### 4. Stemming

In [27]:
from nltk.stem import PorterStemmer

In [28]:
# Single Porter stemmer instance, shared by every preprocess() call below.
stemmer = PorterStemmer()

## Preprocess

In [29]:
def preprocess(text: str, strip_links=True, strip_punctuations=True, decode_emojis=False, stem=True, lower=True):
    """Run the text-cleaning pipeline on a single piece of text.

    Each step is controlled by its own flag; whitespace is always normalised
    at the end.

    Args:
        text: Raw input text.
        strip_links: Remove URLs with ``remove_links``.
        strip_punctuations: Remove punctuation with ``remove_punctuations``.
        decode_emojis: Replace emojis and text emoticons with their names
            using ``convert_emojis`` and ``convert_emoticons``.
        stem: Stem every whitespace-separated token with the module-level
            ``stemmer``.
        lower: Lower-case the text.

    Returns:
        The cleaned text with single spaces between tokens.

    Note:
        NLTK's ``PorterStemmer.stem`` lower-cases its input by default, so
        with ``stem=True`` the result is lower-case even when ``lower=False``.
    """
    if decode_emojis:
        if strip_links:
            # Links must go before decoding; otherwise fragments of a URL
            # (such as the ":/" in "http://") can be read as emoticons.
            text = remove_links(text)
        # Decode on the ORIGINAL casing: emoticons are case-sensitive
        # (":D", "XD", "D:" ...), so lower-casing first would miss or
        # mis-map them.
        text = convert_emojis(text)
        text = convert_emoticons(text)
    if lower:
        text = text.lower()
    if strip_links:
        # Runs after lower-casing, so links written with an upper-case scheme
        # are caught too. Harmless if links were already stripped above.
        text = remove_links(text)
    if strip_punctuations:
        text = remove_punctuations(text)
    if stem:
        text = " ".join(stemmer.stem(token) for token in text.split())
    return remove_extra_spaces(text)

In [30]:
# Take the first tweet as a worked example for the pipeline.
sample = data["text"][0]
sample

'Game day for {{USERNAME}} U18’s against {{USERNAME}} U18’s. Even though it’s a ‘home’ game for the people that have settled in Mid Wales it’s still a 4 hour round trip for us up to Colwyn Bay. Still enjoy it though!'

In [31]:
# Default settings: links and punctuation stripped, text lower-cased and
# stemmed; emojis/emoticons are not decoded.
preprocess(sample)

'game day for {{username}} u18 against {{username}} u18s. even though it a home game for the peopl that have settl in mid wale it still a 4 hour round trip for us up to colwyn bay. still enjoy it though!'