# 4 - Data Preprocessing (Part 2)

In [None]:
import pandas as pd

import rt # remove tweets

df = pd.read_csv("inputs/tweets_2.csv") # import data

In [None]:
df.info()

In [None]:
df.head()

## Some tweets that were not translated are missing values

* Some tweets that were not translated are missing values.
* Can't convert missing values to lowercase letters.
* There are 6 missing values.
* Replace the missing values in `content_5` with the corresponding values from `content_4`.

In [None]:
missing_values = df[df["content_5"].isna()]
missing_values.head(10)

In [None]:
# Wherever content_5 is missing, fall back to the value of content_4 on the same row.
df["content_5"] = df["content_5"].fillna(df["content_4"])

In [None]:
missing_values = df[df["content_5"].isna()]
missing_values.head(10)

## 2.1. Case folding

In [None]:
# Lowercase every tweet; each value must be a string at this point.
df["content_5"] = df["content_5"].map(str.lower)

## 2.2. Remove hashtags, mentions, links, and non-alphabetical characters (including numerical characters and special characters)

In [None]:
import regex as re

# Patterns are compiled once and written as raw strings so that regex escapes
# such as \S and \s are not parsed as (invalid) Python string escapes.
_HASHTAG_MENTION_LINK = re.compile(r"(#\S+)|(@\S+)|(http\S+)")
_NON_LOWERCASE_LETTER = re.compile(r"[^a-z]")
_WHITESPACE_RUN = re.compile(r"\s+")


def remove(x):
    """Strip hashtags, mentions, links and non-alphabetical characters.

    Parameters
    ----------
    x : str
        Text to clean. Only the lowercase letters a-z survive, so the text
        should already be lowercased.

    Returns
    -------
    str
        The remaining words separated by single spaces, with no leading or
        trailing whitespace.
    """
    # Drop "#", "@" or "http" together with the non-whitespace run after it.
    x = _HASHTAG_MENTION_LINK.sub("", x)

    # Every character outside a-z becomes a space.
    x = _NON_LOWERCASE_LETTER.sub(" ", x)

    # Collapse the runs of whitespace created by the two steps above.
    x = _WHITESPACE_RUN.sub(" ", x)

    return x.strip()

# Clean every lowercased tweet with remove().
df["content_6"] = df["content_5"].apply(remove)

## 2.3. Remove stop words

In [None]:
# english stop words

from nltk.corpus import stopwords

sw_english = stopwords.words("english")

# tagalog stop words

import advertools

sw_tagalog = list(advertools.stopwords["tagalog"])

# domain stop words

sw_domain = ["converge", "globe", "pldt"]

# stop words

sw = sw_english + sw_tagalog + sw_domain

# Clean the stop words exactly like the tweets, then split the result into
# tokens: a stop word containing a non-letter (apostrophe, hyphen, ...) is
# cleaned into several space-separated pieces, and a string with a space in it
# can never equal a single token produced by x.split() below.
sw = [token for i in sw for token in remove(i).split()]

# Set lookup is O(1) per word; testing against the list rescans it every time.
sw_lookup = set(sw)

# remove stop words

df["content_7"] = df["content_6"].apply(
    lambda x: " ".join([i for i in x.split() if i not in sw_lookup])
)

## 2.4 Remove low quality words

* Remove words with a character count less than or equal to 2 or greater than or equal to 30.
* Remove words that are made up of only one or two distinct characters.

In [None]:
def remove_low_quality_words(x):
    """Drop tokens that are too short, too long, or built from too few characters.

    A whitespace-separated token of ``x`` is kept only when it is 3 to 29
    characters long and contains at least three distinct characters. The
    surviving tokens are returned joined by single spaces, in their original
    order.
    """

    def is_informative(token):
        return 2 < len(token) < 30 and len(set(token)) > 2

    return " ".join(filter(is_informative, x.split()))

# Filter the low quality words out of every stop-word-free tweet.
df["content_8"] = df["content_7"].apply(remove_low_quality_words)

## Term Frequency

In [None]:
from collections import Counter

# All cleaned tweets as one whitespace-separated string.
terms = " ".join(df["content_8"])

# Counter is a dict subclass mapping each distinct term to its number of
# occurrences across the whole corpus.
term_frequency = Counter(terms.split())

# One row per distinct term. Naming the columns up front keeps the frame (and
# the sort below) valid even when there are no terms at all.
tf = pd.DataFrame(list(term_frequency.items()), columns=["term", "frequency"])

# Most frequent terms first; ties are broken alphabetically.
tf.sort_values(by=["frequency", "term"], ascending=[False, True], inplace=True, ignore_index=True)

In [None]:
tf[tf.frequency >= tf.frequency.quantile(q = 0.9999)]

In [None]:
tf[tf.frequency <= tf.frequency.quantile(q = 0.0100)]

In [None]:
round(len(tf[tf["frequency"] == 1]) / len(tf) * 100, 2)

## 2.5. Remove low frequency words

In [None]:
# Terms that occur exactly once in the whole corpus.
low_frequency_words = list(tf[tf["frequency"] == 1]["term"])

# Set copy for O(1) membership tests; scanning the list for every word of
# every tweet makes this step quadratic.
low_frequency_lookup = set(low_frequency_words)

from tqdm import tqdm

# Iterate over the column values directly rather than df["content_8"][i],
# which is a label lookup and only works while the index is exactly 0..n-1.
lst = [
    " ".join(word for word in text.split() if word not in low_frequency_lookup)
    for text in tqdm(df["content_8"])
]

df["content_9"] = lst

## 2.6. Filter by word count 2

In [None]:
# Number of whitespace-separated tokens left in each cleaned tweet.
df["word_count_2"] = [len(text.split()) for text in df["content_9"]]

# The filtering itself is done by the project-local rt.remove_tweets helper,
# which is not shown in this notebook.
word_count_condition = df["word_count_2"] > 2
df = rt.remove_tweets(df=df, condition=word_count_condition, column="content_9")

## 2.7. Lemmatization

In [58]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatization(x):
    """Lemmatize every whitespace-separated token of ``x``.

    Each token is passed through the module-level ``lemmatizer`` once per
    part-of-speech tag, in the order "a", "n", "r", "s", "v"; the output of
    one pass is the input of the next. The resulting tokens are returned
    joined by single spaces.
    """
    pos_tags = ("a", "n", "r", "s", "v")

    def lemmatize_token(token):
        for tag in pos_tags:
            token = lemmatizer.lemmatize(token, pos=tag)
        return token

    return " ".join(lemmatize_token(token) for token in x.split())

# Lemmatize every tweet that survived the word-count filter.
df["content_10"] = df["content_9"].apply(lemmatization)

## `month`

In [None]:
# Characters 5-6 of the date string as an integer.
# NOTE(review): presumably dates look like "YYYY-MM-DD..." - confirm against the input file.
df["month"] = df["date"].str[5:7].astype("int64")

## `company_x`

In [None]:
import numpy as np

# Flag column -> label, listed in priority order (the first matching flag wins).
company_labels = {"company_a": "a", "company_b": "b", "company_c": "c"}

# Start from a real missing value. np.select with string choices and
# default=np.nan either turns the default into the string "nan" or raises a
# TypeError about a missing common dtype, depending on the NumPy version.
company_x = pd.Series(np.nan, index=df.index, dtype=object)

# Assign in reverse priority order so an earlier flag overwrites a later one,
# which reproduces np.select's first-match behaviour.
for flag_column, label in reversed(list(company_labels.items())):
    company_x.loc[df[flag_column] == 1] = label

df["company_x"] = company_x

In [None]:
# NOTE(review): rt.rt comes from the project-local `rt` module imported at the
# top of the notebook; what it does is not visible here. The two literals are
# presumably tweet counts before/after filtering - TODO confirm, and consider
# deriving them from the data instead of hard-coding them.
rt.rt(345667, 236956)

## Export data

In [None]:
# df.to_csv("inputs/tweets_3.csv", index = False)

In [None]:
df.info()

In [None]:
df.head()