# 2 - Data Preprocessing (Part 1)

In [None]:
import pandas as pd

# helper module for removing tweets (provides remove_tweets and rt)
import rt

# load the raw tweets, discard the unused columns and standardize column names
df = (
    pd.read_csv("inputs/tweets_raw.csv")
    .drop(columns=["user_location", "coordinates", "place"])
    .rename(columns={"rawContent": "content", "id": "tweet_id"})
)

# keep only the first 10 characters of each timestamp string
# (presumably the YYYY-MM-DD part -- confirm against the raw data)
df["date"] = df["date"].apply(lambda stamp: stamp[:10])

# lowercase the text columns and store the identifiers as strings
df = df.assign(
    content=df["content"].str.lower(),
    user_username=df["user_username"].str.lower(),
    tweet_id=df["tweet_id"].astype(str),
    conversation_id=df["conversation_id"].astype(str),
)

In [None]:
# summarize column dtypes and non-null counts after the initial cleanup
df.info()

In [None]:
# preview the first rows after the initial cleanup
df.head()

## 1.1. Remove duplicates

In [None]:
# flag the first occurrence of every (content, user_username) pair;
# rt.remove_tweets is defined outside this notebook -- presumably it keeps the
# rows where `condition` is True (confirm against rt)
df = rt.remove_tweets(
    df=df,
    condition=~df.duplicated(subset=["content", "user_username"]),
    column="content",
)

## Tags

In [None]:
# Twitter handles of the telecommunication companies' accounts, per company
converge = ["@converge_csu", "@experiencecnvrg"]
globe = ["@enjoyglobe", "@talk2globe"]

# "@pldt" is a prefix of the other PLDT handles, so it is listed last:
# company() replaces handles by plain substring, in list order
pldt = [
    "@pldthome",
    "@pldtenterprise",
    "@pldtent_cares",
    "@pldt_cares",
    "@pldt",
]

# all handles, in company order
tags = [*converge, *globe, *pldt]

## 1.2. Remove tweets from telecommunication companies

In [None]:
# usernames are stored without the leading "@", so strip it from each handle
# before matching; the condition is True for tweets NOT authored by a company
df = rt.remove_tweets(
    df=df,
    condition=~df["user_username"].isin([handle.strip("@") for handle in tags]),
    column="content",
)

## 1.3. Remove tweets that tag fewer or more than one company

In [None]:
def company(x):
    """Return ``x`` with every known company handle replaced by a placeholder.

    Handles in ``converge`` become "@company_a", those in ``globe`` become
    "@company_b" and those in ``pldt`` become "@company_c". Replacement is a
    plain substring replacement applied in list order.
    """
    placeholders = (
        (converge, "@company_a"),
        (globe, "@company_b"),
        (pldt, "@company_c"),
    )
    for handles, placeholder in placeholders:
        for handle in handles:
            x = x.replace(handle, placeholder)
    return x

# replace the company handles with per-company placeholders
df["content_1"] = df["content"].apply(company)

# one 0/1 indicator column per company placeholder
df["company_a"] = df["content_1"].apply(lambda text: int("@company_a" in text))
df["company_b"] = df["content_1"].apply(lambda text: int("@company_b" in text))
df["company_c"] = df["content_1"].apply(lambda text: int("@company_c" in text))

# number of distinct companies tagged in each tweet
df["company"] = df["company_a"].add(df["company_b"]).add(df["company_c"])

# the condition is True only for tweets that tag exactly one company
df = rt.remove_tweets(
    df=df,
    condition=df["company"].eq(1),
    column="content_1",
)

## X.1. Case folding

In [None]:
# lowercase the text; "content" was already lowercased when the data was
# loaded, so this is expected to act only as a safeguard
df["content_1"] = df["content_1"].apply(str.lower)

## X.2. Remove hashtags, mentions, links, and non-alphabetical characters (including numerical characters and special characters)

In [None]:
import regex as re

def remove(x):
    """Strip hashtags, mentions, links and non-letters from a text.

    Parameters
    ----------
    x : str
        Text to clean. Only lowercase ASCII letters survive, so the text
        should already be lowercased.

    Returns
    -------
    str
        The remaining words separated by single spaces, with no leading or
        trailing whitespace.
    """
    # Patterns are raw strings: in a normal string literal "\#", "\@" and "\S"
    # are invalid escape sequences (a SyntaxWarning on Python 3.12+).

    # remove non-white space characters after # (hashtags)
    # remove non-white space characters after @ (mentions)
    # remove non-white space characters after http (links)
    x = re.sub(r"(\#\S+)|(\@\S+)|(http\S+)", "", x)

    # substitute every character that is not a lowercase letter with a space
    x = re.sub(r"([^a-z])", " ", x)

    # substitute multiple spaces into a single space
    x = re.sub(r"(\s+)", " ", x)

    return x.strip()

# clean every tweet with remove()
df["content_2"] = df["content_1"].apply(remove)

## X.3. Remove stop words

In [None]:
# english stop words

from nltk.corpus import stopwords

sw_english = stopwords.words("english")

# tagalog stop words

import advertools

sw_tagalog = list(advertools.stopwords["tagalog"])

# domain stop words

sw_domain = ["converge", "globe", "pldt"]

# stop words, cleaned with the same remove() that was applied to the tweets

sw = sw_english + sw_tagalog + sw_domain

sw = [remove(i) for i in sw]

# NOTE(review): remove() turns non-letters into spaces, so a stop word that
# contains e.g. an apostrophe becomes a multi-word entry that can never equal
# a single token below -- confirm whether such entries should be split.

# build the lookup once: set membership is O(1) per token, whereas testing
# against the list scans every stop word for every token of every tweet
sw_lookup = frozenset(sw)

# remove stop words

df["content_3"] = df["content_2"].apply(
    lambda x: " ".join([i for i in x.split() if i not in sw_lookup])
)

## X.4. Remove low quality words

* Remove words with a character count less than or equal to 2 or greater than or equal to 30.
* Remove words that consist of only one or two distinct alphabetical characters (e.g. "haha").

In [None]:
def remove_low_quality_words(x):
    """Drop low-quality words from a whitespace-separated text.

    A word is kept only if it is between 3 and 29 characters long and uses
    more than two distinct characters. The kept words are returned joined by
    single spaces, in their original order.
    """

    def is_informative(word):
        return 2 < len(word) < 30 and len(set(word)) > 2

    return " ".join(word for word in x.split() if is_informative(word))

# filter the low-quality words out of every tweet
df["content_4"] = df["content_3"].apply(remove_low_quality_words)

## X.5. Filter by word count 1

In [None]:
# number of words left in each tweet after cleaning
df["word_count_1"] = df["content_4"].apply(lambda text: len(text.split()))

# the condition is True only for tweets with more than two remaining words
df = rt.remove_tweets(
    df=df,
    condition=df["word_count_1"].gt(2),
    column="content_4",
)

In [None]:
# NOTE(review): both counts are hard-coded; presumably the number of tweets
# before and after filtering -- confirm against rt and consider deriving them
# from the data so they cannot go stale
rt.rt(345667, 260313)

## Export data

In [None]:
# df.to_csv("inputs/tweets_1.csv", index = False)

In [None]:
# summarize column dtypes and non-null counts of the preprocessed data
df.info()

In [None]:
# preview the first rows of the preprocessed data
df.head()