## Configurations and Dependencies

In [1]:
import pandas as pd # pip install pandas
import re # standard library (no pip install needed)
import markdown # pip install markdown
from bs4 import BeautifulSoup # pip install beautifulsoup4
import csv
import emoji
import nltk

# Lift pandas' display limits so every column, every row and the full
# contents of each cell are shown (None = unlimited; use a number such as
# 1000 / 199 to cap the output instead).
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Load the four raw listings, in the order hot, top, controversial, new.
hot, top, controversial, new = (
    pd.read_csv(raw_path)
    for raw_path in (
        "../dataset/raw/hot.csv",
        "../dataset/raw/top.csv",
        "../dataset/raw/controversial.csv",
        "../dataset/raw/new.csv",
    )
)

# df_list[0:4] are the individual frames; df_list[4] is a concatenated copy
# of all four, built from the frames as loaded here.
individual_frames = [hot, top, controversial, new]
df_list = individual_frames + [pd.concat(individual_frames)]

## Preprocessing

#### Duplicates and Empty Cells

In [125]:
# For every frame record (number of fully duplicated rows, number of null
# cells outside the 'Content' and 'Flair' columns). Nulls in those two
# columns are deliberately not counted.
check_list = [
    (
        df.duplicated().sum(),
        df.loc[:, ~df.columns.isin(['Content', 'Flair'])].isnull().sum().sum(),
    )
    for df in df_list
]

# 1-based positions (within df_list) of the frames that failed either check.
# Positions come straight from enumerate, so each message names the right
# frame even when several frames fail.
failed_positions = [
    position
    for position, check in enumerate(check_list, start=1)
    if check != (0, 0)
]

for position in failed_positions:
    print(f"ERROR AT DF {position}")

if not failed_positions:
    print("ALL CLEAR")

ALL CLEAR


#### Data Cleaning

In [126]:
# Matches a URL that starts with an explicit scheme ("http://" / "https://")
# or with "www.". The match stops at whitespace, angle brackets, parentheses
# and square brackets, so markdown link syntax such as "[text](url)" keeps its
# visible text and its closing delimiters.
#
# The pattern is intentionally NOT wrapped in ".*": it removes only the URL
# itself, never the rest of the line, and it does not fire on ordinary dotted
# tokens such as "e.g." or "3.50".
URL_REGEX = r"(?:https?://|\bwww\.)[^\s<>()\[\]]+"


def unurl(cell):
    """Return ``str(cell)`` with every URL removed.

    Args:
        cell: Any value; it is converted with ``str()`` before matching, so
            non-string inputs (e.g. a float NaN) come back as their string
            form.

    Returns:
        The string form of ``cell`` with each URL_REGEX match replaced by the
        empty string. Matching is case-insensitive. Surrounding text and
        whitespace are left untouched.
    """
    return re.sub(URL_REGEX, "", str(cell), flags=re.IGNORECASE)

def md_to_text(cell):
    """Convert a markdown string to plain text.

    The markdown is first rendered to HTML with ``markdown.markdown`` and the
    HTML is then parsed with BeautifulSoup's 'html.parser'; the text content
    of the parsed document is returned.
    """
    rendered_html = markdown.markdown(cell)
    return BeautifulSoup(rendered_html, features='html.parser').get_text()

def cleantext(cell):
    """Blank out missing values and flatten newlines to spaces.

    Args:
        cell: Any value; it is converted with ``str()`` first.

    Returns:
        ``" "`` when the whole cell is a missing-value marker (its string form,
        ignoring surrounding whitespace, is exactly ``"nan"`` or ``"NaN"``);
        otherwise the string form of ``cell`` with every newline replaced by a
        single space.
    """
    text = str(cell)
    # Only a cell that IS the marker counts as missing. Replacing "nan" as a
    # substring would damage ordinary words such as "finance" or "banana".
    if text.strip() in ("nan", "NaN"):
        return " "
    return text.replace("\n", " ")

def _load_slang_map(slang_file):
    """Read ``ABBREVIATION=expansion`` lines into an upper-cased lookup dict."""
    slang_map = {}
    with open(slang_file, "r", newline="") as slang_handle:
        for row in csv.reader(slang_handle, delimiter="="):
            # Blank lines come back as [] and lines without "=" have a single
            # field; neither carries an expansion, so skip them instead of
            # failing with IndexError.
            if len(row) < 2:
                continue
            # A later duplicate overwrites an earlier one, so the last
            # matching line in the file wins.
            slang_map[row[0].upper()] = row[1]
    return slang_map


def unslang(cell, slang_file="slang.txt"):
    """Replace slang abbreviations in ``cell`` with their expansions.

    The text is split on single spaces. Each token is stripped of every
    character other than letters, digits, ``-``, ``_`` and ``.`` and compared
    case-insensitively with the abbreviations in the slang file. A matching
    token is replaced wholesale by the expansion, so any punctuation attached
    to it is dropped; non-matching tokens are kept exactly as they were.

    Args:
        cell: Any value; it is converted with ``str()`` first.
        slang_file: Path to a text file with one ``ABBREVIATION=expansion``
            entry per line. Defaults to ``"slang.txt"`` in the working
            directory.

    Returns:
        The tokens joined back together with single spaces.

    Raises:
        OSError: If ``slang_file`` cannot be opened.
    """
    tokens = str(cell).split(" ")
    # Parse the file once per call rather than once per token.
    slang_map = _load_slang_map(slang_file)
    for position, token in enumerate(tokens):
        lookup_key = re.sub(r"[^a-zA-Z0-9_.-]", "", token).upper()
        if lookup_key in slang_map:
            tokens[position] = slang_map[lookup_key]
    return " ".join(tokens)

def unemoji(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji

In [127]:
# Cleaning steps applied, in this order, to the free-text columns.
text_cleaners = (unurl, md_to_text, cleantext, unslang, unemoji)

count = 1
for df in df_list[0:4]:
    # 'Content' is processed before 'Title'; both go through the same steps.
    # Digit stripping (str.replace('\d+', '')) is currently disabled for both.
    for text_column in ('Content', 'Title'):
        cleaned = df[text_column]
        for cleaner in text_cleaners:
            cleaned = cleaned.apply(cleaner)
        df[text_column] = cleaned

    # 'Flair' only goes through cleantext.
    df['Flair'] = df['Flair'].apply(cleantext)

    # Printed once the frame has been processed.
    print(f"Cleaning DF {count}...")
    count += 1

Cleaning DF 1...
Cleaning DF 2...
Cleaning DF 3...
Cleaning DF 4...


#### Create `Title+Content`

In [128]:
# Add a combined "Title+Content" column (title, one space, content) to each of
# the four individual frames.
for df in df_list[0:4]:
    combined = df["Title"] + " " + df["Content"]
    if "Title+Content" in df.columns:
        # The cell has been run before: DataFrame.insert would raise
        # ValueError for an existing column, so refresh the values in place.
        df["Title+Content"] = combined
    else:
        # First run: place the new column at position 4.
        df.insert(4, "Title+Content", combined)

#### Make CSVs

In [129]:
# Write each cleaned frame to ../dataset/clean/ without the index column.
# The write order is hot, top, new, controversial.
clean_outputs = (
    (hot, "../dataset/clean/hot.csv"),
    (top, "../dataset/clean/top.csv"),
    (new, "../dataset/clean/new.csv"),
    (controversial, "../dataset/clean/controversial.csv"),
)
for frame, destination in clean_outputs:
    frame.to_csv(destination, index=False)