In [1]:
import pandas as pd # pip install pandas
import re # pip install re
import string 
import markdown # pip install markdown
from bs4 import BeautifulSoup # pip install beautifulsoup4
import csv
import emoji
import nltk

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

hot = pd.read_csv("../dataset/working/Hot.csv")
top = pd.read_csv("../dataset/working/Top.csv")
controversial = pd.read_csv("../dataset/working/Controversial.csv")
new = pd.read_csv("../dataset/working/New.csv")

df = pd.concat([hot, top, controversial, new], ignore_index=True)

In [2]:
df.dropna(subset=['Content'], inplace=True)
df.shape

(2054, 11)

In [3]:
URL_REGEX = r".*(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*).*"

# Remove URLs
def unurl(cell):
    return re.sub(URL_REGEX, "", str(cell))

# Convert markdown to plaintext 
def md_to_text(cell):
    html = markdown.markdown(cell)
    soup = BeautifulSoup(html, features='html.parser')
    return soup.get_text()

# Replace NaN and \n 
def cleantext(cell):
    return re.sub(r"(\n|NaN|nan)", " ", str(cell))

def remove_punctuation(cell):
    return cell.translate(str.maketrans('', '', string.punctuation))

# Convert slangs
def unslang(cell):
    user_string = str(cell)
    user_string = user_string.split(" ")
    j = 0
    for _str in user_string:
        fileName = "slang.txt"
        accessMode = "r"
        with open(fileName, accessMode) as myCSVfile:
            dataFromFile = csv.reader(myCSVfile, delimiter="=")
            _str = re.sub('[^a-zA-Z0-9-_.]', '', _str)
            for row in dataFromFile:
                # if re.findall(fr"{_str}", row[0], flags=re.IGNORECASE):
                if _str.upper() == row[0].upper():
                    user_string[j] = row[1]
            myCSVfile.close()
        j = j + 1
    return ' '.join(user_string)

def unemoji(text):
    return emoji.replace_emoji(text, replace="")

def remove_numbers(cell):
    return re.sub(r"\d+", " ", str(cell))

def lowercase(cell):
    return str(cell).lower()

In [4]:
df['Content'] = df['Content'].apply(unurl)
df['Content'] = df['Content'].apply(md_to_text)
df['Content'] = df['Content'].apply(cleantext)
df['Content'] = df['Content'].apply(unslang)
df['Content'] = df['Content'].apply(unemoji)
df['Content'] = df['Content'].apply(remove_punctuation)
df['Content'] = df['Content'].apply(remove_numbers)
df['Content'] = df['Content'].apply(lowercase)

df['Title'] = df['Title'].apply(unurl)
df['Title'] = df['Title'].apply(md_to_text)
df['Title'] = df['Title'].apply(cleantext)
df['Title'] = df['Title'].apply(unslang)
df['Title'] = df['Title'].apply(unemoji)
df['Title'] = df['Title'].apply(remove_punctuation)
df['Title'] = df['Title'].apply(remove_numbers)
df['Title'] = df['Title'].apply(lowercase)

In [5]:
df["Title+Content"] = df["Title"] + " " + df["Content"]
df.reset_index(drop=True, inplace=True)

### Translation to English

In [6]:
from deep_translator import GoogleTranslator
import math

In [7]:
def trans(text):
    if len(text) > 5000:
        text_1 = GoogleTranslator(source='tl', target='en').translate(text[:math.floor(len(text)/2)]) 
        text_2 = GoogleTranslator(source='tl', target='en').translate(text[math.floor(len(text)/2):]) 
        return text_1 + " " + text_2
    else:   
        return GoogleTranslator(source='tl', target='en').translate(text)

In [8]:
trans_1 = df['Title+Content'][:500].apply(trans)

In [11]:
trans_2 = df['Title+Content'][500:1000].apply(trans)

In [14]:
trans_3 = df['Title+Content'][1000:1500].apply(trans)

In [16]:
trans_4 = df['Title+Content'][1500:2000].apply(trans)

In [17]:
trans_5 = df['Title+Content'][2000:].apply(trans)

In [18]:
df_test = pd.concat([trans_1, trans_2, trans_3, trans_4, trans_5])

In [19]:
df_test = df_test.to_frame().rename(columns={"Title+Content":"Translated"})

In [20]:
df_test['Translated'] = df_test['Translated'].apply(unslang)
df_test['Translated'] = df_test['Translated'].apply(lowercase)
df_test['Translated'] = df_test['Translated'].apply(remove_punctuation)

In [57]:
df.insert(5, "Translated", df_test)

ValueError: cannot insert Translated, already exists

In [25]:
df.to_csv(f"../dataset/_compiled/Compiled.csv", index=False)

In [58]:
# df_save = df.copy(deep=True)