In [1]:
import pandas as pd # pip install pandas
import re # standard library, no install needed
import string 
import markdown # pip install markdown
from bs4 import BeautifulSoup # pip install beautifulsoup4
from datetime import datetime
import csv
import emoji
import nltk
import numpy as np
import spacy    
import spacy_transformers
# Load the transformer-based English spaCy pipeline (bound to `nlp`).
nlp = spacy.load('en_core_web_trf')

# Lift pandas' display limits so columns, rows and cell text are not truncated.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# Read the four raw CSV files.
hot = pd.read_csv("../dataset/raw/hot.csv")
top = pd.read_csv("../dataset/raw/top.csv")
controversial = pd.read_csv("../dataset/raw/controversial.csv")
new = pd.read_csv("../dataset/raw/new.csv")

# Stack them into one frame with a fresh 0..n-1 index.
raw_frames = [hot, top, controversial, new]
df = pd.concat(raw_frames, ignore_index=True)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [131]:
# Treat empty strings and lone zero-width spaces as missing in both text columns.
for _text_column in ('Content', 'Title'):
    df[_text_column] = df[_text_column].replace(['', '\u200b'], np.nan)

# Drop rows missing either text field and renumber the index.
df.dropna(subset=['Content', 'Title'], inplace=True, ignore_index=True)

df.shape

(1978, 10)

In [133]:
import unicodedata
import sys
# Matches only the URL itself: an http(s):// or "www." prefix followed by everything up to
# the next whitespace or bracket, case-insensitively. The previous pattern was wrapped in
# ".*" and made the scheme optional, so substituting it deleted every whole line that
# contained any dotted token (e.g. "e.g." or "ok.thanks"), not just the link.
# Brackets are excluded so Markdown links like [text](http://x) keep their closing ")".
URL_REGEX = r"(?i)\b(?:https?://|www\.)[^\s()\[\]<>]+"
# Every code point below sys.maxunicode whose Unicode general category is a punctuation
# class (P*), concatenated in code-point order. Symbol categories (S*, e.g. "$" or "+")
# are not part of this set.
punctuation = "".join(
    char
    for char in map(chr, range(sys.maxunicode))
    if unicodedata.category(char)[0] == 'P'
)

# Remove URLs
def remove_url(cell):
    """Return the string form of ``cell`` with every ``URL_REGEX`` match deleted."""
    text = str(cell)
    return re.compile(URL_REGEX).sub("", text)

# Convert markdown to plaintext
def md_to_text(cell):
    """Render Markdown ``cell`` to HTML, then return that HTML's text content."""
    rendered_html = markdown.markdown(cell)
    return BeautifulSoup(rendered_html, features='html.parser').get_text()

# Replace NaN and newlines
def remove_NaN_newline(cell):
    """Replace newline characters and stand-alone "NaN"/"nan" tokens with a space.

    Args:
        cell: Any value; it is converted with ``str()`` first, so a float NaN
            becomes the token "nan" and is replaced.

    Returns:
        str: The cleaned text.
    """
    # Word boundaries keep "nan" inside real words ("banana", "nanay") intact; the
    # unanchored pattern used before cut those words apart.
    return re.sub(r"\n|\b(?:NaN|nan)\b", " ", str(cell))

def remove_punctuation(cell):
    """Return ``cell`` with every character listed in ``punctuation`` deleted."""
    # Mapping an ordinal to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, punctuation))
    return cell.translate(deletions)

# Convert slang
def replace_abbrv(cell, slang_path="slang.txt"):
    """Expand slang abbreviations using a ``key=expansion`` lookup file.

    The text is split on single spaces. Each token is stripped of every character
    other than letters, digits, ``-``, ``_`` and ``.`` and compared
    case-insensitively with the keys in the lookup file; on a match the whole
    token (including any punctuation attached to it) is replaced by the expansion.

    Args:
        cell: Value to process; converted with ``str()``.
        slang_path: Path of the ``=``-delimited lookup file. Defaults to
            ``"slang.txt"`` in the current working directory.

    Returns:
        str: The tokens re-joined with single spaces.

    Raises:
        OSError: If the lookup file cannot be opened.
    """
    # Parse the lookup file once per call instead of once per word. Later rows
    # override earlier ones with the same key, matching a full top-to-bottom scan.
    slang_map = {}
    with open(slang_path, "r") as slang_file:
        for row in csv.reader(slang_file, delimiter="="):
            if len(row) >= 2:  # skip blank or malformed lines instead of crashing
                slang_map[row[0].lower()] = row[1]

    words = str(cell).split(" ")
    for position, word in enumerate(words):
        key = re.sub('[^a-zA-Z0-9-_.]', '', word).lower()
        if key in slang_map:
            words[position] = slang_map[key]
    return ' '.join(words)

def remove_emoji(text):
    """Return ``text`` with each emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji

def remove_numbers(cell):
    """Replace each run of digits in the string form of ``cell`` with one space."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub(" ", str(cell))

def lowercase(cell):
    """Return the string form of ``cell`` converted to lower case."""
    text = str(cell)
    return text.lower()

def convert_utc(cell):
    """Convert a ``"YYYY-MM-DD ..."`` timestamp string to epoch seconds, as a string.

    Only the date part (the text before the first space) is used; the time of day
    is discarded, so the result corresponds to midnight of that date.

    Args:
        cell: Timestamp text whose first space-separated field is ``YYYY-MM-DD``.

    Returns:
        str: Whole seconds since the epoch, formatted as a decimal string.

    Raises:
        ValueError: If a date component is not an integer or is out of range.
        IndexError: If the date has fewer than three ``-``-separated components.
    """
    parts = [int(piece) for piece in str(cell).split(" ")[0].split("-")]
    midnight = datetime(parts[0], parts[1], parts[2])
    # strftime('%s') is a platform-specific extension that is not among Python's
    # documented format codes; timestamp() yields the same value portably.
    # NOTE(review): despite the function name, a naive datetime is interpreted in the
    # machine's local time zone (as '%s' also did) — confirm whether UTC is intended.
    return str(int(midnight.timestamp()))

def remove_names(text):
    """Return the string form of ``text`` with every PERSON entity found by ``nlp`` cut out."""
    result = str(text)
    # Character spans of the entities labelled PERSON, in document order.
    person_spans = [
        (ent.start_char, ent.start_char + len(ent.text))
        for ent in nlp(result).ents
        if ent.label_ == "PERSON"
    ]
    # Delete from the end backwards so earlier offsets stay valid.
    for begin, finish in reversed(person_spans):
        result = result[:begin] + result[finish:]
    return result

In [134]:
# Cleaning steps, applied in exactly this order to each text column.
# NOTE(review): remove_names runs after lowercase and remove_punctuation here, whereas the
# translated text later in this notebook applies remove_names first — confirm the order.
_TEXT_CLEANERS = (
    replace_abbrv,
    remove_url,
    md_to_text,
    remove_NaN_newline,
    remove_emoji,
    remove_numbers,
    lowercase,
    remove_punctuation,
    remove_names,
)

for _column in ('Content', 'Title'):
    for _cleaner in _TEXT_CLEANERS:
        df[_column] = df[_column].apply(_cleaner)

# Derive the epoch-seconds column from the timestamp strings.
df['Epoch'] = df['Timestamp'].apply(convert_utc)

In [135]:
# Join title and body with a single space and place the result as the fifth column.
combined_text = df["Title"] + " " + df["Content"]
df.insert(4, "Title+Content", combined_text)

In [136]:
# The combined column is built as Title + " " + Content, so it always contains the
# separator and can never equal '' (or a lone zero-width space) exactly. Compare the
# whitespace-stripped text instead so rows with no real content are actually dropped.
_blank_text = df['Title+Content'].str.strip().isin(['', '\u200b'])
df['Title+Content'] = df['Title+Content'].mask(_blank_text, np.nan)
df.dropna(subset=['Title+Content'], inplace=True, ignore_index=True)

In [138]:
# Display the frame's (rows, columns) after the blank-text rows were dropped.
df.shape

(1978, 12)

### Translation to English

In [139]:
from deep_translator import GoogleTranslator
import math

In [140]:
def trans(text, max_chars=4999):
    """Translate Tagalog text to English with GoogleTranslator, chunking long input.

    Text no longer than ``max_chars`` is translated in a single request. Longer text
    is cut into pieces of at most ``max_chars`` characters, preferably at a space so
    words are not split, and the translated pieces are joined with single spaces.
    Cutting in half only, as before, still produced oversized pieces for very long text.

    Args:
        text: The string to translate.
        max_chars: Largest piece sent in one request. The default stays just below
            5000 — presumably deep_translator rejects inputs of 5000 characters or
            more; confirm against the installed version.

    Returns:
        The translator's result for short input (returned unchanged), or the joined
        translations of all pieces for long input.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    translator = GoogleTranslator(source='tl', target='en')
    if len(text) <= max_chars:
        return translator.translate(text)

    pieces = []
    start = 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        if end < len(text):
            # Break at the last space inside the window; searching from start + 1
            # skips the space a previous cut may have left at the piece's start.
            last_space = text.rfind(" ", start + 1, end)
            if last_space != -1:
                end = last_space
        pieces.append(text[start:end])
        start = end

    # Skip whitespace-only pieces and empty/None results so the join cannot fail.
    translated = (translator.translate(piece) for piece in pieces if piece.strip())
    return " ".join(part for part in translated if part)

In [142]:
# Translate rows at positions 0-499.
trans_1 = df['Title+Content'].iloc[:500].apply(trans)

In [143]:
# Translate rows at positions 500-999.
trans_2 = df['Title+Content'].iloc[500:1000].apply(trans)

In [144]:
# Translate rows at positions 1000-1499.
trans_3 = df['Title+Content'].iloc[1000:1500].apply(trans)

In [147]:
# Translate rows at positions 1500-1999.
trans_4 = df['Title+Content'].iloc[1500:2000].apply(trans)

In [148]:
# Translate any rows from position 2000 onward (an empty slice when there are fewer rows).
trans_5 = df['Title+Content'].iloc[2000:].apply(trans)

In [149]:
# Stitch the translated batches back together in their original order.
translated_batches = [trans_1, trans_2, trans_3, trans_4, trans_5]
df_test = pd.concat(translated_batches)

In [150]:
# Turn the Series into a one-column frame and rename that column to "Translated".
translated_frame = df_test.to_frame()
df_test = translated_frame.rename(columns={"Title+Content": "Translated"})

In [151]:
# Clean the translated text: strip person names first, then punctuation, then lowercase.
for _step in (remove_names, remove_punctuation, lowercase):
    df_test["Translated"] = df_test["Translated"].apply(_step)

In [None]:
# 'Translated' is only inserted into df by a later cell, so on a fresh top-to-bottom run
# this drop would raise KeyError. Apply it only once the column exists.
if 'Translated' in df.columns:
    df.dropna(subset=['Translated'], inplace=True, ignore_index=True)

In [152]:
# Engagements = (upvotes + comments) / upvote ratio, aligned to df_test by index.
interaction_total = df["Upvotes Count"] + df["Comments Count"]
df_test["Engagements"] = interaction_total / df["Upvote Ratio"]

In [153]:
# Copy the derived columns into df at fixed positions (Translated first, then Engagements).
for _position, _column_name in ((5, "Translated"), (8, "Engagements")):
    df.insert(_position, _column_name, df_test[_column_name])

In [4]:
# Write the compiled frame without the index column. The path has no placeholders,
# so it is a plain string literal rather than an f-string.
df.to_csv("../dataset/_compiled/Compiled.csv", index=False)