In [102]:
import pandas as pd # pip install pandas
import re # standard library, no install needed
import string 
import markdown # pip install markdown
from bs4 import BeautifulSoup # pip install beautifulsoup4
import csv
import emoji
import nltk

# Lift pandas' display limits so frames are rendered without truncation.
pd.options.display.max_columns = None  # or 1000
pd.options.display.max_rows = None  # or 1000
pd.options.display.max_colwidth = None  # or 199

# Read the four raw listing exports, keeping one frame per listing.
hot, top, controversial, new = map(
    pd.read_csv,
    (
        "../dataset/raw/hot.csv",
        "../dataset/raw/top.csv",
        "../dataset/raw/controversial.csv",
        "../dataset/raw/new.csv",
    ),
)

# Stack them into one frame with a fresh 0..n-1 index.
df = pd.concat([hot, top, controversial, new], ignore_index=True)

In [103]:
# Keep only the rows that have a value in 'Content', then show the new shape.
df = df.dropna(subset=['Content'])
df.shape

(1978, 10)

In [104]:
import unicodedata
import sys
# Matches a single URL: an http(s):// or www. prefix followed by everything up
# to the next whitespace, bracket or parenthesis (so a Markdown link
# "[label](https://...)" keeps its label and its closing parenthesis).
# The pattern is case-insensitive and deliberately has no leading/trailing
# ".*": substituting it removes only the URL, not the whole line around it.
# Bare domains without a scheme or "www." are left alone, because matching them
# also matches ordinary text such as "i.e." or a missing space after a period.
URL_REGEX = r"(?i)\b(?:https?://|www\.)[^\s<>()\[\]]+"
# Every character below sys.maxunicode whose Unicode category starts with 'P',
# concatenated in code-point order.
punctuation = "".join(
    filter(
        lambda ch: unicodedata.category(ch).startswith('P'),
        map(chr, range(sys.maxunicode)),
    )
)

# Remove URLs
def remove_url(cell):
    """Return ``str(cell)`` with every match of ``URL_REGEX`` deleted.

    What counts as a URL is decided entirely by the module-level
    ``URL_REGEX`` pattern; non-string input is stringified first.
    """
    text = str(cell)
    return re.sub(URL_REGEX, "", text)

# Convert markdown to plaintext
def md_to_text(cell):
    """Render a Markdown string to HTML, then return only the text of that HTML."""
    rendered_html = markdown.markdown(cell)
    return BeautifulSoup(rendered_html, features='html.parser').get_text()

# Replace NaN and \n
def remove_NaN_newline(cell):
    """Replace newlines and stand-alone ``NaN``/``nan`` tokens with a space.

    The input is stringified first, so a float NaN becomes the token "nan"
    and is blanked out as well.

    The word boundaries matter: without them the letters "nan" are also cut
    out of ordinary words ("nanay", "tenant", "financial"), which silently
    corrupts the text.
    """
    return re.sub(r"\n|\b(?:NaN|nan)\b", " ", str(cell))

def remove_punctuation(cell):
    """Delete from ``cell`` every character listed in the module-level ``punctuation``.

    ``cell`` must already be a string: it is not stringified here.
    """
    deletion_table = str.maketrans('', '', punctuation)
    return cell.translate(deletion_table)

# Convert slangs
# Parsed slang files, keyed by path, so each file is read once per kernel
# session instead of once per token. Re-running this cell clears the cache.
_SLANG_CACHE = {}


def _load_slang_map(slang_file):
    """Parse a ``slang=expansion`` file into a dict keyed by lower-cased slang."""
    slang_map = {}
    # NOTE(review): the file is read with the platform default encoding, as
    # before; pass an explicit encoding here once the file's encoding is known.
    with open(slang_file, "r", newline="") as handle:
        for row in csv.reader(handle, delimiter="="):
            # Blank or malformed lines have no key/expansion pair; skip them
            # instead of failing on row[0] / row[1].
            if len(row) < 2 or not row[0]:
                continue
            # A later row overwrites an earlier one with the same key.
            slang_map[row[0].lower()] = row[1]
    return slang_map


def replace_abbrv(cell, slang_file="slang.txt"):
    """Expand slang/abbreviation tokens in ``cell`` using a lookup file.

    The input is stringified and split on single spaces. Each token is
    stripped of every character other than ASCII letters, digits, ``-``,
    ``_`` and ``.``; if what remains matches a slang key (case-insensitive),
    the whole token is replaced by that key's expansion. Other tokens are
    kept exactly as they were.

    Parameters
    ----------
    cell : any
        Value to process; converted with ``str()``.
    slang_file : str, optional
        Path to a text file with one ``slang=expansion`` pair per line.
        Defaults to ``"slang.txt"`` in the working directory.

    Returns
    -------
    str
        The tokens joined back together with single spaces.

    Raises
    ------
    OSError
        If ``slang_file`` cannot be opened.
    """
    if slang_file not in _SLANG_CACHE:
        _SLANG_CACHE[slang_file] = _load_slang_map(slang_file)
    slang_map = _SLANG_CACHE[slang_file]

    tokens = str(cell).split(" ")
    for position, token in enumerate(tokens):
        key = re.sub('[^a-zA-Z0-9-_.]', '', token).lower()
        if key in slang_map:
            tokens[position] = slang_map[key]
    return ' '.join(tokens)

def remove_emoji(text):
    """Return ``text`` after ``emoji.replace_emoji`` has swapped each emoji for an empty string."""
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji

def remove_numbers(cell):
    """Stringify ``cell`` and replace each run of digits with a single space."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub(" ", str(cell))

def lowercase(cell):
    """Stringify ``cell`` and return it in lower case."""
    text = str(cell)
    return text.lower()

In [105]:
# Run the same cleaning steps, in the same order, over both text columns:
# slang expansion first, punctuation stripping last.
df['Content'] = (
    df['Content']
    .apply(replace_abbrv)
    .apply(remove_url)
    .apply(md_to_text)
    .apply(remove_NaN_newline)
    .apply(remove_emoji)
    .apply(remove_numbers)
    .apply(lowercase)
    .apply(remove_punctuation)
)

df['Title'] = (
    df['Title']
    .apply(replace_abbrv)
    .apply(remove_url)
    .apply(md_to_text)
    .apply(remove_NaN_newline)
    .apply(remove_emoji)
    .apply(remove_numbers)
    .apply(lowercase)
    .apply(remove_punctuation)
)

In [106]:
# Append the combined "title + space + body" column, then renumber the rows 0..n-1.
df = df.assign(
    **{"Title+Content": df["Title"] + " " + df["Content"]}
).reset_index(drop=True)

### Translation to English

In [107]:
from deep_translator import GoogleTranslator
import math

In [108]:
def _split_for_translation(text, max_chars):
    """Cut ``text`` into pieces of at most ``max_chars`` characters, cutting at spaces when possible."""
    pieces = []
    remaining = text
    while len(remaining) > max_chars:
        # Last space that still leaves the piece within the limit.
        cut = remaining.rfind(" ", 0, max_chars + 1)
        if cut > 0:
            pieces.append(remaining[:cut])
            remaining = remaining[cut + 1:]  # drop the space we cut at
        else:
            # No usable space in the window: fall back to a hard cut.
            pieces.append(remaining[:max_chars])
            remaining = remaining[max_chars:]
    if remaining:
        pieces.append(remaining)
    return pieces


def trans(text, translator=None, max_chars=4999):
    """Translate ``text`` from Tagalog to English, splitting long inputs.

    Text of up to ``max_chars`` characters is sent in one request. Longer
    text is cut into as many pieces as needed (at spaces where possible),
    each piece is translated separately, and the results are joined with
    single spaces. The previous "split in two halves" approach still sent
    over-long requests for very long posts and could cut a word in half.

    Parameters
    ----------
    text : str
        Text to translate.
    translator : object, optional
        Anything with a ``translate(str) -> str`` method. Defaults to
        ``GoogleTranslator(source='tl', target='en')``.
    max_chars : int, optional
        Largest piece sent in one request. The default stays just under the
        5000-character request limit (NOTE(review): limit taken from the
        deep_translator documentation, confirm against the installed version).

    Returns
    -------
    str
        The translated text. For input within ``max_chars`` this is whatever
        the translator returns.

    Raises
    ------
    ValueError
        If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    if translator is None:
        translator = GoogleTranslator(source='tl', target='en')

    if len(text) <= max_chars:
        return translator.translate(text)

    translated_pieces = (
        translator.translate(piece)
        for piece in _split_for_translation(text, max_chars)
    )
    # Skip empty/None results so one blank piece cannot break the join.
    return " ".join(piece for piece in translated_pieces if piece)

In [109]:
# Batch 1: rows at positions 0-499.
trans_1 = df['Title+Content'].iloc[:500].apply(trans)

In [110]:
# Batch 2: rows at positions 500-999.
trans_2 = df['Title+Content'].iloc[500:1000].apply(trans)

In [111]:
# Batch 3: rows at positions 1000-1499.
trans_3 = df['Title+Content'].iloc[1000:1500].apply(trans)

In [112]:
# Batch 4: rows at positions 1500-1999.
trans_4 = df['Title+Content'].iloc[1500:2000].apply(trans)

In [113]:
# Batch 5: every row from position 2000 onwards (empty when there are fewer rows).
trans_5 = df['Title+Content'].iloc[2000:].apply(trans)

In [114]:
# Stitch the translated batches back into one Series, in batch order.
translated_batches = [trans_1, trans_2, trans_3, trans_4, trans_5]
df_test = pd.concat(translated_batches)

In [115]:
# Turn the Series into a one-column frame and name that column "Translated".
df_test = df_test.to_frame()
df_test = df_test.rename(columns={"Title+Content": "Translated"})

In [116]:
# Clean the translated text again: strip punctuation first, then lower-case.
df_test["Translated"] = (
    df_test["Translated"]
    .apply(remove_punctuation)
    .apply(lowercase)
)

In [117]:
# Place the translated text as the column at position 5 of the main frame.
df.insert(loc=5, column="Translated", value=df_test)

In [118]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Timestamp,Content Type,Title,Content,Upvotes Count,Translated,Comments Count,Upvote Ratio,Flair,Permalink,Reddit ID,Title+Content
0,2024-04-21 13:01:23,Text-only,esena,help i filed a request for assistance sa esena re last pay na inabot ng months at coe last friday meron na akong ref number ng request for assistance kaso that same day biglang nagmessage ang human resources na okay na ang cheke ng last pay ko at pwede ko na sya ipick up my question is paano icancel yung request for assistance since nakuha ko na yung last pay at coe ko thank you po,2,esena help i filed a request for assistance at esena re last pay that took months and coe last friday i have the ref number of the request for assistance case that same day suddenly the human resources messaged that the check of my last pay is okay and can i have already picked him up my question is how to cancel the request for assistance since i already got my last pay and coe thank you,3,1.0,Advice Needed 🤔,https://www.reddit.com//r/AntiworkPH/comments/1c99rt3/esena/,1c99rt3,esena help i filed a request for assistance sa esena re last pay na inabot ng months at coe last friday meron na akong ref number ng request for assistance kaso that same day biglang nagmessage ang human resources na okay na ang cheke ng last pay ko at pwede ko na sya ipick up my question is paano icancel yung request for assistance since nakuha ko na yung last pay at coe ko thank you po
1,2024-04-20 14:50:31,Text-only,dodged a bad company and human resources i guess,ive applied for one of the subsidiaries of aboitiz and id really like to share how horrible the application process or how the human resources handled it i guess was bale i just graduated from my electrical engineering degree this january and ive started to apply to a few companies including this subsidiary from aboitiz when compared to companies ive also done interviews with like pldt jg summit and smc sobrang nakakadisappoint yung experience ko with aboitiz i had my expectations for aboitiz since it was known to be one of the top companies and employers sa philippines kaya siguro naman formal and may class talaga yung way kung paano makipagcommunicate yung human resources right kaso hindi para bang nakikipagcommunicate lang ako with someone sa isang lowend company who is deeply struggling in handling communications ang problema pa rito was that i was the one who was understanding and patient kahit na sila yung may lapses in handling my application three months have passed and theyve contacted me saying that my application was already being considered but it was placed on hold due to a problem sa vacancy ng position with that i just had to withdraw and say that i wasnt already comfortable with the process and how poorly they were handling my application i withdrew kasi baka my application was a hassle on their end pala tsaka baka hinihintay lang talaga ng human resources na magbreak ako towards their treatmentattitude nagreply yung human resources sa withdrawal ko hindi ko malagay exact phrases but heres the gist human resources said the usual stuff muna like were deeply sorry about the inconviences made on your end blablabla sorry to see you go tapos ayon sa dulo sinabi na sobrang arte ko raw for complaining and that i should man up more to understand reallife problems that are being encounted sa office nila i shouldnt be complaining din daw kasi hindi naman ako employee to ask for 
something on their end what i was just asking for was respect heres one one out of a lot inconvenienceproblem that happened they asked me to attend an f f interview sa office nila the two hour travel going to their office didnt matter kasi i was enjoying my stay sa province namin to avoid the city heat however when i arrived then waited for two hours sa lobby nobody sa division division ata or department kung saan yung mga mamemeet ko remembered that i had a scheduled interview the human resources responsible for this was called to have a meeting sa ibang office at a different city so nobody was left sa office nila to where i was to arrangeconduct the interview so ayon ending was that i was asked to go home instead forgave them for that kasi understandable na baka may miscommunications sila sa office kaso ayun lang binerate pa ako ng hr,26,dodged a bad company and human resources i guess ive applied for one of the subsidiaries of aboitiz and id really like to share how horrible the application process or how the human resources handled it i guess was bale i just graduated from my electrical engineering degree this january and ive started to apply to a few companies including this subsidiary from aboitiz when compared to companies ive also done interviews with like pldt jg summit and smc my experience with aboitiz was very disappointing i had my expectations for aboitiz since it was known to be one of the top companies and employers in the philippines so maybe the way human resources communicate is formal and classy ​​right its not like im just communicating with someone in a lowend company who is deeply struggling in handling communications the problem here was that i was the one who was understanding and patient even though they were the ones who had lapses in handling my application three months have passed and theyve contacted me saying that my application was already being considered but it was placed on hold due to a problem with the vacancy of the position 
with that i just had to withdraw and say that i wasnt already comfortable with the process and how poorly they were handling my application i withdrew because maybe my application was a hassle on their end besides maybe human resources is really just waiting for me to break towards their treatmentattitude human resources replied to my withdrawal i cant put exact phrases but heres the gist human resources said the usual stuff first like were deeply sorry about the inconveniences made on your end blablabla sorry to see you go then according to the end they said i was too arty raw for complaining and that i should man up more to understand reallife problems that are being encountered in their office i shouldnt be complaining too because i am not an employee to ask for something on their end what i was just asking for was respect heres one one out of a lot inconvenience problem that happened they asked me to attend an f f interview in their office the two hour travel going to their office didnt matter because i was enjoying my stay in our province to avoid the city heat however when i arrived then waited for two hours in the lobby nobody in the division division or department where the people i met remembered that i had a scheduled interview the human resources responsible for this was called to have a meeting in another office in a different city so nobody was left in their office where i was to arrangeconduct the interview so according to the ending was that i was asked to go home instead forgave them for that because it is understandable that they might have miscommunications in the office in that case the hr still rated me,2,0.93,Story 🗣️,https://www.reddit.com//r/AntiworkPH/comments/1c8jhvr/dodged_a_bad_company_and_hr_i_guess/,1c8jhvr,dodged a bad company and human resources i guess ive applied for one of the subsidiaries of aboitiz and id really like to share how horrible the application process or how the human resources handled it i guess was bale i just 
graduated from my electrical engineering degree this january and ive started to apply to a few companies including this subsidiary from aboitiz when compared to companies ive also done interviews with like pldt jg summit and smc sobrang nakakadisappoint yung experience ko with aboitiz i had my expectations for aboitiz since it was known to be one of the top companies and employers sa philippines kaya siguro naman formal and may class talaga yung way kung paano makipagcommunicate yung human resources right kaso hindi para bang nakikipagcommunicate lang ako with someone sa isang lowend company who is deeply struggling in handling communications ang problema pa rito was that i was the one who was understanding and patient kahit na sila yung may lapses in handling my application three months have passed and theyve contacted me saying that my application was already being considered but it was placed on hold due to a problem sa vacancy ng position with that i just had to withdraw and say that i wasnt already comfortable with the process and how poorly they were handling my application i withdrew kasi baka my application was a hassle on their end pala tsaka baka hinihintay lang talaga ng human resources na magbreak ako towards their treatmentattitude nagreply yung human resources sa withdrawal ko hindi ko malagay exact phrases but heres the gist human resources said the usual stuff muna like were deeply sorry about the inconviences made on your end blablabla sorry to see you go tapos ayon sa dulo sinabi na sobrang arte ko raw for complaining and that i should man up more to understand reallife problems that are being encounted sa office nila i shouldnt be complaining din daw kasi hindi naman ako employee to ask for something on their end what i was just asking for was respect heres one one out of a lot inconvenienceproblem that happened they asked me to attend an f f interview sa office nila the two hour travel going to their office didnt matter kasi i was enjoying my 
stay sa province namin to avoid the city heat however when i arrived then waited for two hours sa lobby nobody sa division division ata or department kung saan yung mga mamemeet ko remembered that i had a scheduled interview the human resources responsible for this was called to have a meeting sa ibang office at a different city so nobody was left sa office nila to where i was to arrangeconduct the interview so ayon ending was that i was asked to go home instead forgave them for that kasi understandable na baka may miscommunications sila sa office kaso ayun lang binerate pa ako ng hr
2,2024-04-20 12:11:03,Image,sick leave na pahirapan,im under an agency nag file ako ng sick leave for some reason not because im sick prior to that nag ask na ako ng permission sa client ko days ahead i also provided med cert tapos itong company nato need pa ng screenshotdocumentation na nag ask ako ng permission sa client so balewala yung med cert ko wala yung acknowledgement from client ganito ba talaga kahirap mah file ng sick leave sa pilipinas,37,sick leave to be tortured im under an agency i filed a sick leave for some reason not because im sick prior to that i asked permission from my client days ahead i also provided med cert then this company we still need a screenshotdocumentation that i asked permission from the client so my med cert is irrelevant there is no acknowledgment from the client is it really this difficult to file a sick leave in the philippines,6,0.94,Rant 😡,https://www.reddit.com//r/AntiworkPH/comments/1c8gv4u/sick_leave_na_pahirapan/,1c8gv4u,sick leave na pahirapan im under an agency nag file ako ng sick leave for some reason not because im sick prior to that nag ask na ako ng permission sa client ko days ahead i also provided med cert tapos itong company nato need pa ng screenshotdocumentation na nag ask ako ng permission sa client so balewala yung med cert ko wala yung acknowledgement from client ganito ba talaga kahirap mah file ng sick leave sa pilipinas
3,2024-04-21 00:36:41,Text-only,help a nervous young adult out,​ overkill ba yung hours a week magkakaroon pa kaya ako nito ng worklife balance oks ba yung k starting pay as probationary for months hindi ko rin matukoy kung oks kasi hindi ko pa alam masyado yung workload eh kung chill oks naman pero kung magiging overworked ako edi super kulang haha for people who have worked this role kumusta kindly let me know if maganda ba tong role na to or what haha ​ ayun maraming salamat sa mga sasagot pasensiya na sa gulo at sa mga tanong wala kasi akong magulangolder adult sa buhay kaya walang mahingian ng guidance tips and encouragement haha thank you uli,1,help a nervous young adult out are the hours a week too much will i still have a worklife balance is starting pay as probationary for months ok but if i become overworked its super lacking haha ​​for people who have worked this role how are you kindly let me know if this role is good or what haha ​​​ well thank you very much to those who will answer sorry for the trouble and the questions because i dont have parents adult in life so no one can ask for guidance tips and encouragement haha ​​thank you again,0,0.67,Advice Needed 🤔,https://www.reddit.com//r/AntiworkPH/comments/1c8u95a/help_a_nervous_young_adult_out/,1c8u95a,help a nervous young adult out ​ overkill ba yung hours a week magkakaroon pa kaya ako nito ng worklife balance oks ba yung k starting pay as probationary for months hindi ko rin matukoy kung oks kasi hindi ko pa alam masyado yung workload eh kung chill oks naman pero kung magiging overworked ako edi super kulang haha for people who have worked this role kumusta kindly let me know if maganda ba tong role na to or what haha ​ ayun maraming salamat sa mga sasagot pasensiya na sa gulo at sa mga tanong wala kasi akong magulangolder adult sa buhay kaya walang mahingian ng guidance tips and encouragement haha thank you uli
4,2024-04-21 00:15:20,Text-only,absent without leave kumustahan,i know this maybe off pero curious talaga ako sa former government employees dito na nagawol kumusta kayo hirap panong magclearance daw kaso gusto ko na talagang umalis dito as soon as possible since super toxic na never again na talaga akong magovernment,1,absent without leave how are you i know this maybe off but im really curious about the former government employees here who have been lost how are you how are you having a hard time getting clearance i really want to leave here as soon as possible since its super toxic and ill never really be in government again,4,1.0,Discussions 💭,https://www.reddit.com//r/AntiworkPH/comments/1c8trku/awol_kumustahan/,1c8trku,absent without leave kumustahan i know this maybe off pero curious talaga ako sa former government employees dito na nagawol kumusta kayo hirap panong magclearance daw kaso gusto ko na talagang umalis dito as soon as possible since super toxic na never again na talaga akong magovernment


In [119]:
# Write the cleaned and translated frame to disk without the index column.
df.to_csv("../dataset/_compiled/Compiled.csv", index=False)

In [120]:
# df_save = df.copy(deep=True)