# Loading Dataset and Merging

In [1]:
import pandas as pd
import os

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/princessventures/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/princessventures/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
dataset_path = '../dataset'

dataframes = []

for filename in os.listdir(dataset_path):
    if filename.endswith('.csv'):
        
        file_path = os.path.join(dataset_path, filename)
       
        parts = filename.split('_')
        subreddit_name = parts[1]
        search_input = parts[2]
        
        df = pd.read_csv(file_path, parse_dates=['created'], dtype={
            'title': 'string', 'body': 'string', 'link': 'string', 
            'author': 'string', 'upvotes': 'int64'
        })
        
        df['subreddit'] = subreddit_name
        df['search_input'] = search_input
        
        if not df.empty and not df.isna().all().all():
            dataframes.append(df)

if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
else:
    print("No valid CSV files found or all files are empty.")

In [3]:
combined_df.head()

Unnamed: 0,title,body,link,author,created,upvotes,subreddit,search_input
0,"I don't care if you're a girl, unsolicited pic...","So like many of the people on this subreddit, ...",https://www.reddit.com/r/alasjuicy/comments/10...,t2_uzp1eor9,2023-01-10 18:35:34,127,alasjuicy,unsolicited pics
1,Unsolicited Dick Pics Enjoyer,Bago lang ako naging active dito sa AJ. Mostly...,https://www.reddit.com/r/alasjuicy/comments/1a...,t2_9x1nt0ze2,2024-02-08 03:58:45,70,alasjuicy,unsolicited pics
2,UNSOLICITED DICK PIC,Hello men of aj! Eto sakin lang ah. Para saki...,https://www.reddit.com/r/alasjuicy/comments/18...,t2_q6lir5af,2023-11-27 19:50:18,84,alasjuicy,unsolicited pics
3,Anong meron sa unsolicited pics?,Hello there! So curious lang ako sa sagot niyo...,https://www.reddit.com/r/alasjuicy/comments/12...,t2_tbimt3mv,2023-03-28 23:40:43,10,alasjuicy,unsolicited pics
4,unsolicited poging dck pics,good afternoon aj peeps! your 20 y/o girl is b...,https://www.reddit.com/r/alasjuicy/comments/15...,t2_t3ml38tb,2023-08-08 17:09:15,46,alasjuicy,unsolicited pics


In [4]:
combined_df.shape

(1921, 8)

In [5]:
combined_df.describe()

Unnamed: 0,created,upvotes
count,1921,1921.0
mean,2022-06-18 22:52:46.305049600,66.346694
min,2013-08-07 01:22:56,0.0
25%,2021-11-26 19:02:16,2.0
50%,2023-03-08 13:59:45,9.0
75%,2023-10-16 16:50:43,36.0
max,2024-03-05 05:01:06,4927.0
std,,216.99102


In [6]:
combined_df.count()

title           1921
body            1734
link            1921
author          1921
created         1921
upvotes         1921
subreddit       1921
search_input    1921
dtype: int64

In [7]:
combined_df['subreddit'].unique()

array(['alasjuicy', 'OffmychestPH', 'relationship advicePH',
       'MentalHealthPH', 'Philippines'], dtype=object)

In [8]:
combined_df['search_input'].unique()

array(['unsolicited pics', 'abused', 'emotional abuse', 'Cyberbullying',
       'stalking', 'physical abuse', 'domestic violence',
       'sexual harassment', 'harassed'], dtype=object)

# Pre-processing Steps

In [9]:
combined_df.isnull().sum()

title             0
body            187
link              0
author            0
created           0
upvotes           0
subreddit         0
search_input      0
dtype: int64

In [10]:
combined_df.dropna(subset=['body'], inplace=True)

In [11]:
# text tokenization and stop words removal

stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

In [12]:
combined_df['title'] = combined_df['title'].apply(tokenize_and_remove_stopwords)
combined_df['body'] = combined_df['body'].apply(tokenize_and_remove_stopwords)

In [13]:
# feature engineering

# extract year, month, day from the 'created' column
combined_df['year'] = combined_df['created'].dt.year
combined_df['month'] = combined_df['created'].dt.month
combined_df['day'] = combined_df['created'].dt.day

In [14]:
# calculate post length
combined_df['post_length'] = combined_df['body'].apply(len)

In [15]:
# calculate day of the week and other time-related features
combined_df['day_of_week'] = combined_df['created'].dt.dayofweek
combined_df['is_weekend'] = combined_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
combined_df['time_of_day'] = pd.cut(combined_df['created'].dt.hour, 
                                   bins=[0, 6, 12, 18, 24], 
                                   labels=['Night', 'Morning', 'Afternoon', 'Evening'], 
                                   right=False)

In [16]:
# relative_upvotes
# normalize upvotes by the average upvotes per month to see if certain posts got unusually high engagement
monthly_avg_upvotes = combined_df.groupby('month')['upvotes'].transform('mean')
combined_df['upvotes_relative'] = combined_df['upvotes'] / monthly_avg_upvotes

In [17]:
membership_sizes = {
    'alasjuicy': 247000,
    'OffmychestPH': 493000,
    'relationship advicePH': 130000,
    'MentalHealthPH': 46000,
    'Philippines': 1700000
}

combined_df['subreddit_size'] = combined_df['subreddit'].map(membership_sizes)

In [18]:
combined_df['upvotes_per_capita'] = combined_df['upvotes'] / combined_df['subreddit_size']

In [19]:
# possible sana gumawa ng engangement metric to determine if mayroon tayong # of comments

In [None]:
# possible din gumawa ng post timing relative sa subreddit peak hours

In [20]:
combined_df

Unnamed: 0,title,body,link,author,created,upvotes,subreddit,search_input,year,month,day,post_length,day_of_week,is_weekend,time_of_day,upvotes_relative,subreddit_size,upvotes_per_capita
0,"[I, n't, care, 're, girl, ,, unsolicited, pics...","[So, like, many, people, subreddit, ,, I, take...",https://www.reddit.com/r/alasjuicy/comments/10...,t2_uzp1eor9,2023-01-10 18:35:34,127,alasjuicy,unsolicited pics,2023,1,10,140,1,0,Evening,2.685342,247000,0.000514
1,"[Unsolicited, Dick, Pics, Enjoyer]","[Bago, lang, ako, naging, active, dito, sa, AJ...",https://www.reddit.com/r/alasjuicy/comments/1a...,t2_9x1nt0ze2,2024-02-08 03:58:45,70,alasjuicy,unsolicited pics,2024,2,8,125,3,0,Night,1.048631,247000,0.000283
2,"[UNSOLICITED, DICK, PIC]","[Hello, men, aj, !, Eto, sakin, lang, ah, ., P...",https://www.reddit.com/r/alasjuicy/comments/18...,t2_q6lir5af,2023-11-27 19:50:18,84,alasjuicy,unsolicited pics,2023,11,27,146,0,0,Evening,1.112583,247000,0.000340
3,"[Anong, meron, sa, unsolicited, pics, ?]","[Hello, !, So, curious, lang, ako, sa, sagot, ...",https://www.reddit.com/r/alasjuicy/comments/12...,t2_tbimt3mv,2023-03-28 23:40:43,10,alasjuicy,unsolicited pics,2023,3,28,105,1,0,Evening,0.331898,247000,0.000040
4,"[unsolicited, poging, dck, pics]","[good, afternoon, aj, peeps, !, 20, y/o, girl,...",https://www.reddit.com/r/alasjuicy/comments/15...,t2_t3ml38tb,2023-08-08 17:09:15,46,alasjuicy,unsolicited pics,2023,8,8,164,1,0,Afternoon,0.822628,247000,0.000186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1916,"[The, Sacrificial, Lamb, (, 1, )]","[[, !, !, !, TRIGGER, WARNING, !, !, !, death,...",https://www.reddit.com/r/OffMyChestPH/comments...,t2_bggost36,2022-11-11 04:50:55,10,OffmychestPH,sexual harassment,2022,11,11,893,4,0,Night,0.132450,493000,0.000020
1917,"[Why, I, ca, n't, survive, person, autism, ., ...","[\, [, long, story, ,, pasensya, na, po\, ], H...",https://www.reddit.com/r/OffMyChestPH/comments...,t2_f81f4p4m,2023-03-06 19:21:46,6,OffmychestPH,sexual harassment,2023,3,6,1442,0,0,Evening,0.199139,493000,0.000012
1918,"[I, fix, /s]","[Hello, mga, ka-, r/OffMyChestPH, ,, first, re...",https://www.reddit.com/r/OffMyChestPH/comments...,t2_8qrq5cgm,2024-02-23 07:43:34,20,OffmychestPH,sexual harassment,2024,2,23,536,4,0,Morning,0.299609,493000,0.000041
1919,"[It, 's, difficult, recover, addiction, ,, esp...","[Trigger, Warning, ., I, 've, hypersexual, lon...",https://www.reddit.com/r/OffMyChestPH/comments...,t2_935kirllt,2023-04-19 11:03:04,1,OffmychestPH,sexual harassment,2023,4,19,422,2,0,Morning,0.019596,493000,0.000002


In [21]:
combined_df.to_csv('preprocessed_dataset.csv', index=False)

# PH Holidays

In [29]:
data = {
    'Date': ['01-01', '03-28', '03-29', '04-09', '05-01', '06-12', '08-26', '11-30', '12-25', '12-30', 
             '04-10', '08-21', '11-01', '12-08', '12-31', '02-09', '02-10', '03-30', '11-02', '12-24', '03-01'],
    'Holiday': ['New Year\'s Day', 'Maundy Thursday', 'Good Friday', 'Araw ng Kagitingan', 'Labor Day', 
                'Independence Day', 'National Heroes Day', 'Bonifacio Day', 'Christmas Day', 'Rizal Day', 
                'Eid\'l Fitr', 'Ninoy Aquino Day', 'All Saints\' Day', 'Feast of the Immaculate Conception of Mary', 
                'Last Day of the Year', 'Additional Special (Non-Working) Day', 'Chinese New Year', 'Black Saturday', 
                'All Souls\' Day', 'Christmas Eve', 'Women\'s Month Start'],
    'Type': ['Regular', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular', 'Regular', 
             'Regular', 'Regular', 'Special Non-Working', 'Special Non-Working', 'Special Non-Working', 
             'Special Non-Working', 'Special Non-Working', 'Special Non-Working', 'Special Non-Working', 
             'Special Non-Working', 'Special Non-Working', 'Special Non-Working']
}

holidays_df = pd.DataFrame(data)

holidays_df


Unnamed: 0,Date,Holiday,Type
0,01-01,New Year's Day,Regular
1,03-28,Maundy Thursday,Regular
2,03-29,Good Friday,Regular
3,04-09,Araw ng Kagitingan,Regular
4,05-01,Labor Day,Regular
5,06-12,Independence Day,Regular
6,08-26,National Heroes Day,Regular
7,11-30,Bonifacio Day,Regular
8,12-25,Christmas Day,Regular
9,12-30,Rizal Day,Regular


In [30]:
holidays_df.to_csv('ph_holidays.csv', index=False)