In [1]:
# import libraries

import praw
import pandas as pd
import os
import time
from dotenv import load_dotenv

In [2]:
# load environment variables
load_dotenv()

# read environment variables
client_id = os.getenv("REDDIT_CLIENT_ID")
client_secret = os.getenv("REDDIT_CLIENT_SECRET")
user_agent = os.getenv("REDDIT_USER_AGENT")

In [3]:
# create reddit instance
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

In [4]:
rphilippines = reddit.subreddit('philippines')

In [10]:
data = pd.DataFrame(columns=['id', 'title', 'score', 'subreddit', 'url', 'num_comments', 'body', 'created']).set_index('id')

In [16]:
def get_submissions(data, subreddit, keyword, limit):
        
    try:
        for submission in subreddit.search(keyword, limit=limit, syntax='cloudsearch'):
            data.loc[submission.id] = [submission.title, submission.score, submission.subreddit, submission.url, submission.num_comments, submission.selftext, submission.created]
    except praw.exceptions.RedditAPIException as e:
        if e.error_type == 'RATELIMIT':
            delay_seconds = int(e.message.split(' ')[-1])
            print(f'Hit rate limit. Sleeping for {delay_seconds} seconds.')
            time.sleep(delay_seconds + 1)
        else:
            raise
    
    return data

In [17]:
# get submissions
data = get_submissions(data, rphilippines, 'sinovac', 10000)

In [18]:
data.shape

(238, 7)

In [19]:
data = get_submissions(data, rphilippines, 'pfizer', 10000)

In [21]:
data = get_submissions(data, rphilippines, 'moderna', 10000)

In [23]:
data = get_submissions(data, rphilippines, 'astrazeneca', 10000)

In [24]:
data = get_submissions(data, rphilippines, 'J and J', 10000)

In [25]:
data = get_submissions(data, rphilippines, 'vaccine', 10000)

In [26]:
data = get_submissions(data, rphilippines, 'vaccination', 10000)

In [27]:
keywords = [
    'COVID vaccine Philippines',
    'Vaccine hesitancy Philippines',
    'Sinovac Philippines',
    'Pfizer Philippines',
    'Moderna Philippines',
    'AstraZeneca Philippines',
    'Vaccination rollout Philippines',
    'Vaccine efficacy Philippines',
    'Vaccine safety Philippines',
    'Vaccination program Philippines',
    'Vaccine side effects Philippines',
    'Vaccination drive Philippines',
    'COVID vaccination Philippines',
    'Vaccine misinformation Philippines',
    'Vaccine skepticism Philippines',
    'Vaccine mandates Philippines',
    'Vaccine distribution Philippines',
    'Vaccination campaign Philippines',
    '#COVIDVaccinePH',
    '#VaccinePH',
    '#GetVaccinatedPH',
    '#VaccineForAllPH',
    '#VaccineRolloutPH',
    '#VaccineSafetyPH',
    '#VaccineEfficiencyPH'
]

for kw in keywords:
    data = get_submissions(data, rphilippines, kw, 10000)

In [28]:
# save data
data.to_csv('reddit_posts_data.csv')

In [29]:
# check duplicates
data[data.duplicated(subset=['title', 'body'], keep=False)].sort_values(by='title')

Unnamed: 0_level_0,title,score,subreddit,url,num_comments,body,created
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mcv85r,'Wala pa': Duterte says Philippines yet to buy...,10,Philippines,https://news.abs-cbn.com/news/03/24/21/wala-pa...,10,,1616669000.0
mcnyrc,'Wala pa': Duterte says Philippines yet to buy...,19,Philippines,https://news.abs-cbn.com/news/03/24/21/wala-pa...,15,,1616640000.0
nkje4i,Covid made the Philippines' hunger crisis wors...,27,Philippines,https://www.cnn.com/2021/05/24/asia/philippine...,16,,1621926000.0
nzcv9o,Covid made the Philippines' hunger crisis wors...,3,Philippines,https://edition.cnn.com/2021/05/24/asia/philip...,4,,1623639000.0
kkw5bc,Duterte to US: Deliver COVID-19 vaccine to PH ...,1,Philippines,https://news.abs-cbn.com/news/12/27/20/duterte...,8,,1609039000.0
kkx6kf,Duterte to US: Deliver COVID-19 vaccine to PH ...,13,Philippines,https://i.redd.it/1dr57cbyqn761.jpg,25,,1609044000.0
j01ti7,FDA says 'best case scenario' of COVID-19 vacc...,3,Philippines,https://www.rappler.com/nation/fda-says-covid-...,1,,1601106000.0
izg5jl,FDA says 'best case scenario' of COVID-19 vacc...,0,Philippines,https://www.rappler.com/nation/fda-says-covid-...,3,,1601024000.0
kpm4z5,PH seeks $300-M World Bank loan to buy COVID v...,22,Philippines,https://newsinfo.inquirer.net/1378725/ph-seeks...,2,,1609687000.0
kpb81x,PH seeks $300-M World Bank loan to buy COVID v...,7,Philippines,https://newsinfo.inquirer.net/1378725/ph-seeks...,6,,1609640000.0


In [30]:
# drop duplicates
data.drop_duplicates(subset=['title', 'body'], keep='first', inplace=True)

In [37]:
data.columns

Index(['title', 'score', 'subreddit', 'url', 'num_comments', 'body',
       'created'],
      dtype='object')

In [36]:
# Check if the body is empty
data[data['body'] != '']

Unnamed: 0_level_0,title,score,subreddit,url,num_comments,body,created
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
lw0grz,AMA: I am an HCW who received their first dose...,1092,Philippines,https://www.reddit.com/r/Philippines/comments/...,397,Edit - 5 hour solo AMA grabe. Thanks to all t...,1.614687e+09
p2fwha,"Sinovac worked, I guess",163,Philippines,https://www.reddit.com/r/Philippines/comments/...,96,"My brother's friend, fully vaccinated with Sin...",1.628697e+09
kv06lx,Harry Roque has just reiterated multiple times...,562,Philippines,https://www.reddit.com/r/Philippines/comments/...,153,I have had enough! Bastusan na talaga ito. \n\...,1.610361e+09
mrch5k,Efficacy of Sinovac's CoronaVac vaccine,215,Philippines,https://www.reddit.com/r/Philippines/comments/...,68,There has been a lot of conflicting reports on...,1.618485e+09
mecovq,AMA 2: I'm an HCW who received Sinovac COVID19...,94,Philippines,https://www.reddit.com/r/Philippines/comments/...,145,Previous AMA: [https://www.reddit.com/r/Philip...,1.616847e+09
...,...,...,...,...,...,...,...
lrvb8f,Mon Tulfo Full Transcript interview about his ...,4,Philippines,https://www.reddit.com/r/Philippines/comments/...,1,The entire transcript can be read here: [https...,1.614220e+09
hjcrwl,"Longshot, but anyone here not able to see thei...",0,Philippines,https://www.reddit.com/r/Philippines/comments/...,2,"To clarify, I totally understand why we have t...",1.593618e+09
6sz7fb,Planning to buy a puppy,2,Philippines,https://www.reddit.com/r/Philippines/comments/...,24,Hi dog owners of r/Ph! Tanong ko lang if masya...,1.502430e+09
gd372h,If only Duterte had acted on his initial insti...,0,Philippines,https://www.reddit.com/r/Philippines/comments/...,21,"""It is really the world has passed on many kin...",1.588559e+09


In [45]:
# extract all the comments from the submission
def get_submission_comments(submission_id):
    submission = reddit.submission(id=submission_id)
    submission.comments.replace_more(limit=None) # Replace "MoreComments" objects with actual comments
    comments = []
    for comment in submission.comments:
        comments.append(comment)
        comments.extend(comment.replies)
    return comments

In [61]:
# convert comments to dataframe

comments_data = pd.DataFrame(columns=['id', 'body', 'score', 'created']).set_index('id')

In [62]:
import prawcore

# get comments
submission_ids = data.index

for submission_id in submission_ids:
    while True:
        try:
            comments = get_submission_comments(submission_id)
            break
        except (praw.exceptions.APIException, prawcore.exceptions.ServerError) as e:
            if isinstance(e, praw.exceptions.APIException):
                print(f"Encountered an API exception: {e}")
            elif isinstance(e, prawcore.exceptions.ServerError):
                print(f"Encountered a server error: {e}")
            else:
                print(f"Encountered an exception: {e}")
                
            print(f"Waiting for {30} seconds...")
            time.sleep(30)

    for comment in comments:
        comments_data.loc[comment.id] = [comment.body, comment.score, comment.created]


KeyboardInterrupt: 

In [68]:
# remove duplicates
comments_data.drop_duplicates(subset=['body'], keep='first', inplace=True)

In [69]:
# save data as pickle
comments_data.to_pickle('reddit_comments_data.pkl')

In [None]:
# Load pickle file
comments_data = pd.read_pickle('reddit_comments_data.pkl')

In [81]:
comments_data.body

id
h02hbk3    hahahaha. yung pinsan kong dds ganyan yung nan...
h02shqk    his mistake was bakit siya umuwi na di pa pala...
h02i5ib    i'm sure there is a place for him with the dds...
h02iiq9                  comment mo na lang sa amok post nya
h02iy2n    i think its better giving them advise to reach...
                                 ...                        
i89lggp    yeah but i worry that marcos won't do anything...
i89lhrd    he might acquire this to solidify their power ...
i89r3ao    kahit ano sa pic basta yung top of the line fu...
i89t0c0    lahat ng nasa pic ay nabili na natin hindi pa ...
ihuevdx    "let's take a break muna from the election" "m...
Name: body, Length: 35303, dtype: object

In [84]:
comments_data.columns

Index(['body', 'score', 'created'], dtype='object')

In [82]:
# remove empty submissions
data = data[data['body'] != '']

In [88]:
# concat data[['body', 'score', 'created']] to comments_data

comments_data = pd.concat([comments_data, data[['body', 'score', 'created']]])

In [89]:
comments_data.shape

(35790, 3)

In [90]:
# remove \n and \t in the body
comments_data['body'] = comments_data['body'].str.replace('\n', ' ')
comments_data['body'] = comments_data['body'].str.replace('\t', ' ')
# remove punctuations in the body
comments_data['body'] = comments_data['body'].str.replace('[^\w\s]', '')
# remove numbers in the body
comments_data['body'] = comments_data['body'].str.replace('\d+', '')
# remove extra spaces in the body
comments_data['body'] = comments_data['body'].str.replace(' +', ' ')
# remove empty comments
comments_data = comments_data[comments_data['body'] != '']
# lowercase the body
comments_data['body'] = comments_data['body'].str.lower()

In [92]:
# save data as pickle
comments_data.to_pickle('reddit_comments_data.pkl')

In [94]:
keywords = [ 'sinovac', 'pfizer', 'moderna', 'astrazeneca', 'j and j', 'vaccine', 'vaccination', 'booster']

In [95]:
# Filter comments that only contain the keywords
comments_data = comments_data[comments_data['body'].str.contains('|'.join(keywords))]

In [97]:
# save data as csv

comments_data.to_csv('reddit_comments_data.csv', index=False)