In [1]:
import os, json, time, gzip, io, datetime as dt
from pathlib import Path
from dotenv import load_dotenv
import boto3, praw

In [2]:
# --- configuration ---
# Script form of the project root (kept for reference; a notebook has no __file__):
# ROOT = Path(__file__).resolve().parents[1]
ROOT = Path('/home/ubuntu/deds2025b_proj/opt/reddit_pipeline')    # FOR NOTEBOOK ONLY
# Load the project's .env file before the environment lookups below.
load_dotenv(ROOT.joinpath('.env'))

# Both environment variables are mandatory: a missing one raises KeyError here.
BUCKET = os.environ["LAKE_BUCKET"]
PREFIX = "bronze/reddit"   # key prefix under which run() writes its S3 objects
REDDIT_SECRET_ARN = os.environ["REDDIT_SECRET_ARN"]

# AWS clients shared by the helper functions in this notebook.
s3 = boto3.client("s3")
secrets = boto3.client("secretsmanager")

In [3]:
# Subreddits to scan.
subreddits = [
    'Philippines',
    'OffMyChestPH',
    'adultingph',
    'AskPH',
]

# Terms looked for in post titles.
keywords = [
    'cancer',
    'malignant',
    'Malignant',
    'Metastasis',
    'Neoplasm',
    'Tumor',
    'Carcinoma',
    'Sarcoma',
    'Benign',
    'Tumor Grade',
    'Prognosis',
    'Chemotherapy',
    'Immunotherapy',
    'Biopsy',
    'Remission',
    'Oncology',
]

max_posts = 150            # notebook-level cap on matching posts
since_hours = 24 * 365     # look-back window in hours (8760 = one year)

In [4]:
# --- helper functions ---
def reddit_client():
    """Build a PRAW client from credentials stored in AWS Secrets Manager.

    Fetches the secret identified by ``REDDIT_SECRET_ARN``, parses its
    ``SecretString`` as JSON and passes the ``client_id`` and
    ``client_secret`` entries to ``praw.Reddit``.

    Returns:
        The ``praw.Reddit`` instance.
    """
    response = secrets.get_secret_value(SecretId=REDDIT_SECRET_ARN)
    creds = json.loads(response['SecretString'])
    return praw.Reddit(
        user_agent='aws:batch-ec2:1.0 (by u/Entire-Success-5370)',
        client_id=creds['client_id'],
        client_secret=creds['client_secret'],
    )

def dump_jsonl_gz(objs, key):
    """Upload ``objs`` to S3 as a gzip-compressed JSON Lines object.

    Each element of ``objs`` is serialised with ``json.dumps`` (non-ASCII
    characters kept as-is), terminated with a newline and UTF-8 encoded.
    The compressed stream is built in memory and then uploaded to
    ``BUCKET`` under ``key``.

    Args:
        objs: Iterable of JSON-serialisable records.
        key: Destination object key inside ``BUCKET``.
    """
    payload = io.BytesIO()
    with gzip.GzipFile(fileobj=payload, mode='wb') as archive:
        for record in objs:
            line = json.dumps(record, ensure_ascii=False) + '\n'
            archive.write(line.encode('utf-8'))
    # Rewind so the upload starts from the first compressed byte.
    payload.seek(0)
    s3.upload_fileobj(payload, BUCKET, key)

def _author_name(author):
    """Return the author's username as a string, or None when there is no author."""
    return str(author) if author else None


def _post_record(p):
    """Flatten one submission into the dict written to posts.jsonl.gz."""
    return {
        # for post table
        'kind': 'post',
        'post_name': p.name,
        'title': p.title,
        'selftext': p.selftext or '',
        'score': p.score,
        'upvote_ratio': p.upvote_ratio,
        'num_comments': p.num_comments,
        'url': p.url or '',
        'created_utc': p.created_utc,

        # for author table
        # The author-specific fields are absent when the account was deleted
        # (author is None); plain attribute access would raise AttributeError
        # and abort the whole run, so fall back to None instead.
        'author_fullname': getattr(p, 'author_fullname', None),
        'author': _author_name(p.author),
        'author_premium': getattr(p, 'author_premium', None),

        # for subreddit table
        'subreddit_id': p.subreddit_id,
        'subreddit_name_prefixed': str(p.subreddit_name_prefixed),
        'subreddit_type': p.subreddit_type,
        'subreddit_subscribers': p.subreddit_subscribers,
    }


def _comment_record(c):
    """Flatten one comment into the dict written to comments.jsonl.gz."""
    return {
        'kind': 'comment',
        'id': c.id,
        'name': c.name,
        'link_id': c.link_id,
        'parent_id': c.parent_id,  # links the comment to parent `name`
        'subreddit': str(c.subreddit),
        'created_utc': c.created_utc,
        'body': c.body or '',
        'author': _author_name(c.author),
        'score': c.score,
    }


def run(subs=subreddits, max_posts=150, since_hours=168):
    """Scrape keyword-matching posts and their comments and store them in S3.

    For every subreddit in ``subs`` the ``new`` listing is walked until either
    ``max_posts`` matching posts were collected or a post older than
    ``since_hours`` is reached. A post matches when its title contains any
    entry of the module-level ``keywords`` list (case-insensitive substring
    match). For each match the full comment tree is expanded and flattened.

    Output per subreddit (each written only when non-empty):
        ``{PREFIX}/dt=<YYYY-MM-DD>/subreddit=<name>/run_id=<id>/posts.jsonl.gz``
        ``{PREFIX}/dt=<YYYY-MM-DD>/subreddit=<name>/run_id=<id>/comments.jsonl.gz``

    Args:
        subs: Iterable of subreddit names to scan.
        max_posts: Maximum number of matching posts kept per subreddit.
        since_hours: Look-back window in hours, measured from the start of the run.
    """
    rd = reddit_client()

    # Take a single timestamp so run_id and the dt= partition always agree,
    # even when the run starts right around midnight UTC.
    started = dt.datetime.now(dt.timezone.utc)
    run_id = started.strftime('%Y%m%dT%H%M%SZ')
    dt_part = started.strftime('%Y-%m-%d')
    cutoff = time.time() - since_hours * 3600

    # Lower-case the keywords once instead of once per post.
    needles = [k.lower() for k in keywords]

    for sr in subs:
        posts, comments = [], []
        for p in rd.subreddit(sr).new(limit=None):
            # NOTE(review): stopping at the first too-old post assumes the
            # listing is ordered newest-first.
            if len(posts) >= max_posts or p.created_utc < cutoff:
                break

            title = p.title.lower()
            if not any(needle in title for needle in needles):
                continue

            posts.append(_post_record(p))

            # Expand every "load more comments" stub before flattening the tree.
            p.comments.replace_more(limit=None)
            comments.extend(_comment_record(c) for c in p.comments.list())

            # Short pause between comment-tree fetches.
            time.sleep(0.5)

        base = f'{PREFIX}/dt={dt_part}/subreddit={sr}/run_id={run_id}'
        if posts:
            dump_jsonl_gz(posts, f'{base}/posts.jsonl.gz')
        if comments:
            dump_jsonl_gz(comments, f'{base}/comments.jsonl.gz')
    print('DONE', run_id)

In [6]:
# Smoke run: keep at most one matching post per subreddit, using the
# notebook-level look-back window.
run(subs=subreddits, max_posts=1, since_hours=since_hours)

DONE 20250823T165659Z


In [3]:
# Ad-hoc client for the exploration cells below: read the Reddit API
# credentials from Secrets Manager, then build the PRAW client.
cfg = json.loads(
    secrets.get_secret_value(SecretId=REDDIT_SECRET_ARN)['SecretString']
)

reddit = praw.Reddit(
    user_agent='aws:batch-ec2:1.0 (by u/Entire-Success-5370)',
    client_id=cfg['client_id'],
    client_secret=cfg['client_secret'],
    # ratelimit_seconds=None
)

In [9]:
# Dump the raw attributes of one known comment (t1_naq3cyb) on post t3_1mzxh08.
# The post id is known, so fetch it directly instead of paging through the
# subreddit's whole `new` listing: the old scan kept iterating after the hit
# (its `break` only left the inner loop) and could not find the post at all
# once it dropped out of the listing.
p = reddit.submission("1mzxh08")
for c in p.comments.list():
    if c.name == 't1_naq3cyb':
        print(vars(c))
        break

{'_replies': <praw.models.comment_forest.CommentForest object at 0x7f6550e11b50>, '_submission': Submission(id='1mzxh08'), '_reddit': <praw.reddit.Reddit object at 0x7f6550d96510>, 'subreddit_id': 't5_35ezb', 'approved_at_utc': None, 'author_is_blocked': False, 'comment_type': None, 'awarders': [], 'mod_reason_by': None, 'banned_by': None, 'author_flair_type': 'text', 'total_awards_received': 0, 'subreddit': Subreddit(display_name='IndustrialPharmacy'), 'author_flair_template_id': None, 'likes': None, 'user_reports': [], 'saved': False, 'id': 'naq3cyb', 'banned_at_utc': None, 'mod_reason_title': None, 'gilded': 0, 'archived': False, 'collapsed_reason_code': None, 'no_follow': True, 'author': Redditor(name='Previous-Lobster129'), 'can_mod_post': False, 'created_utc': 1756192984.0, 'send_replies': True, 'parent_id': 't3_1mzxh08', 'score': 1, 'author_fullname': 't2_1u9coamw95', 'approved_by': None, 'mod_note': None, 'all_awardings': [], 'collapsed': False, 'body': 'Maybe due to feel more 

In [33]:
# Terms looked for in post titles; the match below lower-cases both sides.
keywords = [
    'cancer',
    'malignant',
    'Malignant',
    'Metastasis',
    'Neoplasm',
    'Tumor',
    'Carcinoma',
    'Sarcoma',
    'Benign',
    'Tumor Grade',
    'Prognosis',
    'Chemotherapy',
    'Immunotherapy',
    'Biopsy',
    'Remission',
    'Oncology',
]

# Print the title of the first post in the AskPH `new` listing whose title
# contains any keyword, then stop.
for p in reddit.subreddit("AskPH").new(limit=None):
    if not any(k.lower() in p.title.lower() for k in keywords):
        continue
    # if p.selftext:
    print(p.title)
    p.comments.replace_more(limit=0)
    take = 0
    break

If curing cancer and AIDS forever required sacrificing your sibling, how would you decide and could you live with the choice?


In [12]:
# Look at the title and body text of a single submission fetched by id.
submission = reddit.submission(id="1bc4pw5")
# Title first (to make it non-lazy), then the self text, one per line.
print(submission.title, submission.selftext, sep='\n')

Are you in favor of medically assisted suicide?
Why and why not?

 If yes, what are the conditions or qualifications of a possible applicant you are in favor to be allowed to go through the process? 


In [13]:
# Dump every attribute the submission object carries.
submission = reddit.submission(id="1bc4pw5")
print(submission.title)  # to make it non-lazy
print(submission.__dict__)

Are you in favor of medically assisted suicide?
{'comment_limit': 2048, 'comment_sort': 'confidence', 'id': '1bc4pw5', '_reddit': <praw.reddit.Reddit object at 0x7b6812e256d0>, '_fetched': True, '_additional_fetch_params': {}, '_comments_by_id': {'t1_kudfurz': Comment(id='kudfurz'), 't1_kudmu09': Comment(id='kudmu09'), 't1_kudn3dl': Comment(id='kudn3dl'), 't1_kufzbdv': Comment(id='kufzbdv'), 't1_kudmzo8': Comment(id='kudmzo8'), 't1_kuhf9b2': Comment(id='kuhf9b2'), 't1_kuhichq': Comment(id='kuhichq'), 't1_kudtdob': Comment(id='kudtdob'), 't1_kudyk4p': Comment(id='kudyk4p'), 't1_kudun7t': Comment(id='kudun7t'), 't1_kueq3sr': Comment(id='kueq3sr'), 't1_kugtti9': Comment(id='kugtti9'), 't1_kuh2c16': Comment(id='kuh2c16'), 't1_kuhm2rr': Comment(id='kuhm2rr'), 't1_kuhm509': Comment(id='kuhm509'), 't1_kudsfha': Comment(id='kudsfha'), 't1_kue6sl6': Comment(id='kue6sl6'), 't1_kue7rul': Comment(id='kue7rul'), 't1_kuel1g9': Comment(id='kuel1g9'), 't1_kudoh8l': Comment(id='kudoh8l'), 't1_kueoh8j':

In [17]:
# Dump the attributes of the first comment in the flattened comment list.
submission = reddit.submission(id="1bc4pw5")
print(submission.title)  # to make it non-lazy
for c in submission.comments.list()[:1]:
    print(vars(c))

Are you in favor of medically assisted suicide?
{'_replies': <praw.models.comment_forest.CommentForest object at 0x7b681233e510>, '_submission': Submission(id='1bc4pw5'), '_reddit': <praw.reddit.Reddit object at 0x7b6812e256d0>, 'subreddit_id': 't5_3a7odq', 'approved_at_utc': None, 'author_is_blocked': False, 'comment_type': None, 'awarders': [], 'mod_reason_by': None, 'banned_by': None, 'author_flair_type': 'text', 'total_awards_received': 0, 'subreddit': Subreddit(display_name='AskPH'), 'author_flair_template_id': None, 'likes': None, 'user_reports': [], 'saved': False, 'id': 'kudfurz', 'banned_at_utc': None, 'mod_reason_title': None, 'gilded': 0, 'archived': True, 'collapsed_reason_code': None, 'no_follow': False, 'author': Redditor(name='toinks1345'), 'can_mod_post': False, 'created_utc': 1710168427.0, 'send_replies': True, 'parent_id': 't3_1bc4pw5', 'score': 103, 'author_fullname': 't2_uxt1nmmi', 'approved_by': None, 'mod_note': None, 'all_awardings': [], 'collapsed': True, 'body'

In [18]:
# Dump the attributes of the comment with id 'kufzbdv'.
submission = reddit.submission(id="1bc4pw5")
print(submission.title)  # to make it non-lazy
for c in submission.comments.list():
    if c.id != 'kufzbdv':
        continue
    print(vars(c))
    break

Are you in favor of medically assisted suicide?
{'_replies': <praw.models.comment_forest.CommentForest object at 0x7b681233dfd0>, '_submission': Submission(id='1bc4pw5'), '_reddit': <praw.reddit.Reddit object at 0x7b6812e256d0>, 'subreddit_id': 't5_3a7odq', 'approved_at_utc': None, 'author_is_blocked': False, 'comment_type': None, 'awarders': [], 'mod_reason_by': None, 'banned_by': None, 'author_flair_type': 'text', 'total_awards_received': 0, 'subreddit': Subreddit(display_name='AskPH'), 'author_flair_template_id': None, 'likes': None, 'user_reports': [], 'saved': False, 'id': 'kufzbdv', 'banned_at_utc': None, 'mod_reason_title': None, 'gilded': 0, 'archived': True, 'collapsed_reason_code': None, 'no_follow': True, 'author': Redditor(name='Jon_Irenicus1'), 'can_mod_post': False, 'created_utc': 1710199496.0, 'send_replies': True, 'parent_id': 't1_kudfurz', 'score': 1, 'author_fullname': 't2_awynrqva', 'removal_reason': None, 'approved_by': None, 'mod_note': None, 'all_awardings': [], '

In [26]:
# Dump the first comment that is a direct reply to comment t1_n88ngmu.
submission = reddit.submission(id="1mnyxnd")
print(submission.title)  # to make it non-lazy
for c in submission.comments.list():
    if c.parent_id != 't1_n88ngmu':
        continue
    print(vars(c))
    break

My mom has cancer and I want to leave her
{'_replies': <praw.models.comment_forest.CommentForest object at 0x7b68122ab7d0>, '_submission': Submission(id='1mnyxnd'), '_reddit': <praw.reddit.Reddit object at 0x7b6812e256d0>, 'subreddit_id': 't5_28r1xe', 'approved_at_utc': None, 'author_is_blocked': False, 'comment_type': None, 'awarders': [], 'mod_reason_by': None, 'banned_by': None, 'author_flair_type': 'text', 'total_awards_received': 0, 'subreddit': Subreddit(display_name='OffMyChestPH'), 'author_flair_template_id': None, 'likes': None, 'user_reports': [], 'saved': False, 'id': 'n88ngnz', 'banned_at_utc': None, 'mod_reason_title': None, 'gilded': 0, 'archived': False, 'collapsed_reason_code': None, 'no_follow': True, 'author': Redditor(name='AutoModerator'), 'can_mod_post': False, 'created_utc': 1754975912.0, 'send_replies': False, 'parent_id': 't1_n88ngmu', 'score': 1, 'author_fullname': 't2_6l4z3', 'removal_reason': None, 'approved_by': None, 'mod_note': None, 'all_awardings': [], '

In [31]:
# Dump comment t1_n88ngmu itself, preceded by the type that vars() returns.
submission = reddit.submission(id="1mnyxnd")
print(submission.title)  # to make it non-lazy
for c in submission.comments.list():
    if c.name != 't1_n88ngmu':
        continue
    print(type(vars(c)), vars(c), sep='\n')
    break

My mom has cancer and I want to leave her
<class 'dict'>
{'_replies': <praw.models.comment_forest.CommentForest object at 0x7b68122f1bb0>, '_submission': Submission(id='1mnyxnd'), '_reddit': <praw.reddit.Reddit object at 0x7b6812e256d0>, 'total_awards_received': 0, 'approved_at_utc': None, 'author_is_blocked': False, 'comment_type': None, 'awarders': [], 'mod_reason_by': None, 'banned_by': None, 'ups': 1, 'removal_reason': None, 'link_id': 't3_1mnyxnd', 'author_flair_template_id': None, 'likes': None, 'user_reports': [], 'saved': False, 'id': 'n88ngmu', 'banned_at_utc': None, 'mod_reason_title': None, 'gilded': 0, 'archived': False, 'collapsed_reason_code': None, 'no_follow': True, 'author': None, 'can_mod_post': False, 'send_replies': True, 'parent_id': 't3_1mnyxnd', 'score': 1, 'approved_by': None, 'report_reasons': None, 'all_awardings': [], 'subreddit_id': 't5_28r1xe', 'body': '[removed]', 'edited': False, 'downs': 0, 'author_flair_css_class': None, 'collapsed': False, 'is_submitte

In [57]:
# Check how the author fields look on comment t1_n88ngmu.
submission = reddit.submission(id="1mnyxnd")
print(submission.title)  # to make it non-lazy
for c in submission.comments.list():
    if c.name != 't1_n88ngmu':
        continue
    # getattr with a default, so an absent field prints None instead of raising.
    print(getattr(c, 'author_fullname', None))
    print(getattr(c, 'author', None))
    print(c.author)
    # Same string clean-up that run() applies to post authors.
    print(str(c.author).split("='")[-1].replace("')", ""))

My mom has cancer and I want to leave her
None
None
None
None


In [55]:
# Same author-field check on the first reply to comment t1_n88ngmu.
submission = reddit.submission(id="1mnyxnd")
print(submission.title)  # to make it non-lazy
for c in submission.comments.list():
    if c.parent_id != 't1_n88ngmu':
        continue
    print(c.parent_id)
    print(c.author_fullname)
    print(str(c.author).split("='")[-1].replace("')", ""))
    break

My mom has cancer and I want to leave her
t1_n88ngmu
t2_6l4z3
AutoModerator


In [52]:
# Show the submission's author; the bare last expression is the cell output.
submission = reddit.submission(id="1mnyxnd")

getattr(submission, 'author', None)

Redditor(name='egghe4d-')