In [27]:
import s3fs
# Create an S3 filesystem handle; no arguments are passed, so s3fs's defaults apply.
s3 = s3fs.S3FileSystem()

In [1]:
import os
import sys
import yaml
import configparser
import numpy as np
import pandas as pd
import praw
from praw import Reddit

In [2]:
# Load API credentials from the INI-style config file.
parser = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a wrong path fails here with
# a clear message instead of a confusing NoSectionError on parser.get() below.
if not parser.read('../config/config.conf'):
    raise FileNotFoundError("Could not read config file '../config/config.conf'")

REDDIT_SECRET_KEY = parser.get('api_keys', 'REDDIT_SECRET_KEY')
REDDIT_CLIENT_ID = parser.get('api_keys', 'REDDIT_CLIENT_ID')

# Load the list of post attributes to keep; safe_load avoids constructing
# arbitrary Python objects from the YAML file.
with open('../config/post_fields.yml', 'r') as file:
    data = yaml.safe_load(file)

POST_FIELDS = data['POST_FIELDS']


In [3]:
def connect_reddit(client_id, client_secret, user_agent) -> Reddit:
    """Build a PRAW ``Reddit`` client from the given credentials.

    Args:
        client_id: Value forwarded as ``client_id`` to ``praw.Reddit``.
        client_secret: Value forwarded as ``client_secret`` to ``praw.Reddit``.
        user_agent: Value forwarded as ``user_agent`` to ``praw.Reddit``.

    Returns:
        The constructed ``praw.Reddit`` instance.

    Raises:
        SystemExit: With status 1 if constructing the client (or printing the
            success message) raises any ``Exception``; the error is printed
            first.
    """
    credentials = {
        "client_id": client_id,
        "client_secret": client_secret,
        "user_agent": user_agent,
    }
    try:
        client = praw.Reddit(**credentials)
        print("Connected to Reddit!")
    except Exception as error:
        # Report the failure and stop; sys.exit raises SystemExit, so the
        # return below is never reached on this path.
        print(error)
        sys.exit(1)
    return client

# Build the shared client used by later cells; 'AGENT' is passed as the user_agent.
reddit_instance = connect_reddit(REDDIT_CLIENT_ID, REDDIT_SECRET_KEY, 'AGENT')

Connected to Reddit!


Version 7.7.1 of praw is outdated. Version 7.8.0 was released 1 day ago.


In [5]:
def extract_posts(reddit_instance: Reddit, subreddit: str, limit=None):
    """Fetch hot posts and their comments from a subreddit.

    Args:
        reddit_instance: PRAW ``Reddit`` client used to issue the requests.
        subreddit: Name of the subreddit to read, e.g. ``'dataengineering'``.
        limit: Passed through unchanged as ``limit`` to the subreddit's
            ``hot()`` listing.

    Returns:
        A ``(posts_list, comments_list)`` tuple. ``posts_list`` has one dict
        per post containing exactly the keys named in ``POST_FIELDS``;
        ``comments_list`` has one dict per comment with the keys ``post_id``,
        ``comment_id``, ``author``, ``body``, ``score``, ``created_utc`` and
        ``parent_id``.

    Raises:
        KeyError: If a name in ``POST_FIELDS`` is not present in a post's
            instance attributes.
    """
    # Use the caller-supplied subreddit name. Previously 'dataengineering' was
    # hard-coded here and the argument was silently ignored.
    subreddit_handle = reddit_instance.subreddit(subreddit)
    posts = subreddit_handle.hot(limit=limit)

    posts_list = []
    comments_list = []

    for post in posts:
        # Read fields from the instance dict and keep only the configured ones.
        post_dict = vars(post)
        posts_list.append({field: post_dict[field] for field in POST_FIELDS})

        # Expand "more comments" placeholders first so that .list() below
        # yields comment objects only (see PRAW's comment-extraction docs).
        post.comments.replace_more(limit=None)
        for comment in post.comments.list():
            comments_list.append({
                "post_id": post.id,
                # str() turns the author object (or None) into plain text.
                "comment_id": comment.id,
                "author": str(comment.author),
                "body": comment.body,
                "score": comment.score,
                "created_utc": comment.created_utc,
                "parent_id": comment.parent_id,
            })

    return posts_list, comments_list

# Run parameters: the subreddit name and the post limit handed to extract_posts.
subreddit = 'dataengineering'
limit = 50
# Calls the Reddit API through reddit_instance; results are kept as plain lists of dicts.
posts_list, comments_list = extract_posts(reddit_instance, subreddit, limit)

In [23]:
def transform_data(posts: list, comments: list):
    """Join posts with their comments and normalise column dtypes.

    Args:
        posts: List of post dicts. Each must provide ``id``, ``title``,
            ``num_comments``, ``author``, ``score`` and ``created_utc``.
        comments: List of comment dicts keyed by ``post_id``, ``comment_id``,
            ``author``, ``body``, ``score``, ``created_utc`` and
            ``parent_id``. May be empty.

    Returns:
        A DataFrame with one row per (post, comment) pair. Posts without
        comments are kept (left join) with missing comment fields. Columns
        present on both sides get the ``_post`` / ``_cmnt`` suffixes.
    """
    # Comment keys expected by the merge and the casts below.
    comment_columns = [
        "post_id", "comment_id", "author", "body",
        "score", "created_utc", "parent_id",
    ]

    posts_df = pd.DataFrame(posts)
    # An empty list would produce a frame with no columns, making the merge
    # fail with KeyError: 'post_id'. Build an empty frame with the expected
    # schema instead so posts still flow through.
    if len(comments) > 0:
        comments_df = pd.DataFrame(comments)
    else:
        comments_df = pd.DataFrame(columns=comment_columns)

    df = posts_df.merge(
        comments_df,
        left_on=['id'],
        right_on=['post_id'],
        how='left',
        suffixes=('_post', '_cmnt'),
    )

    # Epoch seconds -> timestamps; missing comment times become NaT.
    df['created_utc_post'] = pd.to_datetime(df['created_utc_post'], unit='s')
    df['created_utc_cmnt'] = pd.to_datetime(df['created_utc_cmnt'], unit='s')
    df['author_post'] = df['author_post'].astype(str)
    # NOTE(review): astype(str) renders a missing comment author as the text
    # 'nan'; kept as-is because downstream code may rely on a str column.
    df['author_cmnt'] = df['author_cmnt'].astype(str)
    df['num_comments'] = df['num_comments'].astype(int)
    df['score_post'] = df['score_post'].astype(int)
    # Coerce rather than cast: posts without comments have no comment score.
    df['score_cmnt'] = pd.to_numeric(df['score_cmnt'], errors='coerce')
    df['title'] = df['title'].astype(str)

    return df

In [6]:
# Build one DataFrame per extracted list (each dict becomes a row).
posts_df = pd.DataFrame(posts_list)
comments_df = pd.DataFrame(comments_list)

In [12]:
# Left-join comments onto posts (posts.id == comments.post_id); columns present on both sides get _post/_cmnt suffixes.
df = posts_df.merge(comments_df, right_on=['post_id'], left_on=['id'], how='left', suffixes=('_post', '_cmnt'))

In [24]:
# Rebuild df from the raw lists via transform_data (merge plus dtype normalisation); this rebinds the name df.
df = transform_data(posts_list, comments_list)