In [17]:
import os
import pandas as pd
import praw
import datetime
import re
import csv

In [18]:
from user_definition import * 

In [19]:
def praw_setup(client_id, client_secret, user_agent, password, username):
    '''
    Build the Python Reddit API Wrapper (PRAW) client used to read
    Reddit data.

    client_id = client_id shown in your app info on Reddit's dev
        website
    client_secret = client_secret shown in your app info on Reddit's
        dev website
    user_agent = String identifying whoever is accessing the data.
        Per Reddit's API rules it must include your Reddit username.
    password = Password of your Reddit account.
    username = Your Reddit username.

    Returns the praw.Reddit instance created from these values.
    '''
    # Collect the credentials first so the constructor call stays short.
    credentials = {
        'client_id': client_id,
        'client_secret': client_secret,
        'user_agent': user_agent,
        'password': password,
        'username': username,
    }
    return praw.Reddit(**credentials)

In [20]:
# def get_post_titles_text_and_features(reddit, post_limit, timeframe='day', one_sub=False, sub=None):
#     '''
#     Obtain the top n (post_limit) posts in the requested 
#     timeframe on all 11 subreddits, as well as desired attributes. Can also specify if 
#     just want post info on one subreddit.
    
#     reddit = PRAW instance
#     post_limit = # of posts you want to get
#     timeframe = hour, day (the default), week, month, year, 
#         or all (which is all time)
#     one_sub = If True, gathers data on specific subreddit.
#     sub = Specific subreddit to get data.
#     '''
#     # regex_pattern = r'1\.\s.*\n2\.\s.*'
#     # regex = re.compile(regex_pattern, re.IGNORECASE | re.DOTALL)

#     subreddits = ['RandomTables','BehindTheTables','d100']
#     table = []
#     for subreddit in subreddits:
#         for submission in reddit.subreddit(subreddit).top(limit=post_limit, time_filter=timeframe):
#             # if regex.search(submission.selftext):
#             if 'table' in submission.title.lower() or 'table' in submission.selftext.lower():
#                 table.append([submission.id,
#                             submission.title,
#                             submission.selftext, # get post text
#                             submission.subreddit.display_name,
#                             datetime.datetime.utcfromtimestamp(submission.created_utc),
#                             submission.score,
#                             submission.num_comments,
#                             submission.total_awards_received])
#     return pd.DataFrame(table, columns=['post_id', 'post_title', 'post_text', 'post_subreddit',
#                                            'creation_datetime', 'score', 'num_comments', 'total_awards_received'])


In [59]:
def get_post_titles_text_and_features(reddit, post_limit, timeframe='day', one_sub=False, sub=None):
    '''
    Collect the top posts that mention tables, together with a few
    attributes of each post.

    By default the four subreddits DnD, RandomTables, BehindTheTables
    and d100 are scanned. Pass one_sub=True together with sub to scan
    a single subreddit instead.

    reddit = PRAW instance
    post_limit = max # of top posts to request from each subreddit
    timeframe = hour, day (the default), week, month, year,
        or all (which is all time)
    one_sub = If True, gathers data on the subreddit named by sub only.
    sub = Name of the subreddit to use when one_sub is True.

    Returns a DataFrame with one row per matching post and the columns
    post_id, post_title, post_text, post_subreddit, creation_datetime,
    score, num_comments, total_awards_received.

    Raises ValueError if one_sub is True but sub is empty.
    '''
    if one_sub:
        if not sub:
            raise ValueError('sub must name a subreddit when one_sub is True')
        subreddits = [sub]
    else:
        subreddits = ['DnD', 'RandomTables', 'BehindTheTables', 'd100']

    # NOTE: every phrase below contains 'table', so the filter currently
    # behaves like a plain case-insensitive 'table' substring test.
    keywords = ['table', 'random tables', 'table rolls', 'd100 tables', 'table generators', 'tabletop rpg tables',
                'tabletop game tables', 'randomized tables', 'randomizer tables', 'table charts',
                'tabletop game charts']
    columns = ['post_id', 'post_title', 'post_text', 'post_subreddit',
               'creation_datetime', 'score', 'num_comments', 'total_awards_received']

    table = []
    for subreddit in subreddits:
        for submission in reddit.subreddit(subreddit).top(limit=post_limit, time_filter=timeframe):
            # Title or body may be empty/None; treat that as empty text.
            title_lower = (submission.title or '').lower()
            selftext_lower = (submission.selftext or '').lower()
            if not any(keyword in title_lower or keyword in selftext_lower for keyword in keywords):
                continue
            table.append([
                submission.id,
                submission.title,
                submission.selftext,
                submission.subreddit.display_name,
                datetime.datetime.utcfromtimestamp(submission.created_utc),
                submission.score,
                submission.num_comments,
                submission.total_awards_received
            ])
    return pd.DataFrame(table, columns=columns)


In [21]:
def get_comments_and_features(reddit, post_limit, timeframe='day', one_sub=False, sub=None):
    """
    Collect top-level comments containing dice notation (e.g. "2d6")
    from the top posts of the requested timeframe.

    By default the three subreddits RandomTables, BehindTheTables and
    d100 are scanned. Pass one_sub=True together with sub to scan a
    single subreddit instead.

    reddit = PRAW instance
    post_limit = max # of top posts to request from each subreddit
    timeframe = hour, day (the default), week, month, year,
        or all (which is all time)
    one_sub = If True, gathers data on the subreddit named by sub only.
    sub = Name of the subreddit to use when one_sub is True.

    Returns a DataFrame with one row per matching comment and the
    columns post_id, comment_id, comment_text, subreddit,
    creation_datetime, comment_karma, matched_string (the first dice
    expression found in the comment).

    Raises ValueError if one_sub is True but sub is empty.
    """
    if one_sub:
        if not sub:
            raise ValueError('sub must name a subreddit when one_sub is True')
        subreddits = [sub]
    else:
        subreddits = ['RandomTables', 'BehindTheTables', 'd100']

    dice_pattern = re.compile(r'\b\d+d\d+\b')  # e.g. 1d20, 2d6
    columns = ['post_id', 'comment_id', 'comment_text',
               'subreddit', 'creation_datetime', 'comment_karma',
               'matched_string']

    table = []
    for subreddit in subreddits:
        for submission in reddit.subreddit(subreddit).top(limit=post_limit, time_filter=timeframe):
            for comment in submission.comments:
                # "Load more comments" placeholders carry no body text.
                # Skipping on that basis (rather than always slicing off
                # the last entry) keeps the final real comment of threads
                # that have no placeholder.
                body = getattr(comment, 'body', None)
                if not body:
                    continue
                match = dice_pattern.search(body)
                if match is None:
                    continue
                table.append([submission.id,
                              comment.id,
                              body,
                              submission.subreddit.display_name,
                              datetime.datetime.utcfromtimestamp(comment.created_utc),
                              comment.score,
                              match.group()])  # first matched dice expression
    return pd.DataFrame(table, columns=columns)


In [60]:
def _download_reddit_data():
    """
    Download the past week's top posts as a DataFrame.

    Builds a PRAW client with praw_setup from the module-level
    credentials client_id, client_secret, user_agent, password and
    username (presumably supplied by the user_definition star import;
    confirm), then returns whatever get_post_titles_text_and_features
    yields for post_limit=1000 and timeframe='week'.

    Nothing is written to storage here; the caller decides what to do
    with the returned DataFrame.
    """
    reddit = praw_setup(client_id, client_secret, user_agent, password, username)
    return get_post_titles_text_and_features(
        reddit, post_limit=1000, timeframe='week', one_sub=False, sub=None)

In [50]:
def _download_reddit_data_comments():
    """
    Download comments from the past week's top posts as a DataFrame.

    Builds a PRAW client with praw_setup from the module-level
    credentials client_id, client_secret, user_agent, password and
    username (presumably supplied by the user_definition star import;
    confirm), then returns whatever get_comments_and_features yields
    for post_limit=1000 and timeframe='week'.

    Nothing is written to storage here; the caller decides what to do
    with the returned DataFrame.
    """
    reddit = praw_setup(client_id, client_secret, user_agent, password, username)
    return get_comments_and_features(
        reddit, post_limit=1000, timeframe='week', one_sub=False, sub=None)

In [61]:
# download the post data into a DataFrame (nothing is written to disk here)
df_post_1 = _download_reddit_data()

In [62]:
# display which subreddits the posts come from, with per-subreddit totals
# numeric_only=True limits the sum to score / num_comments / total_awards_received;
# without it, pandas >= 2.0 also tries to sum the text and datetime columns.
print(df_post_1.groupby('post_subreddit').sum(numeric_only=True))

                 score  num_comments  total_awards_received
post_subreddit                                             
BehindTheTables     58             1                      0
DnD              12588          2301                      1
RandomTables         2             0                      0
d100               209            21                      0


In [64]:
# display the collected posts (pandas truncates the printed frame, as seen below)
print(df_post_1)

    post_id                                         post_title  \
0   13337v6  Dungeons & Dragons: Honor Among Thieves Just B...   
1   1351z82                This Anti-DM mentality has to stop.   
2   12ysjjk         Why don't more tables use the Mark action?   
3   133dpez                                      Thank your DM   
4   134k8xg  I am giving English lessons and have played a ...   
..      ...                                                ...   
70   zz8m3k      Happy Cakeday, r/RandomTables! Today you're 8   
71   ynylgw   Happy Cakeday, r/BehindTheTables! Today you're 7   
72  133wwcj  D100 Epithets for Good Bosses (Priests, Paladi...   
73  130nbon                           D20x5 Variegated Vikings   
74  13488hd                    Rumors for an all-aquatic world   

                                            post_text   post_subreddit  \
0   Looks like the D&D movie just made it past its...              DnD   
1   So many comments always end the same way. "The...      

In [27]:
# download the comment data and display it as the cell output
df_comment_1 = _download_reddit_data_comments()
df_comment_1

Unnamed: 0,post_id,comment_id,comment_text,subreddit,creation_datetime,comment_karma,matched_string


In [28]:
# # # save the post to csv and txt
# df_post_1.to_csv('./data/table.csv')
# df_post_1['post_text'].to_csv('./data/table_text_.csv')

In [29]:
# #cleaning the txt file 

# # open the input file
# with open('./data/reddit_post_text.txt') as file:
#     text = file.read()

# # define regular expressions to match different patterns
# pattern_https = re.compile(r'https?://\S+') #HTTPS links
# pattern_u = re.compile(r'\s[uU]/\w+|\\u|[\(\)\[\]/]u|u/') # u/123abc or [\u] or (\u) or /u


# # remove all the HTTPS links from the text
# text = re.sub(pattern_https, '', text)
# text = re.sub(pattern_u, '', text)

# # open the output file and write the modified text 
# with open('./data/reddit_post_text.txt', 'w') as file:
#     file.write(text)


In [30]:

def write_csv_to_s3(bucket_name, object_name, aws_access_key_id, aws_secret_access_key, df):
    '''
    Write a dataframe (df) as a CSV file to an S3 bucket.

    bucket_name = Name of the target S3 bucket.
    object_name = Key of the object to create inside the bucket.
    aws_access_key_id = AWS access key id used for the upload.
    aws_secret_access_key = AWS secret access key used for the upload.
    df = DataFrame to upload; its index is not written.

    Returns None.
    '''
    # Imported here because the notebook's import cell does not import
    # boto3, which would otherwise leave this name undefined.
    import boto3

    s3 = boto3.resource('s3',
                        aws_access_key_id=aws_access_key_id,
                        aws_secret_access_key=aws_secret_access_key)
    # With no target path, to_csv returns the CSV text directly, so no
    # intermediate StringIO buffer (also never imported) is needed.
    csv_text = df.to_csv(index=False)
    s3.Object(bucket_name, object_name).put(Body=csv_text)