In [None]:
import os
import pandas as pd
import praw
import datetime
import re
import csv

In [None]:
from user_definition import * 

In [None]:
def praw_setup(client_id, client_secret, user_agent, password, username):
    """
    Build the Python Reddit API Wrapper (PRAW) client used to
    read data from Reddit.

    client_id = client_id listed for your app on Reddit's
        dev website
    client_secret = client_secret listed for your app on
        Reddit's dev website
    user_agent = A string identifying whoever is accessing
        the data. Per Reddit's API rules, it must include
        your Reddit username.
    password = Password of your Reddit account.
    username = Your Reddit username.

    Returns the praw.Reddit instance created from these values.
    """
    # Gather the credentials once, then forward them as keyword arguments.
    credentials = {
        'client_id': client_id,
        'client_secret': client_secret,
        'user_agent': user_agent,
        'password': password,
        'username': username,
    }
    return praw.Reddit(**credentials)

In [None]:
def get_post_titles_text_and_features(reddit, post_limit, timeframe='day', one_sub=False, sub=None):
    '''
    Obtain the top n (post_limit) posts in the requested timeframe from
    each target subreddit, keeping only posts whose text contains a
    numbered list, together with a few attributes of each post.

    By default the three subreddits RandomTables, BehindTheTables and d100
    are queried. Pass one_sub=True and sub=<name> to query one subreddit.

    reddit = PRAW instance
    post_limit = max # of posts requested per subreddit
    timeframe = hour, day (the default), week, month, year,
        or all (which is all time); forwarded as PRAW's time_filter
    one_sub = If True, gathers data on the specific subreddit `sub` only.
    sub = Name of the specific subreddit (required when one_sub is True).

    Returns a pandas DataFrame with the columns post_id, post_title,
    post_text, post_subreddit, creation_datetime (naive UTC), score,
    num_comments and total_awards_received.

    Raises ValueError if one_sub is True but no subreddit name is given.
    '''
    if one_sub:
        if not sub:
            raise ValueError('sub must name a subreddit when one_sub is True')
        subreddits = [sub]
    else:
        subreddits = ['RandomTables', 'BehindTheTables', 'd100']

    # Matches "1." + whitespace followed, anywhere later in the text
    # (DOTALL lets ".*" span lines), by a newline and "2." + whitespace.
    regex_pattern = r'1\.\s.*\n2\.\s.*'
    regex = re.compile(regex_pattern, re.IGNORECASE | re.DOTALL)

    columns = ['post_id', 'post_title', 'post_text', 'post_subreddit',
               'creation_datetime', 'score', 'num_comments', 'total_awards_received']
    table = []
    for subreddit in subreddits:
        top_posts = reddit.subreddit(subreddit).top(limit=post_limit, time_filter=timeframe)
        for submission in top_posts:
            if not regex.search(submission.selftext):
                continue
            # Same naive-UTC value utcfromtimestamp() produced, without
            # relying on that deprecated call.
            created = datetime.datetime.fromtimestamp(
                submission.created_utc, datetime.timezone.utc).replace(tzinfo=None)
            table.append([submission.id,
                          submission.title,
                          submission.selftext,  # post text
                          submission.subreddit.display_name,
                          created,
                          submission.score,
                          submission.num_comments,
                          submission.total_awards_received])
    return pd.DataFrame(table, columns=columns)


In [None]:
def get_comments_and_features(reddit, post_limit, timeframe='day', one_sub=False, sub=None):
    """
    Obtain the top-level comments of the top n posts in the requested
    timeframe, keeping only comments that contain dice notation
    (e.g. "2d6"), together with a few attributes of each comment.

    By default the three subreddits RandomTables, BehindTheTables and d100
    are queried. Pass one_sub=True and sub=<name> to query one subreddit.

    reddit = PRAW instance
    post_limit = max # of posts requested per subreddit
    timeframe = hour, day (the default), week, month, year,
        or all (which is all time); forwarded as PRAW's time_filter
    one_sub = If True, gathers data on the specific subreddit `sub` only.
    sub = Name of the specific subreddit (required when one_sub is True).

    Returns a pandas DataFrame with the columns post_id, comment_id,
    comment_text, subreddit, creation_datetime (naive UTC), comment_karma
    and matched_string (the first dice-notation match in the comment).

    Raises ValueError if one_sub is True but no subreddit name is given.
    """
    if one_sub:
        if not sub:
            raise ValueError('sub must name a subreddit when one_sub is True')
        subreddits = [sub]
    else:
        subreddits = ['RandomTables', 'BehindTheTables', 'd100']

    # <digits>d<digits> as a whole word, e.g. "1d20" or "3d8".
    dice_pattern = re.compile(r'\b\d+d\d+\b')

    table = []
    for subreddit in subreddits:
        for submission in reddit.subreddit(subreddit).top(limit=post_limit, time_filter=timeframe):
            # Drop every MoreComments placeholder instead of slicing off the
            # last element: threads without a placeholder would otherwise
            # lose their final real comment. limit=0 removes the placeholders
            # without replacing them.
            submission.comments.replace_more(limit=0)
            for comment in submission.comments:
                match = dice_pattern.search(comment.body)
                if match is None:
                    continue
                # Same naive-UTC value utcfromtimestamp() produced, without
                # relying on that deprecated call.
                created = datetime.datetime.fromtimestamp(
                    comment.created_utc, datetime.timezone.utc).replace(tzinfo=None)
                table.append([submission.id,
                              comment.id,
                              comment.body,
                              submission.subreddit.display_name,
                              created,
                              comment.score,
                              match.group()])  # the matched dice string
    return pd.DataFrame(table, columns=['post_id', 'comment_id', 'comment_text',
                                        'subreddit', 'creation_datetime', 'comment_karma',
                                        'matched_string'])


In [None]:
def _download_reddit_data():
    """
    Create the reddit instance and collect the past year's top posts
    (up to 1000 per subreddit) from the default subreddits.

    The credentials client_id, client_secret, user_agent, password and
    username are read from the notebook's global namespace
    (presumably supplied by `from user_definition import *` - confirm).

    Returns the posts DataFrame produced by
    get_post_titles_text_and_features. Nothing is written to disk or
    cloud storage here; saving is left to the caller.
    """
    reddit = praw_setup(client_id, client_secret, user_agent, password, username)

    # Only the posts are returned, so the comment crawl that used to run here
    # (and whose result was discarded) is skipped: it cost one comment fetch
    # per post for nothing. Call get_comments_and_features directly when the
    # comments are actually needed.
    df_posts = get_post_titles_text_and_features(
        reddit, post_limit=1000, timeframe='year', one_sub=False, sub=None)

    return df_posts

In [None]:
# Fetch the posts and keep them in df_post_1 for the cells below.
# Nothing is written to disk here; _download_reddit_data() builds the PRAW
# client and queries Reddit, so re-running this cell repeats the download.
df_post_1 = _download_reddit_data()

In [None]:
# Display how many collected posts come from each subreddit.
# size() counts rows per group; sum() would instead try to add up every
# column, including the text and datetime columns.
print(df_post_1.groupby('post_subreddit').size())

In [None]:
# Show the text of every collected post (pandas truncates long output).
print(df_post_1.loc[:, 'post_text'])

In [None]:
# Save the full post table, and separately just the post text column, as CSV.
# to_csv does not create missing directories, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
df_post_1.to_csv('./data/table.csv')
df_post_1['post_text'].to_csv('./data/table_text_.csv')

In [17]:
# Clean the post text file in place: strip links and user-mention fragments.
# NOTE(review): no cell visible in this notebook writes reddit_post_text.txt
# (the save cell produces table.csv / table_text_.csv) - confirm where this
# file comes from before relying on Restart & Run All.
text_path = './data/reddit_post_text.txt'

# Read explicitly as UTF-8: Reddit text routinely contains non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
with open(text_path, encoding='utf-8') as file:
    text = file.read()

# Regular expressions for the fragments to remove.
pattern_https = re.compile(r'https?://\S+')  # http:// and https:// links
# Alternatives, in order: whitespace + "u/" or "U/" + word characters
# (a user mention with its leading whitespace); a literal backslash + "u";
# one of ( ) [ ] / followed by "u"; a bare "u/".
# NOTE(review): the last two alternatives also match inside ordinary text
# (e.g. the "(u" of "(under") - confirm that this is intended.
pattern_u = re.compile(r'\s[uU]/\w+|\\u|[\(\)\[\]/]u|u/')

# Remove the links first, then the user-mention fragments.
text = pattern_https.sub('', text)
text = pattern_u.sub('', text)

# Overwrite the same file with the cleaned text.
with open(text_path, 'w', encoding='utf-8') as file:
    file.write(text)
