# Webscraper (for reddit.com, project 3)
(Example: [https://www.youtube.com/watch?v=AcrjEWsMi_E](https://www.youtube.com/watch?v=AcrjEWsMi_E))

In [1]:
# Last Updated: 2021-04-24, 00:14

import datetime
import time
import os

import requests
import pandas as pd
import numpy as np

In [2]:
# GLOBALS
# Root of the pushshift.io Reddit search API. call_scraper appends the endpoint
# name ('comment' or 'submission') to this string to build the request URL.
BASE_URL = 'https://api.pushshift.io/reddit/search/'

In [3]:
def call_scraper(size, sub_red_1 = 'AMA', sub_red_2 = 'AskReddit', coms_only=False, before=1, debug=False):
    """
    Run one scraping pass over two subreddits and return the fetched pages.

    For every endpoint ('comment', plus 'submission' unless coms_only is set)
    and every subreddit, one page of 'size' rows is requested through
    'start_scraping'. The request resumes from the 'before' values stored by
    the previous run in 'before_coms.csv' / 'before_subm.csv'; after the pass
    the 'created_utc' of the last row of each page is written back to those
    files so the next run picks up where this one left off.

    Parameters:
        size       - number of rows requested per API call
        sub_red_1  - first subreddit name
        sub_red_2  - second subreddit name
        coms_only  - when True only comments are fetched (2 Data Frames)
        before     - ignored; kept only for backward compatibility. The
                     effective 'before' value always comes from the CSV files
        debug      - when True, extra progress lines are logged

    Returns a list of four lists. Slots 0 and 1 hold the comment Data Frame
    of sub_red_1 and sub_red_2; slots 2 and 3 hold the submission Data Frames
    (left empty when coms_only is True).
    """
    print_special_line_to_log(0)
    print_special_line_to_log(3)
    log_to_file(f'  Begin Iteration of call_scraper')
    print_special_line_to_log(3)

    c = 0
    sub_reddit = [sub_red_1, sub_red_2]
    b4 = []
    list_df = [[],[],[],[]]

    if coms_only:
        submission_comment = ['comment']    # FOR COMS ONLY
    else:
        submission_comment = ['comment', 'submission']   # FOR SUB AND COMS

    # read in before.csv values, ordered like the request loop below:
    # comments sr1, comments sr2, submissions sr1, submissions sr2
    if coms_only:
        old_before_list = read_in_before_csv('before_coms.csv')   # FOR COMS ONLY
    else:
        ob4_coms = read_in_before_csv('before_coms.csv')   # FOR SUB AND COMS
        ob4_subm = read_in_before_csv('before_subm.csv')
        old_before_list = [ ob4_coms[0], ob4_coms[1], ob4_subm[0], ob4_subm[1] ]

    for sc in submission_comment:
        url = BASE_URL + sc
        for sr in sub_reddit:
            # 0 means "no lower bound": the request then omits 'before'
            previous_before = old_before_list[c] if c < len(old_before_list) else 0
            ###### WHERE THE MAGIC HAPPENS ######
            df = start_scraping(url, sr, sc, size, previous_before)
            if len(df) > 0 and 'created_utc' in df.columns:
                b4v = df.iloc[-1]['created_utc']
            else:
                # An empty page has no last row; keep the old marker instead
                # of crashing so the next run retries from the same point.
                log_to_file(f'NO ROWS RETURNED FOR {sr} ({sc}); KEEPING "BEFORE" VALUE: {previous_before}', show_info_tag=2)
                b4v = previous_before
            b4.append(b4v)
            list_df[c].append(df)
            if debug:
                log_to_file(f'Looping through Coms/Subm, and subreddits: {c}')
            c += 1

    # Comments are always scraped; submissions only when requested.
    df_before_c = make_before_vals_df(sub_red_1, sub_red_2, submission_comment[0], b4[0], b4[1])
    df_before_c.to_csv('./before_coms.csv', index=False)
    if not coms_only:
        df_before_s = make_before_vals_df(sub_red_1, sub_red_2, submission_comment[1], b4[2], b4[3])
        df_before_s.to_csv('./before_subm.csv', index=False)

    if debug:
        log_to_file(f'Length of list_df: {len(list_df)}')
    return list_df

In [4]:
def log_to_file(log_msg, log_file_name='', print_to_terminal=True, apply_date=True, show_info_tag = 1):
    """
    Append one line to a log file and optionally echo it to the terminal.

    Parameters:
        log_msg           - text of the log line (written without the prefix
                            when echoed to the terminal)
        log_file_name     - target file; when empty, the file named
                            '%Y-%m-%d_LOG.txt' for the current date is used
        print_to_terminal - when True, 'log_msg' is also printed
        apply_date        - when True, the line is prefixed with the
                            timestamp returned by 'get_date_time'
        show_info_tag     - 0 = no tag, 1 = '[INFO] ', 2 = '[WARN] ',
                            3 = '[ERROR] '

    Returns 'log_msg' unchanged.
    """
    info_tag = [ '', '[INFO] ','[WARN] ','[ERROR] ' ]

    if len(log_file_name) == 0:
        log_file_name = datetime.datetime.now().strftime('%Y-%m-%d_LOG.txt')

    # Mode 'a' creates the file when it does not exist, so no separate
    # create step is needed; the context manager guarantees the handle is
    # closed even if one of the writes fails.
    with open(log_file_name, 'a') as log_file:
        if apply_date:
            log_file.write(get_date_time())
        if show_info_tag > 0:
            log_file.write(info_tag[show_info_tag])
        log_file.write(log_msg)
        log_file.write('\n')
    if print_to_terminal:
        print(log_msg)
    return log_msg

In [5]:
def print_special_line_to_log(line_type=0, log_file_name='', print_to_terminal=True, apply_date=False, show_info_tag = 0):
    """
    Write a separator line to the log through 'log_to_file'.

    'line_type' selects the separator:
        0 = empty line
        1 = long single line   ('-')
        2 = long double line   ('=')
        3 = short single line  ('-')
        4 = short double line  ('=')
        5 = long star line     ('*')
        6 = short star line    ('*')

    The remaining parameters are forwarded to 'log_to_file'; by default the
    separator is echoed to the terminal, without timestamp and without tag.
    Returns whatever 'log_to_file' returns (the separator text).
    """
    separators = (
        '',
        '--------------------------------------------------------------------------------------------------',
        '==================================================================================================',
        '--------------------------------------------------------------------',
        '====================================================================',
        '**************************************************************************************************',
        '********************************************************************',
    )
    chosen = separators[line_type]
    return log_to_file(
        chosen,
        log_file_name=log_file_name,
        print_to_terminal=print_to_terminal,
        apply_date=apply_date,
        show_info_tag=show_info_tag,
    )

In [6]:
def get_date_time():
    """ Returns the current local date and time for log prefixes, formatted as '%Y-%m-%d, %H:%M:%S ' (note the trailing space) """
    stamp_format = '%Y-%m-%d, %H:%M:%S '
    now = datetime.datetime.now()
    return now.strftime(stamp_format)

In [7]:
def read_in_before_csv(filename):
    """
    Read the stored 'before' values from a CSV written by a previous run.

    The 'before_value' column holds the values with a leading underscore
    (see 'make_before_vals_df'); the underscore is stripped and the rest is
    converted to int.

    Returns the values as a pandas Series of ints, in file order. When the
    file does not exist or cannot be parsed, the problem is logged and the
    list [0, 0] is returned instead, which makes the scraper start from the
    newest entries.
    """
    print(datetime.datetime.now().strftime('\n\nSTART:  %Y-%m-%d, %H:%M'))
    try:
        b4v = pd.read_csv(filename)
        b4v_s = b4v['before_value'].str.replace('_', '').apply(int)
    except FileNotFoundError:
        log_to_file(f' *** FILE NOT FOUND: {filename} ***')
        log_to_file('      - SETTING "BEFORE" VALUES TO ZERO (0)')
        return [0, 0]
    except (OSError, KeyError, ValueError, AttributeError, TypeError) as err:
        # Unreadable, empty or malformed file (missing column, non-text or
        # non-numeric values): fall back to zero like the missing-file case,
        # but say what actually went wrong.
        log_to_file(f' *** COULD NOT READ "BEFORE" VALUES FROM {filename}: {err!r} ***', show_info_tag=2)
        log_to_file('      - SETTING "BEFORE" VALUES TO ZERO (0)')
        return [0, 0]
    log_to_file('READ IN "BEFORE" VALS: ')
    log_to_file(str(b4v_s), show_info_tag=0)
    return b4v_s

In [8]:
def start_scraping(url, sub_reddit, sub_com, size=100, before=0, num_retry=10, delay=10):
    """
    Request one page from the API and return it as a Data Frame.

    Calls 'set_params_and_request' and logs the HTTP status. While the
    status is a server error (>= 500) the request is repeated, up to
    'num_retry' times. The 'data' entry of the JSON response is then
    converted to a Data Frame.

    Parameters:
        url        - endpoint URL
        sub_reddit - subreddit name passed to the API
        sub_com    - endpoint label, used only in log messages
        size       - number of rows requested
        before     - 'before' value passed to 'set_params_and_request'
        num_retry  - maximum number of retries after a server error
        delay      - seconds to wait before every request (default 10)

    Returns the Data Frame built from the response's 'data' entry.
    Whatever 'req.json()' or the 'data' lookup raises on an unusable
    response is propagated to the caller.
    """
    time.sleep(delay)
    req = set_params_and_request(url, sub_reddit, size, before)
    status = req.status_code

    # plain int tag (1 = INFO, 2 = WARN) for log_to_file
    log_tag = 1 if status == 200 else 2
    log_to_file(f'{sub_reddit} STATUS ({sub_com}): {status}', show_info_tag=log_tag)

    count = 0
    while status >= 500 and count < num_retry:
        log_to_file('CONNECTION TIMED OUT', show_info_tag=2)
        time.sleep(delay)
        req = set_params_and_request(url, sub_reddit, size, before)
        status = req.status_code
        log_to_file(f'{sub_reddit} STATUS ({sub_com}): {status}', show_info_tag=1 if status == 200 else 2)
        count += 1

    # Report failure only if the server error persisted after the last
    # attempt, not merely because the last retry slot was used.
    if status >= 500:
        log_to_file('COULD NOT RESOLVE CONNECTION!', show_info_tag=3)

    data = req.json()['data']
    df_req = pd.DataFrame(data)

    return df_req

In [9]:
def set_params_and_request(url, sr, size, before, get_metadata = 'true'):
    """
    Build the query parameters for the pushshift.io reddit API and issue
    the GET request.

    'subreddit', 'size' and 'metadata' are always sent; 'before' is sent
    only when it is not <= 0.
    Returns the result of 'requests.get(url, params)'.
    """
    query = {'subreddit': sr, 'size': size}
    if not (before <= 0):
        query['before'] = before    # 'created_utc'
    # 'metadata' is added last so the key order matches both original layouts
    query['metadata'] = get_metadata
    return requests.get(url, query)

In [10]:
def make_before_vals_df(sr1, sr2, sub_com, value_sr1, value_sr2, to_terminal=True):
    """
    Build the two-row DataFrame of 'before' values for a pair of subreddits.

    Columns are 'subreddit', 'sub_com' and 'before_value'; each value is
    stored as text with a leading underscore. Both values are logged
    ('to_terminal' controls the terminal echo) and a FINISH timestamp is
    printed. Returns the DataFrame; writing it to CSV is left to the caller.
    """
    pairs = ((sr1, value_sr1), (sr2, value_sr2))
    rows = [[name, sub_com, value] for name, value in pairs]
    frame = pd.DataFrame(rows, columns=['subreddit', 'sub_com', 'before_value'])
    as_text = frame['before_value'].apply(str)
    frame['before_value'] = '_' + as_text

    for name, value in pairs:
        log_to_file(f'BEFORE VAL, {name} {sub_com}: {value}', print_to_terminal=to_terminal)
    finished_at = datetime.datetime.now()
    print(finished_at.strftime('FINISH: %Y-%m-%d, %H:%M\n'))
    return frame

In [11]:
def write_to_csv(df_output, sr, sub_com, fn = ''):
    """
    Writes a DataFrame to a csv file in '../data/' (the directory is
    created when missing and cannot be altered).

    Parameters:
        df_output - DataFrame to write (index is not written)
        sr        - subreddit name, used in the default filename
        sub_com   - 'submission'/'comment' label, used in the default filename
        fn        - filename; when empty, the default
                    '<%Y-%m-%d_%H%M>_<sr>_<sub_com>.csv' is used,
                    e.g. '2021-04-24_0014_AMA_comments.csv'

    Returns nothing.
    """
    if not os.path.exists('../data/'):
        log_to_file(f'  - Create Data Directory: ../data/')
        os.makedirs('../data/')
    if len(fn) == 0:
        fn = datetime.datetime.now().strftime('%Y-%m-%d_%H%M_') + sr + '_' + sub_com + '.csv'
    output_file = '../data/' + fn
    # f-string so the actual filename is logged, not the literal '{fn}'
    log_to_file(f'  - Writing to file: {fn}')
    df_output.to_csv(output_file, index=False)
    return

In [12]:
def loop_call_scraper(num_loops, size=100, sub_reddit_1 = 'AMA', sub_reddit_2 = 'AskReddit', comments_only = False):
    """
    Wrapper that calls 'call_scraper' 'num_loops' times and compiles the
    returned Data Frames into one CSV file per subreddit and type
    (comments, plus submissions unless 'comments_only' is True).

    Parameters:
        num_loops     - number of 'call_scraper' iterations
        size          - rows requested per API call
        sub_reddit_1  - first subreddit name
        sub_reddit_2  - second subreddit name
        comments_only - when True, submissions are neither fetched nor written

    Whatever has been collected is written even when an iteration raises
    (the exception is then re-raised), because 'call_scraper' has already
    advanced the stored 'before' values for the completed iterations.
    A type/subreddit with no collected data is skipped with a warning.
    Returns nothing.
    """
    ersion = '1.1.0.1'   # rendered as 'v1.1.0.1' by the banner f-string below

    # (subreddit, filename label, slot in call_scraper's result, collected frames)
    targets = [
        (sub_reddit_1, 'comments', 0, []),
        (sub_reddit_2, 'comments', 1, []),
    ]
    if not comments_only:
        targets += [
            (sub_reddit_1, 'submissions', 2, []),
            (sub_reddit_2, 'submissions', 3, []),
        ]

    print_special_line_to_log(0)
    print_special_line_to_log(2)
    log_to_file(f'  BEGIN LOOP SCRAPER - v{ersion}')
    print_special_line_to_log(1)

    try:
        for i in range(num_loops):
            log_to_file(f'(lOOP # {i + 1} of {num_loops})')
            ldf = call_scraper(size, sub_reddit_1, sub_reddit_2, coms_only=comments_only)
            for _, _, slot, frames in targets:
                frames.append(ldf[slot][0])
    finally:
        log_to_file(f' *** Writing to CSV file(s) ***')
        for sr, label, _, frames in targets:
            if not frames:
                # pd.concat raises on an empty list (e.g. num_loops == 0)
                log_to_file(f'  - No data collected for {sr} {label}; nothing written', show_info_tag=2)
                continue
            write_to_csv(pd.concat(frames), sr, label)

    print_special_line_to_log(0)
    print_special_line_to_log(1)
    log_to_file(f'  FINISH LOOP SCRAPER')
    print_special_line_to_log(2)
    print_special_line_to_log(0)
    print_special_line_to_log(0)

    return

In [13]:

#######################################
###### CALL THE MAIN FUNCTION!!! ######
#######################################


#loop_call_scraper(10, size=200, sub_reddit_1 = 'AMA', sub_reddit_2 = 'AskReddit', comments_only = False)


In [15]:
# Run 20 scraper iterations (size=100 per request) on AMA and AskReddit, comments only.
loop_call_scraper(20, size=100, sub_reddit_1 = 'AMA', sub_reddit_2 = 'AskReddit', comments_only = True)


  BEGIN LOOP SCRAPER - v1.1.0.1
--------------------------------------------------------------------------------------------------
(lOOP # 1 of 20)

--------------------------------------------------------------------
  Begin Iteration of call_scraper
--------------------------------------------------------------------


START:  2021-05-04, 09:12
 *** FILE NOT FOUND: {filename} ***
      - SETTING "BEFORE" VALUES TO ZERO (0)
AMA STATUS (comment): 200
AskReddit STATUS (comment): 200
BEFORE VAL, AMA comment: 1620093380
BEFORE VAL, AskReddit comment: 1620094754
FINISH: 2021-05-04, 09:12

(lOOP # 2 of 20)

--------------------------------------------------------------------
  Begin Iteration of call_scraper
--------------------------------------------------------------------


START:  2021-05-04, 09:12
READ IN "BEFORE" VALS: 
0    1620093380
1    1620094754
Name: before_value, dtype: int64
AMA STATUS (comment): 200
AskReddit STATUS (comment): 200
BEFORE VAL, AMA comment: 1620091560
BEFORE

In [16]:
# Run 100 scraper iterations (size=100 per request) on AMA and AskReddit, comments only.
loop_call_scraper(100, size=100, sub_reddit_1 = 'AMA', sub_reddit_2 = 'AskReddit', comments_only = True)


  BEGIN LOOP SCRAPER - v1.1.0.1
--------------------------------------------------------------------------------------------------
(lOOP # 1 of 100)

--------------------------------------------------------------------
  Begin Iteration of call_scraper
--------------------------------------------------------------------


START:  2021-05-04, 09:20
READ IN "BEFORE" VALS: 
0    1620062079
1    1620094071
Name: before_value, dtype: int64
AMA STATUS (comment): 200
AskReddit STATUS (comment): 200
BEFORE VAL, AMA comment: 1620059282
BEFORE VAL, AskReddit comment: 1620094036
FINISH: 2021-05-04, 09:21

(lOOP # 2 of 100)

--------------------------------------------------------------------
  Begin Iteration of call_scraper
--------------------------------------------------------------------


START:  2021-05-04, 09:21
READ IN "BEFORE" VALS: 
0    1620059282
1    1620094036
Name: before_value, dtype: int64
AMA STATUS (comment): 200
AskReddit STATUS (comment): 200
BEFORE VAL, AMA comment: 162005