# Data Collection

### Imported Libraries

In [1]:
import time
import requests
import numpy as np
import pandas as pd

The first step in this project was collecting post data from Reddit (reddit.com). The Pushshift.io Reddit API facilitated the data collection by returning up to 100 posts per request and restricting the search to a specific subreddit. Additionally, Pushshift has a "before" parameter that allowed me to specify the date and time before which the desired posts were created.

In [2]:
def get_data(subreddit, size, before_time=1611893764, session=None, pause=3):
    """Collect submissions from one subreddit through the Pushshift search API.

    Requests are issued repeatedly, each one asking for posts created before
    the oldest post received so far, until `size` posts have been gathered or
    the API returns an empty batch.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search.
    size : int
        Total number of posts wanted.
    before_time : int, default 1611893764
        Epoch timestamp sent as the "before" parameter of the first request.
    session : object, optional
        Anything exposing ``get(url, params=..., timeout=...)`` that returns a
        response with ``raise_for_status()`` and ``json()`` (for example a
        ``requests.Session``). Defaults to the ``requests`` module.
    pause : float, default 3
        Seconds to wait between consecutive requests.

    Returns
    -------
    pandas.DataFrame
        One row per collected post, with a fresh 0..n-1 index. Contains fewer
        than `size` rows (possibly none) when the API runs out of posts.

    Raises
    ------
    Whatever ``raise_for_status()`` raises for an HTTP error status, and
    ``KeyError`` if the JSON payload has no ``'data'`` entry.
    """
    # The base_url is provided in the pushshift documentation
    base_url = 'https://api.pushshift.io/reddit/search/submission'
    http = requests if session is None else session

    batch_limit = 100      # posts requested per call
    report_every = 2000    # progress message interval, in posts
    next_report = report_every

    # A list to capture each request
    posts = []
    posts_length = 0

    # The while loop runs until the desired number of posts have been accumulated
    while posts_length < size:
        params = {
            "subreddit": subreddit,
            # Never ask for more than is still missing, so the result does not overshoot `size`
            "size": min(batch_limit, size - posts_length),
            "before": before_time,
        }
        res = http.get(base_url, params=params, timeout=30)
        # Surface HTTP errors (e.g. rate limiting) instead of failing later on a bad payload
        res.raise_for_status()
        data = res.json()['data']

        # An empty batch means there is nothing older left to fetch
        if not data:
            break

        df = pd.DataFrame(data)

        # Captures the before time so the next request pulls older posts
        before_time = int(df['created_utc'].min())
        posts.append(df)
        posts_length += len(data)

        # A print statement to show progress (larger requests will take more time).
        # Threshold-based so it keeps reporting even after a short batch.
        if posts_length >= next_report:
            print(f'Post {posts_length} of {size}')
            next_report = (posts_length // report_every + 1) * report_every

        # Best practice - slow the number of requests to lessen the demand on the website
        if pause and posts_length < size:
            time.sleep(pause)

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not posts:
        return pd.DataFrame()
    return pd.concat(posts, ignore_index=True)

# Code Help:

# push api -- https://github.com/pushshift/api
# tutorial --> https://www.youtube.com/watch?v=AcrjEWsMi_E&feature=youtu.be
# time --> https://stackoverflow.com/questions/52004801/how-to-slow-down-asynchrounous-api-calls-to-match-api-limits
# utc time converter --> https://www.epochconverter.com
# web scraping loop --> https://medium.com/better-programming/how-to-scrape-multiple-pages-of-a-website-using-a-python-web-scraper-4e2c641cff8
# web scraping loop --> https://levelup.gitconnected.com/make-your-python-web-scraper-smarter-6233f2d10c3f

### Bodyweight Fitness Data

In [None]:
# Request 10,000 submissions from the bodyweightfitness subreddit
bodyweight_fitness = get_data(subreddit='bodyweightfitness', size=10_000)

In [6]:
# Save the collected bodyweightfitness posts as CSV, leaving out the DataFrame index
bodyweight_fitness.to_csv(path_or_buf='../datasets/bodyweight_fitness', index=False)

### Powerlifting Data

In [7]:
# Request 10,000 submissions from the powerlifting subreddit
powerlifting = get_data(subreddit='powerlifting', size=10_000)

Post 1000 of 10000
Post 2000 of 10000
Post 3000 of 10000
Post 4000 of 10000
Post 5000 of 10000
Post 6000 of 10000
Post 7000 of 10000
Post 8000 of 10000
Post 9000 of 10000
Post 10000 of 10000


In [8]:
# Save the collected powerlifting posts as CSV, leaving out the DataFrame index
powerlifting.to_csv(path_or_buf='../datasets/powerlifting', index=False)