In [1]:
# import library
import requests
import pandas as pd
from datetime import datetime
import time

In [2]:
def gather_data(subreddit, limit, since, until):
    """
    Connect to API and pull data.
    
    Args:
        subreddit (string): The name of subreddit
        limit (number): The number of submisison returned
        since (string): The start epoch time
        until (string): The end epoch time
        
    Returns:
        list: The list of each submission with 'selftext', 'title' and 'subreddit'
    """
    
    url = 'https://api.pushshift.io/reddit/search/submission'
    
    params = {
        'subreddit': subreddit,
        'limit': limit,
        'since': since,
        'until': until
    }
    
    res = requests.get(url, params)
    if res.status_code == 200:
        data = res.json()
           
        return [{'selftext': row['selftext'], 'title': row['title'], 'subreddit': row['subreddit']} for row in data['data']]
            
    else:
        return None

In [3]:
def create_data(subreddit):
    """
    Combine data from each pull and transform it into a single pandas dataframe.
    
    Args:
        subreddit (string): The name of subreddit
        
    Returns:
        None
    """
    
    
    cur_time = datetime(2023, 4, 19, 19, 19, 19)
    epoch_time = datetime.timestamp(cur_time)
    epoch_time_interval = 24 * 60 * 60 * 60
    time_interval = [(str(int(epoch_time - (i + 1) * epoch_time_interval)), str(int(epoch_time - i * epoch_time_interval - 360))) for i in range(12)]
    
    remain_interval = [*time_interval]
    
    all_data = []
    while remain_interval:
        for since, until in remain_interval:
            cur_data = gather_data(subreddit, 500, since, until)
            if cur_data:
                all_data.extend(cur_data)
                remain_interval.remove((since, until))

            time.sleep(15)
        print(remain_interval)
    pd.DataFrame(all_data).to_csv(f'../data/{subreddit}.csv', index = False)

In [4]:
# Iterate over two subreddits and invoke the function to collect and generate the data
for sub in ['SoftwareEngineering', 'datascience']:
    create_data(sub)

[('1671589159', '1676772799'), ('1661221159', '1666404799'), ('1650853159', '1656036799'), ('1640485159', '1645668799'), ('1635301159', '1640484799'), ('1624933159', '1630116799')]
[('1661221159', '1666404799'), ('1640485159', '1645668799'), ('1624933159', '1630116799')]
[('1640485159', '1645668799')]
[]
[('1671589159', '1676772799'), ('1661221159', '1666404799'), ('1650853159', '1656036799'), ('1640485159', '1645668799'), ('1630117159', '1635300799'), ('1619749159', '1624932799')]
[('1661221159', '1666404799'), ('1640485159', '1645668799'), ('1619749159', '1624932799')]
[('1640485159', '1645668799')]
[]
