In [13]:
import requests
from datetime import datetime
import datetime as dt
from dateutil.relativedelta import relativedelta
import traceback
import time
import json
import sys
import csv
import json
import pandas as pd
import numpy as np
from pathlib import Path
import os.path
# username = ""  # put the username you want to download in the quotes
# subreddit = "pushshift"  # put the subreddit you want to download in the quotes
# thread_id = ""  # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific users history from a specific subreddit

# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma seperated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
output_format = "csv"

# default start time is the current time and default end time is all history
# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
# start_time = datetime.strptime("09/01/2020", "%m/%d/%Y" #datetime.utcnow()  #datetime.strptime("10/05/2021", "%m/%d/%Y")

# year = "2020"
# start_time = datetime.strptime(f"04/03/{year}", "%m/%d/%Y") #datetime.utcnow()  #datetime.strptime("10/05/2021", "%m/%d/%Y")
# end_time = datetime.strptime(f"04/01/{year}", "%m/%d/%Y")  #datetime.strptime("09/25/2021", "%m/%d/%Y")

convert_to_ascii = False  # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing



In [14]:
def write_human_line(handle, obj, is_submission, convert_to_ascii):
    handle.write(str(obj['score']))
    handle.write(" : ")
    handle.write(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        handle.write(" : ")
        if convert_to_ascii:
            handle.write(obj['title'].encode(encoding='ascii', errors='ignore').decode())
        else:
            handle.write(obj['title'])
    handle.write(" : u/")
    handle.write(obj['author'])
    handle.write(" : ")
    handle.write(f"https://www.reddit.com{obj['permalink']}")
    handle.write("\n")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                if convert_to_ascii:
                    handle.write(obj['selftext'].encode(encoding='ascii', errors='ignore').decode())
                else:
                    handle.write(obj['selftext'])
        else:
            handle.write(obj['url'])
    else:
        if convert_to_ascii:
            handle.write(obj['body'].encode(encoding='ascii', errors='ignore').decode())
        else:
            handle.write(obj['body'])
    handle.write("\n-------------------------------\n")
    
def write_json_line(handle, obj):
    handle.write(json.dumps(obj))
    handle.write("\n")

In [15]:
# This function converts parent_id from base10 float/int to string
import math
def base36encode(number):
    if math.isnan(number):
        return None
    if isinstance(number, float):
        number = int(number)
    if not isinstance(number, (float, int)):
        return None
    if number == 0:
        return None
    base36 = "0123456789abcdefghijklmnopqrstuvwxyz"
    result = ""
    while number:
        number, i = divmod(number, 36)
        result = base36[i] + result
    return result


thread_id_int = 33989953075
thread_id_str = base36encode(thread_id_int)
thread_id_str

test = "fm4qnn78"
base36encode(int(test, 36))

'fm4qnn78'

In [16]:
def write_csv_line(obj, is_submission):
    
    # Return different dictionaries for posts or comment searches
    if is_submission:
        # "No customised post return fields, if you want to scrape posts pls update"
        return obj
    else:
        row_dict = {'author': obj.get("author", np.nan),
                     'author_fullname': obj.get("author_fullname", np.nan),
                     'created_utc': obj.get("created_utc", np.nan),
                     'utc_datetime_str': obj.get("utc_datetime_str", np.nan),
                     'permalink': obj.get("permalink", np.nan),
                     'score': obj.get("score", np.nan),
                     'score_hidden': obj.get("score_hidden", np.nan),
                     'total_awards_received': obj.get("total_awards_received", np.nan),
                     'body': obj.get("body", np.nan),
                     'is_submitter': obj.get("is_submitter", np.nan),
                     'id': obj.get("id", np.nan),
                     'link_id': obj.get("link_id", np.nan),
                     'parent_id': obj.get("parent_id", np.nan),
                     'nest_level': obj.get("nest_level", np.nan),
                     'subreddit': obj.get("subreddit", np.nan),
                     'subreddit_id': obj.get("subreddit_id", np.nan)}
        if isinstance(row_dict['parent_id'], int):
            row_dict['parent_id'] = base36encode(row_dict['parent_id'])
        return row_dict


In [26]:
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii, debug=False):
    print(f"Now searching for period {end_datetime} until {start_datetime}")
    count = 0
    if output_format == "human" or output_format == "json":
        if convert_to_ascii:
            handle = open(filename, 'w', encoding='ascii')
        else:
            handle = open(filename, 'w', encoding='UTF-8')

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
    first_save = True
    while True:
        new_url = url_base+str(previous_epoch)
        json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages

        
        if debug:
            print(new_url)
            
        try:
            json_data = json_text.json()
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break
#         df = pd.DataFrame(objects)
#         df.to_csv("test1233.csv")
        
        row_list = []
        for obj in objects:

            previous_epoch = obj['created_utc'] - 1
            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                break_out = True
                break
            count += 1

            # Check if comment body exists, removing comment from df is it doesnt exist
            if obj['body'] == "[removed]":
                continue
                
            try:
                if output_format == "human":
                    write_human_line(handle, obj, is_submission, convert_to_ascii)
                elif output_format == "csv":
                    row_list.append(write_csv_line(obj, is_submission))
                elif output_format == "json":
                    write_json_line(handle, obj)
            except Exception as err:
                if 'permalink' in obj:
                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                else:
                    print(f"Couldn't print object, missing permalink: {obj['id']}")
                print(err)
                print(traceback.format_exc())
        
        # Create dataframe from rowlist (which can be saved later)
        df = pd.DataFrame(row_list)

        # Columns to keep
        columns = ['author', 
                   'author_fullname', 
                   'created_utc',
                   'utc_datetime_str',
                   'permalink',
                   'score',
                   'score_hidden',
                   'total_awards_received',
                   'body', 
                   'is_submitter',
                   'id',
                   'link_id',
                   'parent_id',
                   'nest_level',
                   'subreddit',
                   'subreddit_id']
        # When you want to receive the whole dataframe, uncomment the line below
        # columns = df.columns
        
        # Check if file exists, if so append dataframe. Else create new dataframe.
        if len(df.index) == 0:
            print(f"No observations found, saving empty dataframe")
            df.to_csv(filename)
        elif first_save:
            print(f"Saving new dataframe at {filename} with [{len(df.index)}] obs")
            df.to_csv(filename, encoding='utf-8', index=False, columns=columns)
            first_save = False
        else:
            print(f"Appending to {filename} with [{len(df.index)}] obs")
            df.to_csv(filename, mode='a', header=False, index=False, columns=columns)
        
        if break_out:
            break

    if output_format == "human" or output_format == "json":
        handle.close()


# if __name__ == "__main__":
def run(start_time, end_time, subreddit="", username="", thread_id="", comments_save_loc="comments.csv", debug=False):
    filter_string = None
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(0)
    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(0)

    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        if convert_thread_id_to_base_ten:
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    if not thread_id:
        test = "remove this when done"
#         download_from_url(posts_save_loc, url_template.format("submission", filter_string), output_format, start_time, end_time, True, convert_to_ascii)
    download_from_url(comments_save_loc, url_template.format("comment", filter_string), output_format, start_time, end_time, False, convert_to_ascii, debug=False)


In [27]:

username = ""  # put the username you want to download in the quotes
subreddit = "pennystocks"  # put the subreddit you want to download in the quotes
thread_id = ""  

year = "2020"
start_time = datetime.strptime(f"04/05/{year}", "%m/%d/%Y") #datetime.utcnow() 
end_time = datetime.strptime(f"04/01/2020", "%m/%d/%Y")
start_time = dt.datetime(2018, 5, 1, 0, 0)
end_time = dt.datetime(2018, 3, 1, 0, 0)

dir_path = r"E:\Users\Christiaan\Large_Files\Thesis\reddit\comments"
comments_save_loc = f"{dir_path}\comments.csv"
posts_save_loc = f"{dir_path}\posts.csv"

run(start_time=start_time, end_time=end_time, subreddit=subreddit, comments_save_loc=comments_save_loc, debug=False)


Now searching for period 2018-03-01 00:00:00 until 2018-05-01 00:00:00
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1525125600
Saving new dataframe at E:\Users\Christiaan\Large_Files\Thesis\reddit\comments\comments.csv with [3] obs


In [28]:
reddit_dir = r"E:\Users\Christiaan\Large_Files\Thesis\reddit\comments"

# Starting datetime
start_date = dt.datetime(2018, 4, 1, 0, 0)
end_date = dt.datetime(2020, 9, 1, 0, 0)

subreddits = ['wallstreetbets', 'stocks', 'investing', 'stockmarket', 'pennystocks']
subreddits = ['pennystocks']

save = True
for subreddit in subreddits:
    # Create folder to save output
    folder_loc = os.path.join(reddit_dir, subreddit).replace('\\', '/')
    Path(folder_loc).mkdir(parents=True, exist_ok=True)

    date_time = start_date
    while date_time < end_date:
        # Create 1 month search period in epoch time
        year_and_month = date_time.strftime('%Y_%m')
        
        # As the search run() and download_from_url() function work time backwards, start time and end time are swapped
        end = date_time
        start = date_time + relativedelta(months=1)

        # Create file to save output
        file_loc = os.path.join(folder_loc, year_and_month).replace('\\', '/') + ".csv"
        print(file_loc)
        # Check if file already exists and skip API request if file exists
        if os.path.isfile(file_loc) and save:
            print(f"File exists: [{file_loc}]")
            date_time = date_time + relativedelta(months=1)

            continue
        
        if save:
            run(start_time=start, end_time=end, subreddit=subreddit, comments_save_loc=file_loc, debug=False)
        else:
            print("Save is off")

        # Adding 1 month to date_time tracker
        date_time = date_time + relativedelta(months=1)

E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_04.csv
File exists: [E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_04.csv]
E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_05.csv
File exists: [E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_05.csv]
E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_06.csv
File exists: [E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_06.csv]
E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_07.csv
File exists: [E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_07.csv]
E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_08.csv
File exists: [E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_08.csv]
E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2018_09.csv
File exists: [E:/Users/Christiaan/Large_Files/Thesis/red

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1590550237
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [987] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1590529959
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [980] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1590515536
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [979] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1590502594
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [976] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1590462802
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1589471837
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [969] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1589460093
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [973] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1589425968
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [975] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1589409231
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [977] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1589400218
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1588715216
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [977] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1588708093
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [969] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1588703252
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [976] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1588696102
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_05.csv with [967] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1588688377
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593297097
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [989] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593283944
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593270047
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [993] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593238390
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [988] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593224513
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593010387
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [989] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593007425
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [982] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1593001643
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [973] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592978247
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [991] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592965268
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592612872
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [990] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592603950
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [986] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592597909
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [996] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592593067
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [989] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592588663
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592344989
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [991] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592337414
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [994] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592332788
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [981] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592327217
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1592322428
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591928523
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [985] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591919502
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [983] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591913452
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591909707
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [986] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591905866
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591709381
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [965] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591700794
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [955] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591677118
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [958] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591669224
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [977] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591662874
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591326620
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [981] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591319653
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [982] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591314631
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [989] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591310589
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [979] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591305816
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591112402
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [974] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591105540
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [975] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591065277
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [987] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591044966
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_06.csv with [982] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1591032943
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1595610427
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [977] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1595602412
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1595592710
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [981] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1595559750
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [991] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1595544777
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1595006048
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [984] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594999060
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [959] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594991549
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [956] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594963129
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [982] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594948609
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594404334
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594396414
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594388744
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594360868
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [972] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594342613
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594047882
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [995] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594045289
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [993] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594042660
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [995] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594037573
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_07.csv with [997] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1594018028
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1598911200
Saving new dataframe at E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [992] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1598889191
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [975] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1598839939
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [968] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1598769009
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [991] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1598687340
Appending to E:/

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1597614633
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [952] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1597534560
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [977] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1597479912
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [979] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1597442019
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [991] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1597427806
Appending to E:/Users/Chris

https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1596805290
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [973] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1596769127
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1596751806
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [984] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1596742447
Appending to E:/Users/Christiaan/Large_Files/Thesis/reddit/comments/pennystocks/2020_08.csv with [978] obs
https://api.pushshift.io/reddit/comment/search?limit=1000&order=desc&subreddit=pennystocks&before=1596732370
Appending to E:/Users/Chris

In [217]:
# path = r"C:\Users\Ck0rt\PycharmProjects\MasterThesis_BMMTFI\thesis\data_source\reddit\test.csv"
# path = r"C:\Users\Ck0rt\PycharmProjects\MasterThesis_BMMTFI\data\reddit\postdownloader\comments.csv"
path = r"test1233.csv"

columns = ['author', 
           'author_fullname', 
           'created_utc',
           'utc_datetime_str',
           'permalink',
           'score',
           'score_hidden',
           'total_awards_received',
           'body', 
           'is_submitter',
           'id',
           'link_id',
           'parent_id',
           'nest_level',
           'subreddit',
           'subreddit_id']
df = pd.read_csv(path)
df


Unnamed: 0.1,Unnamed: 0,author,author_flair_css_class,author_flair_text,body,can_gild,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,parent_id,permalink,score,stickied,subreddit,subreddit_id,subreddit_type,retrieved_utc,updated_utc,body_sha1,utc_datetime_str,edited_on,nest_level,collapsed,collapsed_reason,ups
0,0,s_i_m_s,,,Otherwise id go with the created_utc field go ...,True,0,1519531275,,False,0,dusekcz,False,t3_7zi9am,30159780000.0,/r/pushshift/comments/7zi9am/frequency_of_upda...,1,False,pushshift,t5_37z6f,public,1520216550,1668761992,51f333e820c478f537fcb99c7a10c9ce4c385d35,2018-02-25 04:01:15,,,,,
1,1,Nikhil9R,,,Yeah..seems like there’s something up then,True,0,1519528927,,True,0,duscp25,True,t3_7zi9am,30159330000.0,/r/pushshift/comments/7zi9am/frequency_of_upda...,1,False,pushshift,t5_37z6f,public,1520215429,1668761989,26028d34145da972e5c8d17b0365088fc9a6c1f1,2018-02-25 03:22:07,1519529000.0,,,,
2,2,s_i_m_s,,,Usually it's a stream and updated within secon...,True,0,1519516821,,False,0,dus30iw,False,t3_7zi9am,,/r/pushshift/comments/7zi9am/frequency_of_upda...,1,False,pushshift,t5_37z6f,public,1520209500,1668761977,1ef24a05a853332e104f8765c5f07480e2dbd74f,2018-02-25 00:00:21,,1.0,,,
3,3,Roughy,,,"Ah, thanks for the heads up about the maintain...",True,0,1519085863,,False,0,duin09r,True,t3_7ynd67,30143280000.0,/r/pushshift/comments/7ynd67/fetch_endpoints_5...,1,False,pushshift,t5_37z6f,public,1520029413,1668761228,10cc9096890a301740778bb571adc79703293d21,2018-02-20 00:17:43,,,,,
4,4,inspiredby,,,https://api.pushshift.io/reddit/search/submiss...,True,0,1519081588,,False,0,duij7h3,False,t3_7ynd67,,/r/pushshift/comments/7ynd67/fetch_endpoints_5...,1,False,pushshift,t5_37z6f,public,1520027220,1668761224,ce121b5db335b27abc082bf258f11a855dfc9b42,2018-02-19 23:06:28,,1.0,,,
5,5,BoyAndHisBlob,,,"&gt;If you have any other questions, please fe...",True,0,1518822796,,False,0,dud5bi2,True,t3_7xiemz,30131870000.0,/r/pushshift/comments/7xiemz/522_connection_ti...,1,False,pushshift,t5_37z6f,public,1519317083,1668760714,86a05a4ff857ac18838d83b5e9458c4327825aaf,2018-02-16 23:13:16,,,,,
6,6,s_i_m_s,,,Excellent! Thanks!,True,0,1518800854,,False,0,ducikc3,False,t3_7xiemz,30133050000.0,/r/pushshift/comments/7xiemz/522_connection_ti...,2,False,pushshift,t5_37z6f,public,1519306227,1668760696,df97672c41989fd95aacb4762c120d6a33d98574,2018-02-16 17:07:34,,,,,
7,7,Stuck_In_the_Matrix,,,You want to exclude specific authors from the ...,True,0,1518798472,,False,0,ducfxz2,False,t3_7xiemz,30132950000.0,/r/pushshift/comments/7xiemz/522_connection_ti...,1,False,pushshift,t5_37z6f,public,1519304976,1668760693,2db41f00b872669bafdce12e02650a231289c001,2018-02-16 16:27:52,,,,,
8,8,s_i_m_s,,,API question then:\nIs there a way to search c...,True,0,1518796320,,False,0,ducdni7,False,t3_7xiemz,30131870000.0,/r/pushshift/comments/7xiemz/522_connection_ti...,1,False,pushshift,t5_37z6f,public,1519303890,1668760691,28886e3a752b869c04b7ac76b6d0c780165a96d2,2018-02-16 15:52:00,,,,,
9,9,Stuck_In_the_Matrix,,,"Yes, you are correct. There are issues with D...",True,0,1518757458,,False,0,dubqkfz,False,t3_7xiemz,30131590000.0,/r/pushshift/comments/7xiemz/522_connection_ti...,1,False,pushshift,t5_37z6f,public,1519292902,1668760648,01890a3f0edd3626ea8f6e206273c3b85c1e2ddb,2018-02-16 05:04:18,,,,,


# ---------------------- Vault START ----------------------

In [None]:
def write_csv_line(writer, obj, is_submission, header=False):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    print(output_list)
    writer.writerow(output_list)

def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii, debug=False):
    print(f"Saving to {filename}")

    count = 0
    if output_format == "human" or output_format == "json":
        if convert_to_ascii:
            handle = open(filename, 'w', encoding='ascii')
        else:
            handle = open(filename, 'w', encoding='UTF-8')
    else:
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
    while True:
        new_url = url_base+str(previous_epoch)
        json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
        
        if debug:
            print(new_url)

        try:
            json_data = json_text.json()
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break
            
        df = pd.DataFrame(objects)
        df.to_csv("test.csv", header=True, index=False)

        for obj in objects:

            previous_epoch = obj['created_utc'] - 1
            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                break_out = True
                break
            count += 1
            try:
                if output_format == "human":
                    write_human_line(handle, obj, is_submission, convert_to_ascii)
                elif output_format == "csv":
                    write_csv_line(writer, obj, is_submission)
                elif output_format == "json":
                    write_json_line(handle, obj)
            except Exception as err:
                if 'permalink' in obj:
                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                else:
                    print(f"Couldn't print object, missing permalink: {obj['id']}")
                print(err)
                print(traceback.format_exc())

        if break_out:
            break

        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

    print(f"Saved {count}")
    handle.close()


# if __name__ == "__main__":
def run(comments_save_loc="comments.csv", posts_save_loc="posts.csv", debug=False):
    filter_string = None
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(0)
    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(0)

    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        if convert_thread_id_to_base_ten:
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    if not thread_id:
        test = "remove this when done"
#         download_from_url(posts_save_loc, url_template.format("submission", filter_string), output_format, start_time, end_time, True, convert_to_ascii)
    download_from_url(comments_save_loc, url_template.format("comment", filter_string), output_format, start_time, end_time, False, convert_to_ascii, debug=False)

username = ""  # put the username you want to download in the quotes
subreddit = "pushshift"  # put the subreddit you want to download in the quotes
thread_id = ""  

year = "2020"
start_time = datetime.strptime(f"04/02/{year}", "%m/%d/%Y") #datetime.utcnow() 
end_time = datetime.strptime(f"04/01/{year}", "%m/%d/%Y")

dir_path = r"E:\Users\Christiaan\Large_Files\Thesis\reddit\comments"
comments_save_loc = f"{dir_path}\comments.csv"
posts_save_loc = f"{dir_path}\posts.csv"
run(comments_save_loc, posts_save_loc)


# ---------------------- Vault END ----------------------

# Vault

In [2]:
import datetime as dt
import time
from dateutil.relativedelta import relativedelta

import pandas as pd
import praw
from pmaw import PushshiftAPI
api = PushshiftAPI(num_workers=10)
# api_praw = PushshiftAPI(praw=reddit)

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)

from pathlib import Path
import os.path

In [19]:
start_date = dt.datetime(2018, 4, 1, 0, 0)
end_date = dt.datetime(2020, 4, 3, 0, 0)
start = int(start_date.timestamp())
end = int(end_date.timestamp())


posts = api.search_submissions(subreddit='wallstreetbets', limit=3000, after=start, before=end)
print(f'Retrieved {len(posts)} posts from Pushshift in seconds')

df = pd.DataFrame(posts)

Retrieved 0 posts from Pushshift in seconds


In [None]:
save = False

if save:
    # Directory where Reddit data is saved
    reddit_dir = r"C:/Users/Ck0rt/Documents/Large files/School/MSc Finance & Investments/Thesis/Reddit/posts"

    # Starting datetime
    start_date = dt.datetime(2018, 4, 1, 0, 0)
    end_date = dt.datetime(2020, 9, 1, 0, 0)

    subreddits = ['wallstreetbets', 'stocks', 'investing', 'stockmarket', 'pennystocks']

    for subreddit in subreddits:
        # Create folder to save output
        folder_loc = os.path.join(reddit_dir, subreddit).replace('\\', '/')
        print(folder_loc)
        Path(folder_loc).mkdir(parents=True, exist_ok=True)

        date_time = start_date
        while date_time < end_date:
            # Create 1 month search period in epoch time
            year_and_month = date_time.strftime('%Y_%m')
            start = int(date_time.timestamp())
            end = date_time + relativedelta(months=1)
            end = int(end.timestamp())

            # Create file to save output
            file_loc = os.path.join(folder_loc, year_and_month).replace('\\', '/') + ".csv"

            # Check if file already exists and skip API request if file exists
            if os.path.isfile(file_loc):
                print(f"File exists: [{file_loc}]")
                date_time = date_time + relativedelta(months=1)

                continue

            # Api cooldown time
            time.sleep(3)
            print(f"Now collecting data for [{subreddit}] in [{date_time.strftime('%B %Y')}]")

            # Request data from Pushshift
            start_time = time.time()
            posts = api.search_submissions(subreddit=subreddit, limit=300000, after=start, before=end)
            print(f'Retrieved {len(posts)} posts from Pushshift in [{time.time() - start_time}] seconds')

            # Save output to CSV via dataframe
            reddit_df = pd.DataFrame(posts)

            columns = ['author', 'created_utc', 'full_link', 'id', 'num_comments', 'score', 'selftext',
                       'subreddit', 'subreddit_id', 'subreddit_subscribers', 'title', 'url']
            reddit_df.to_csv(file_loc, header=True, index=False, columns=columns)
            print(f"Saving csv at [{file_loc}]")

            # Adding 1 month to date_time tracker
            date_time = date_time + relativedelta(months=1)
