In [1]:
from pmaw import PushshiftAPI
import pandas as pd
import datetime as dt
import math
import os
import glob

In [2]:
# Instantiate the Pushshift API client shared by the cells below.
# The worker count is given a name so it is easy to locate and tune.
NUM_WORKERS = 20
api = PushshiftAPI(num_workers=NUM_WORKERS)

In [3]:
# Calendar years to collect (2017 through 2021 inclusive).
years = list(range(2017, 2022))

# Subreddits exported under the "public" college type.
public_subreddits = [
    "UCSD",
    "FSU",
    "ucla",
    "UCSantaBarbara",
    "UCI",
    "UCDavis",
    "UCSC",
    "uofm",
    "udub",
    "UTAustin",
    "PennStateUniversity",
    "gatech",
]

# Subreddits exported under the "private" college type.
private_subreddits = [
    "columbia",
    "Harvard",
    "yale",
    "UPenn",
    "BrownU",
    "mit",
    "BostonU",
    "USC",
    "Syracuse",
    "notredame",
    "georgetown",
    "stanford",
]

In [4]:
# convert an epoch timestamp to a readable datetime
def get_date(created):
    """Turn a Unix timestamp (seconds) into a ``datetime`` object.

    ``datetime.fromtimestamp`` is called without a ``tz`` argument, so the
    result is a naive datetime expressed in the local timezone of the
    machine running the notebook.
    """
    readable = dt.datetime.fromtimestamp(created)
    return readable

In [5]:
# Month should be one of the following: 01, 02, 03, 04, ..., 12
def get_top_per_month(month, df, top_fraction=0.05):
    """Return the highest-scoring posts of one calendar month.

    Parameters
    ----------
    month : str or int
        Month to select. The canonical form is a zero-padded string
        ("01" ... "12"); an int or unpadded string is padded to two digits.
    df : pandas.DataFrame
        Must have a datetime-like "created_utc" column and a "score" column.
        Only the month is compared, so rows from different years that fall
        in the same month are pooled together.
    top_fraction : float, default 0.05
        Share of the month's posts to keep, in the range (0, 1]. The row
        count is rounded up, so any non-empty month yields at least one post.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by "score" in descending order. Empty when
        the month has no posts.

    Raises
    ------
    ValueError
        If ``top_fraction`` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError("top_fraction must be in the range (0, 1]")

    month_key = str(month).zfill(2)
    in_month = df[df["created_utc"].dt.strftime("%m") == month_key]

    # A stable sort makes the choice between equally scored posts at the
    # cut-off deterministic instead of depending on the sort algorithm.
    ranked = in_month.sort_values("score", ascending=False, kind="mergesort")

    num_top = math.ceil(len(ranked.index) * top_fraction)
    return ranked.head(num_top)

In [6]:
# Given a subreddit, get all posts from that subreddit for 2017-2021
# college_type is either "private" or "public"
def get_college_posts(subreddit, college_type):
    """Fetch, filter and export the top posts of a subreddit, one CSV per year.

    For each year in the module-level ``years`` list this function:

    1. queries Pushshift (through the module-level ``api``) for submissions
       created in that year,
    2. keeps only the relevant columns and converts "created_utc" with
       ``get_date``,
    3. drops posts whose "selftext" is missing, empty, "[deleted]" or
       "[removed]",
    4. keeps the top-scoring posts of each month via ``get_top_per_month``,
    5. writes the result to ``data/<college_type>/<subreddit>/<subreddit><year>.csv``.

    A year for which Pushshift returns nothing is skipped with a message; no
    CSV is written for it. Previously that case raised ``KeyError`` on the
    column selection.

    Parameters
    ----------
    subreddit : str
        Subreddit name, also used for the output folder and file names.
    college_type : str
        Either "private" or "public"; used as the parent output folder.
    """
    columns = ["id", "url", "title", "score", "num_comments", "created_utc", "selftext"]
    unusable_bodies = ["[deleted]", "[removed]", ""]

    out_dir = os.path.join("data", college_type, subreddit)
    os.makedirs(out_dir, exist_ok=True)

    for year in years:
        # timestamps
        # NOTE(review): these are naive datetimes, so the year boundaries (and
        # the month assigned by get_date) follow the local timezone of the
        # machine running the notebook, not UTC. Confirm this is intended.
        before = int(dt.datetime(year + 1, 1, 1).timestamp())
        after = int(dt.datetime(year, 1, 1).timestamp())

        # get posts; store results in cache
        posts = api.search_submissions(subreddit=subreddit, before=before, after=after, safe_exit=True)
        print(f'Retrieved {len(posts)} posts from Pushshift')

        # convert to DataFrame
        posts_df = pd.DataFrame(posts)
        if posts_df.empty:
            # Nothing came back (quiet subreddit or incomplete query):
            # there are no columns to select, so skip this year.
            print(f'No posts to process for {subreddit} in {year}; skipping')
            continue

        # Keep only the relevant fields. reindex returns a new frame (so the
        # assignment below does not write into a slice) and turns a field that
        # is absent from every post into an all-NaN column instead of a KeyError.
        posts_df = posts_df.reindex(columns=columns)
        posts_df["created_utc"] = posts_df["created_utc"].apply(get_date)

        # filter out deleted posts, posts that only contain photos, and posts with no body
        body = posts_df["selftext"]
        posts_df = posts_df.loc[body.notna() & ~body.isin(unusable_bodies)]

        # get top 5% of posts for each month ("01" .. "12") and combine them
        monthly_top = [get_top_per_month(f"{month:02d}", posts_df) for month in range(1, 13)]
        posts_df = pd.concat(monthly_top)

        # export to csv
        posts_df.to_csv(os.path.join(out_dir, subreddit + str(year) + ".csv"), index=False)

In [None]:
# Split up runs so function doesn't timeout
# First half of the public subreddits (positions 0-5).
for name in public_subreddits[0:6]:
    get_college_posts(name, "public")

In [12]:
# Second half of the public subreddits (positions 6-11).
for name in public_subreddits[6:12]:
    get_college_posts(name, "public")

Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 3183 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 5210 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 10025 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 10432 posts from Pushshift
Retrieved 8641 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 2244 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 2699 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 4320 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 6892 posts from Pushshift
Retrieved 9548 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 3420 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 3290 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 6278 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 9483 posts from Pushshift
Retrieved 11776 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 4583 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 4757 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 5990 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 9127 posts from Pushshift
Retrieved 10521 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1916 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 2196 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 3002 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 4650 posts from Pushshift
Retrieved 5717 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 5817 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 6664 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 7602 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 7854 posts from Pushshift
Retrieved 8017 posts from Pushshift


In [7]:
# Merge data for all years for a given subreddit into one csv
# Note that every csv file in the subreddit subdirectory is merged, except the
# "<subreddit>_all_time.csv" file this function itself writes
def aggregate_by_subreddit(subreddit, college_type):
    """Combine the CSV files of a subreddit into ``<subreddit>_all_time.csv``.

    Reads every ``*.csv`` file in ``data/<college_type>/<subreddit>`` except a
    previously written ``<subreddit>_all_time.csv``, so calling the function
    again does not feed its own output back in and duplicate rows. Files are
    read in sorted name order to make the row order reproducible.

    Prints the number of merged rows and writes the combined file next to the
    inputs. If there are no input files, a CSV with only the header is written.

    Parameters
    ----------
    subreddit : str
        Subreddit name; selects the folder and names the output file.
    college_type : str
        Parent folder under ``data/`` (the notebook uses "public" and "private").
    """
    columns = ["id", "url", "title", "score", "num_comments", "created_utc", "selftext"]
    path = "data/" + college_type + "/" + subreddit
    output_name = subreddit + "_all_time.csv"

    # Exclude our own output so a re-run starts from the same inputs.
    input_files = sorted(
        f for f in glob.glob(path + "/*.csv")
        if os.path.basename(f) != output_name
    )

    # Read everything first and concatenate once.
    frames = [pd.read_csv(f) for f in input_files]
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    print(len(df.index))
    df.to_csv(path + "/" + output_name, index=False)

In [None]:
# Merge the yearly files of the first two public subreddits.
for name in ("UCSD", "FSU"):
    aggregate_by_subreddit(name, "public")

In [None]:
# Read the merged FSU file back in to check the export.
fsu_all_time_path = "data/public/FSU/FSU_all_time.csv"
test = pd.read_csv(fsu_all_time_path)
test

In [14]:
# Merge the yearly files of the remaining public subreddits (positions 2-11).
for name in public_subreddits[2:12]:
    aggregate_by_subreddit(name, "public")

2123
1504
2041
1459
1496
964
1202
1175
636
1151


In [None]:
# Displays number of shards (nodes) that are successful
shard_status = api.metadata_.get('shards')
shard_status

In [16]:
# Split up runs so function doesn't timeout
# Private subreddits, positions 0-3.
for name in private_subreddits[0:4]:
    get_college_posts(name, "private")

Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 592 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 630 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1437 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 3288 posts from Pushshift
Retrieved 5886 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 639 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 675 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1055 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1691 posts from Pushshift
Retrieved 2101 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 227 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 406 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 608 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.


Retrieved 886 posts from Pushshift
Retrieved 1049 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 740 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 861 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1812 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.


Retrieved 4334 posts from Pushshift
Retrieved 5952 posts from Pushshift


In [17]:
# Private subreddits, positions 4-7.
for name in private_subreddits[4:8]:
    get_college_posts(name, "private")

Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 223 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 424 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 538 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.


Retrieved 944 posts from Pushshift
Retrieved 939 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 556 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 655 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 788 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1236 posts from Pushshift
Retrieved 1492 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 890 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1336 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 2769 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 7258 posts from Pushshift
Retrieved 10019 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1461 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1702 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 2970 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 5052 posts from Pushshift
Retrieved 9916 posts from Pushshift


In [19]:
# Private subreddits, positions 8-11.
for name in private_subreddits[8:12]:
    get_college_posts(name, "private")

Retrieved 1032 posts from Pushshift
Retrieved 1068 posts from Pushshift
Retrieved 1331 posts from Pushshift
Retrieved 1981 posts from Pushshift
Retrieved 2814 posts from Pushshift
Retrieved 387 posts from Pushshift
Retrieved 461 posts from Pushshift
Retrieved 677 posts from Pushshift
Retrieved 1458 posts from Pushshift
Retrieved 1475 posts from Pushshift
Retrieved 234 posts from Pushshift
Retrieved 205 posts from Pushshift
Retrieved 319 posts from Pushshift
Retrieved 557 posts from Pushshift
Retrieved 728 posts from Pushshift
Retrieved 745 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 801 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 1672 posts from Pushshift


Not all PushShift shards are active. Query results may be incomplete.
Not all PushShift shards are active. Query results may be incomplete.


Retrieved 2968 posts from Pushshift
Retrieved 4453 posts from Pushshift


In [20]:
# Merge the yearly files of every private subreddit (positions 0-11).
for name in private_subreddits[0:12]:
    aggregate_by_subreddit(name, "private")

508
169
127
582
143
191
937
846
283
174
90
384


In [21]:
# Displays number of shards (nodes) that are successful
shard_status = api.metadata_.get('shards')
shard_status

{'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4}