# Reddit comment-collection tool for COVID-19 sentiment analysis

## Set-up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import datetime as dt
import time
import requests
from bs4 import BeautifulSoup

In [2]:
# Pushshift search endpoints: one for submissions, one for comments
sub_url, comment_url = (
    'https://api.pushshift.io/reddit/search/submission',
    'https://api.pushshift.io/reddit/search/comment',
)

#### Define subreddits, fields gathered

In [3]:
# Subreddits to collect from; swap in other localities here
subreddits = [
    'nyc',
    'houston',
]

In [4]:
# Fields requested from the submission endpoint
submission_fields = [
    'id',
    'title',
    'created_utc',
    'num_comments',
    'subreddit',
]
# Fields requested from the comment endpoint
comment_fields = [
    'link_id',
    'body',
    'created_utc',
    'subreddit',
    'score',
]

#### Set key terms; Name data

In [5]:
# Batch name: the leading part of the file name of every data file saved by
# this notebook (it is used as a prefix, not a suffix)
prefix = '2020_general'

In [6]:
# Search terms intended for Pushshift's 'q' parameter; presumably '|' separates
# alternative terms (OR) -- TODO confirm against the Pushshift documentation.
# NOTE(review): the only visible use ('q' in the submissions request) is
# commented out, so these terms currently do not filter anything.
keywords = 'covid|quarantine|pandemic|coronavirus'

#### Set Time

In [7]:
# Preview the epoch timestamp used as the end of the search window.
# The datetime is naive, so .timestamp() interprets it in the machine's local
# time zone and the value differs between machines in different zones.
int(dt.datetime(2020,5,10,0,0,0).timestamp())

1589083200

In [8]:
# Time window: the search ends at t2 and reaches back 'span_days' days to find t1.
# Both bounds are epoch timestamps in seconds (86400 seconds = 1 day).

# Search span, in days
span_days = 80

# End of the window. Alternatives that were tried:
#   t2 = round(time.time())                                               # now
#   t2 = round((dt.datetime.now() - dt.timedelta(days=365)).timestamp())  # one year ago
# NOTE(review): the datetime is naive, so it is interpreted in local time.
t2 = int(dt.datetime(year=2020, month=5, day=10).timestamp())

# Start of the window, kept as a string
t1 = str(t2 - int(dt.timedelta(days=span_days).total_seconds()))

## Submissions

In [9]:
# Get submissions: for every subreddit, request the most-commented submissions
# created between t1 and t2, keep those with at least one comment, and stack
# the per-subreddit results into `submissions`.
df_list = []

# Upper bound of the search window (same for every subreddit)
start_time = t2

for subreddit in subreddits:
    res = requests.get(
        sub_url,
        params={
            'subreddit': subreddit,
            # 'q' : keywords,
            'fields': submission_fields,
            'size': 400,
            'sort_type': 'num_comments',
            'sort': 'desc',
            'before': start_time,
            'after': t1,
        },
        # Without a timeout a stalled connection blocks the notebook forever
        timeout=60,
    )
    # Make sure we got a 2xx response
    res.raise_for_status()

    posts = res.json()['data']
    if not posts:
        # An empty result would build a frame with no 'num_comments' column
        # and the filter below would raise KeyError, so skip this subreddit.
        continue

    df = pd.DataFrame(posts)

    # Keep only submissions that have at least one comment
    df = df[df['num_comments'] > 0]

    df_list.append(df)

if df_list:
    submissions = pd.concat(df_list, axis=0)
else:
    # pd.concat raises on an empty list; fall back to an empty, typed-by-name frame
    submissions = pd.DataFrame(columns=submission_fields)

# Calendar date of creation (ISO format); fromtimestamp uses the local time zone
submissions['date'] = [dt.date.fromtimestamp(x).isoformat() for x in submissions['created_utc']]

## Comments

In [10]:
# Map each subreddit to the ids of its collected submissions, so that comments
# can be requested per submission
link_ids = {}
for sub in subreddits:
    in_sub = submissions["subreddit"] == sub
    link_ids[sub] = submissions.loc[in_sub, "id"]

In [11]:
# Get comments: for every subreddit, page backwards in time through the
# comments attached to the collected submissions. Each page asks for comments
# created before the oldest comment of the previous page. Paging stops when the
# expected number of comments (sum of 'num_comments') has been fetched or a
# page comes back empty.
df_list = []

for subreddit in subreddits:
    start_time = t2
    c = 0

    # Loop invariants: computed once per subreddit instead of once per page
    expected = submissions.loc[submissions['subreddit'] == subreddit, 'num_comments'].sum()
    # Comment 'link_id' values are submission ids with a 't3_' prefix
    submission_links = ['t3_' + n for n in link_ids[subreddit]]  #/comment?link_id : /submission?ids

    while c < expected:
        # Pause between requests to avoid hammering the API
        time.sleep(2)
        res = requests.get(
            comment_url,
            params={
                'subreddit': subreddit,
                'fields': comment_fields,
                'link_id': submission_links,
                'size': 1000,
                'before': start_time,
            },
            # Without a timeout a stalled connection blocks the notebook forever
            timeout=60,
        )
        # Make sure we got a 2xx response
        res.raise_for_status()

        # Parse the payload once; stop paging when the page is empty
        batch = res.json()['data']
        if not batch:
            break

        df = pd.DataFrame(batch)

        # raise counter by number of rows in df
        c += df.shape[0]

        print(f"Fetched {c} comments from r/{subreddit} since {dt.datetime.fromtimestamp(start_time).isoformat()}")

        df_list.append(df)
        # Next page: everything older than the oldest comment seen so far
        start_time = df['created_utc'].min()

if df_list:
    comments = pd.concat(df_list, axis=0)
else:
    # pd.concat raises on an empty list; fall back to an empty frame
    comments = pd.DataFrame(columns=comment_fields)

# Calendar date of creation (ISO format); fromtimestamp uses the local time zone
comments['date'] = [dt.date.fromtimestamp(x).isoformat() for x in comments['created_utc']]

Fetched 1000 comments from r/nyc since 2020-05-10T00:00:00
Fetched 2000 comments from r/nyc since 2020-05-09T12:30:54
Fetched 3000 comments from r/nyc since 2020-05-08T18:49:44
Fetched 4000 comments from r/nyc since 2020-05-08T13:09:04
Fetched 5000 comments from r/nyc since 2020-05-08T08:20:29
Fetched 6000 comments from r/nyc since 2020-05-07T16:59:32
Fetched 7000 comments from r/nyc since 2020-05-07T08:51:33
Fetched 8000 comments from r/nyc since 2020-05-06T16:15:49
Fetched 9000 comments from r/nyc since 2020-05-06T09:51:11
Fetched 10000 comments from r/nyc since 2020-05-05T16:47:05
Fetched 11000 comments from r/nyc since 2020-05-05T11:53:26
Fetched 12000 comments from r/nyc since 2020-05-04T19:49:35
Fetched 13000 comments from r/nyc since 2020-05-04T09:44:22
Fetched 14000 comments from r/nyc since 2020-05-03T13:03:46
Fetched 15000 comments from r/nyc since 2020-05-02T21:29:31
Fetched 16000 comments from r/nyc since 2020-05-02T13:09:30
Fetched 17000 comments from r/nyc since 2020-05-0

Fetched 138000 comments from r/nyc since 2020-03-06T18:40:52
Fetched 139000 comments from r/nyc since 2020-03-06T07:51:42
Fetched 140000 comments from r/nyc since 2020-03-05T09:43:03
Fetched 141000 comments from r/nyc since 2020-03-04T12:10:32
Fetched 142000 comments from r/nyc since 2020-03-03T12:21:06
Fetched 143000 comments from r/nyc since 2020-03-02T09:09:56
Fetched 144000 comments from r/nyc since 2020-03-01T19:20:31
Fetched 145000 comments from r/nyc since 2020-02-29T10:03:41
Fetched 146000 comments from r/nyc since 2020-02-28T14:27:29
Fetched 147000 comments from r/nyc since 2020-02-27T12:47:21
Fetched 148000 comments from r/nyc since 2020-02-26T12:36:12
Fetched 149000 comments from r/nyc since 2020-02-23T17:26:52
Fetched 149525 comments from r/nyc since 2020-02-21T12:24:47
Fetched 1000 comments from r/houston since 2020-05-10T00:00:00
Fetched 2000 comments from r/houston since 2020-05-08T10:12:25
Fetched 3000 comments from r/houston since 2020-05-07T08:39:03
Fetched 4000 comme

## Sample/Save Data

In [12]:
# Number of comments collected for the least-represented subreddit
per_subreddit_counts = comments['subreddit'].value_counts()
smallest = per_subreddit_counts.min()

# Balance the classes: draw `smallest` random comments from every subreddit
# (fixed seed so the sample is reproducible), then stack the draws
balanced_parts = []
for subreddit in subreddits:
    subset = comments[comments['subreddit'] == subreddit]
    balanced_parts.append(subset.sample(smallest, random_state=101))
comments_sampled = pd.concat(balanced_parts)

In [13]:
# Verify the sampled set is balanced: every subreddit should show the same
# number of rows
comments_sampled['subreddit'].value_counts()

houston    85089
nyc        85089
Name: subreddit, dtype: int64

In [14]:
# Save every data set to ../data as a bz2-compressed CSV; file names start with
# the batch prefix and end with today's date
today = dt.date.today().isoformat()

outputs = [
    (comments, f'../data/{prefix}_reddit-comments_all-{today}.csv.bz2'),
    (comments_sampled, f'../data/{prefix}_reddit-comments_sampled-{today}.csv.bz2'),
    (submissions, f'../data/{prefix}_reddit-submissions-{today}.csv.bz2'),
]
for frame, path in outputs:
    frame.to_csv(path, index=False, compression='bz2')