In [1]:
import time
import datetime as dt

import json

import configparser

import requests
from pmaw import PushshiftAPI

import pandas as pd

In [2]:
headers = {'User-Agent': 'test_project'}

# Pulling posts using pmaw
* this method actually works in filtering using before and after
* multithreaded library allows for faster pulling of data

[Source](https://www.reddit.com/r/pushshift/comments/y4raqy/keyerror_created_utc_in_pushshiftapibasepy/)

In [3]:
# Code pulled from reddit post link below
# https://www.reddit.com/r/pushshift/comments/y4raqy/keyerror_created_utc_in_pushshiftapibasepy/

start_time = time.time()
subreddits = ['sandiego']

submissions_limit = 100
# comments_limit = 100
# users_limit = 350

api = PushshiftAPI()

# before = int(dt.datetime(2022,2,1,0,0).timestamp())
# after = int(dt.datetime(2021,12,1,0,0).timestamp())
before = int(dt.datetime(2022, 11, 1, 0, 0).timestamp())
after = int(dt.datetime(2021, 12, 31, 23, 59).timestamp())

# collecting_users_comments = api.search_comments(
#                                     subreddit=subreddits,
#                                     # limit=users_limit,
#                                     before=before, after=after,
#                                     fields=['author', 'subreddit', 'created_utc']
#                                 )
collecting_users_submissions = api.search_submissions(
                                    subreddit="sandiego",
                                    # limit=users_limit,
                                    before=before, after=after,
                                )
# print(f'Retrieved {len(collecting_users_comments)} users from submissions')
print(f'Retrieved {len(collecting_users_submissions)} users from posts')
print(f'Elapsed time: {time.time() - start_time}')

# This will consume the request
# In order to use collecting_users_submissions requires rerunning the code
post_list = [post for post in collecting_users_submissions]

Retrieved 22005 users from posts
Elapsed time: 507.9436819553375


In [4]:
keys_interest = ['title', 'author', 'created_utc', 'selftext', 'full_link']
data_dict = {}
res_dict = {}
for i, post_dict in enumerate(post_list):
    res_dict[i] = post_dict
    data_dict[i] = {}
    for k in keys_interest:
        try:
            data_dict[i][k] = post_dict[k]
        except:
            data_dict[i][k] = None

# Write json files of original pull
with open('year_data.json', 'w') as f:
    json.dump(res_dict, f)
    
# Write out csv of data
data_df = pd.DataFrame(data_dict).T
data_df["created_utc"] = pd.to_datetime(data_df["created_utc"], utc=True, unit='s')
data_df.reset_index(inplace=True)
data_df.sort_values('created_utc')
data_df.to_csv('year_data.csv')
data_df

Unnamed: 0,index,title,author,created_utc,selftext,full_link
0,0,How to find a room under 1100??,logancali,2022-03-03 02:53:22+00:00,I am a San Diego native and have been looking ...,https://www.reddit.com/r/sandiego/comments/t5h...
1,1,Tax Return Intercepted by City?,abn104,2022-03-03 02:46:25+00:00,My FTB account says that $250+ of my Californi...,https://www.reddit.com/r/sandiego/comments/t5h...
2,2,San Diego Convention Center,4gecko44,2022-03-03 02:39:21+00:00,&amp;#x200B;\n\n[At the direction of San Diego...,https://www.reddit.com/r/sandiego/comments/t5h...
3,3,Pro-Putin musician Nina Kravitz is playing CRO...,Naturwissenschaftler,2022-03-03 02:32:47+00:00,"She made a small fortune during the pandemic, ...",https://www.reddit.com/r/sandiego/comments/t5g...
4,4,I got soaked today,zachtheeagle,2022-03-03 02:19:22+00:00,,https://www.reddit.com/r/sandiego/comments/t5g...
...,...,...,...,...,...,...
22000,22000,Anyone receiving extortion letters for copyrig...,rytecno1,2022-08-08 14:50:16+00:00,Creative friends and myself have been getting ...,https://www.reddit.com/r/sandiego/comments/wja...
22001,22001,PETA Files Federal Complaint Against SeaWorld ...,Numerous-Macaroon224,2022-08-08 14:21:55+00:00,,https://www.reddit.com/r/sandiego/comments/wja...
22002,22002,PETA Files Federal Complaint Against SeaWorld ...,Numerous-Macaroon224,2022-08-08 14:13:50+00:00,,https://www.reddit.com/r/sandiego/comments/wj9...
22003,22003,"For subscribers: 'Urban villages,' an aerial s...",Moleoaxaqueno,2022-08-08 14:10:26+00:00,,https://www.reddit.com/r/sandiego/comments/wj9...


# Other Methods of Interfacing with reddit posts

## WARNING: No Guarantee These Methods or Code Works
All code was used as exploration in getting information from reddit

# Pulling posts using Reddit's api

## Issues
* No way to pull posts from specific times

In [2]:
subreddit      = "SanDiego"
subreddit_filt = "new"

In [3]:
# read in config file
config = configparser.ConfigParser()
config.read('reddit_config.ini')

['reddit_config.ini']

In [4]:
# Request access to reddit
auth = requests.auth.HTTPBasicAuth(config["script"]["personal_use_script"], config["script"]["secret"])

# set up log in creds
data = {'grant_type': 'password',
        'username': config["auth"]["username"],
        'password': config["auth"]["password"]
       }

In [5]:
# send our request for an OAuth token
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

# convert response to JSON and pull access_token value
TOKEN = res.json()['access_token']

# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [6]:
res = requests.get(f"https://oauth.reddit.com/r/{subreddit}/{subreddit_filt}",
                   headers=headers, params={'limit':'200'})

In [7]:
reddit_res = res.json()

In [8]:
keys_interest = ['title', 'created_utc', 'selftext', 'url', 'link_flair_text']
data_dict = {}
for i, d in enumerate(reddit_res['data']['children']):
    data_dict[i] = {k: d['data'][k] for k in keys_interest}
data_df = pd.DataFrame(data_dict).T
data_df

Unnamed: 0,title,created_utc,selftext,url,link_flair_text
0,San Diego Zoo!,1667595536.0,I apologize I assume you guys get a lot of the...,https://www.reddit.com/r/sandiego/comments/yma...,
1,Mexico Soccer Jersey,1667593178.0,where can i found some in sd?,https://www.reddit.com/r/sandiego/comments/ym9...,
2,Stratotanker Currently Circling San Diego,1667592914.0,Anyone else see/hear the Stratotanker currentl...,https://www.reddit.com/r/sandiego/comments/ym9...,
3,Point Loma Tide Pools,1667592754.0,,https://i.redd.it/0nyaaektrzx91.jpg,Photo
4,Car accident on intersection Texas St and Cami...,1667591281.0,,https://i.redd.it/snnbsnrhnzx91.jpg,Photo
...,...,...,...,...,...
95,How to join pickleball team in SD?,1667369435.0,Any pickleball players have insight on how to ...,https://www.reddit.com/r/sandiego/comments/yjx...,
96,Need help from you fine folks again…,1667368811.0,"Except this time instead of “open late,” are t...",https://www.reddit.com/r/sandiego/comments/yjx...,
97,Tequila samples,1667367930.0,Does anyone know where I can find a liquor sto...,https://www.reddit.com/r/sandiego/comments/yjw...,
98,My 16yo’s car was stolen near SDSU. Cops don’t...,1667366199.0,,https://www.reddit.com/r/sandiego/comments/yjw...,


In [9]:
data_df["created_utc"] = pd.to_datetime(data_df["created_utc"], utc=True, unit='s')
data_df

Unnamed: 0,title,created_utc,selftext,url,link_flair_text
0,San Diego Zoo!,2022-11-04 20:58:56+00:00,I apologize I assume you guys get a lot of the...,https://www.reddit.com/r/sandiego/comments/yma...,
1,Mexico Soccer Jersey,2022-11-04 20:19:38+00:00,where can i found some in sd?,https://www.reddit.com/r/sandiego/comments/ym9...,
2,Stratotanker Currently Circling San Diego,2022-11-04 20:15:14+00:00,Anyone else see/hear the Stratotanker currentl...,https://www.reddit.com/r/sandiego/comments/ym9...,
3,Point Loma Tide Pools,2022-11-04 20:12:34+00:00,,https://i.redd.it/0nyaaektrzx91.jpg,Photo
4,Car accident on intersection Texas St and Cami...,2022-11-04 19:48:01+00:00,,https://i.redd.it/snnbsnrhnzx91.jpg,Photo
...,...,...,...,...,...
95,How to join pickleball team in SD?,2022-11-02 06:10:35+00:00,Any pickleball players have insight on how to ...,https://www.reddit.com/r/sandiego/comments/yjx...,
96,Need help from you fine folks again…,2022-11-02 06:00:11+00:00,"Except this time instead of “open late,” are t...",https://www.reddit.com/r/sandiego/comments/yjx...,
97,Tequila samples,2022-11-02 05:45:30+00:00,Does anyone know where I can find a liquor sto...,https://www.reddit.com/r/sandiego/comments/yjw...,
98,My 16yo’s car was stolen near SDSU. Cops don’t...,2022-11-02 05:16:39+00:00,,https://www.reddit.com/r/sandiego/comments/yjw...,


In [10]:
reddit_res['data']['children'][0]['data'].keys()

dict_keys(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'upvote_ratio', 'author_flair_background_color', 'ups', 'total_awards_received', 'media_embed', 'thumbnail_width', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'post_hint', 'content_categories', 'is_self', 'subreddit_type', 'created', 'link_flair_type', 'wls', 'removed_by_category', 'banned_by', 'author_flair_type', 'domain', 'allow_live_comments', 'selftext_html',

In [11]:
reddit_res['data'].keys()

dict_keys(['after', 'dist', 'modhash', 'geo_filter', 'children', 'before'])

In [12]:
data_df.to_csv("reddit_cop_example.csv", index=False)

In [13]:
with open('reddit_cop_data.json', 'w') as f:
    json.dump(reddit_res['data'], f)

# Pulling using Push Shift

## Issues
* before and after filter doesn't seem to work to filter posts
    * [Click here for example](https://api.pushshift.io/reddit/submission/search?limit=1000&?after=1667276000&before=1667286000&sort_type=score&sort=desc&subreddit=SanDiego)

In [9]:
start = dt.datetime(2022, 11, 1, 0, 0)
before = int(start.timestamp())
after = before - 10000

# after = datetime(2021, 12, 31, 23, 59)
# after = int(start.timestamp())

end = dt.datetime(2022, 9, 1, 23, 59) # testing purposes
final_after = int(end.timestamp())

print((before, after))

(1667286000, 1667276000)


True

In [10]:
# Putting together and testing various urls

# url = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&before={}&after={}"
# url = "https://api.pushshift.io/reddit/submission/search/?after={}&before={}&sort_type=score&sort=desc&subreddit={}"
url = "https://api.pushshift.io/reddit/submission/search?limit=1000&?after={}&before={}&sort_type=score&sort=desc&subreddit={}"
# url = "https://api.pushshift.io/reddit/submission/search?limit=25&?before={}&sort_type=score&sort=desc&subreddit={}"

# url = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&before={}&after={}"

In [11]:
# Get out example url
url.format(after, before, subreddit)

'https://api.pushshift.io/reddit/submission/search?limit=1000&?after=1667276000&before=1667286000&sort_type=score&sort=desc&subreddit=SanDiego'

In [13]:
res_df = pd.DataFrame()
i = 0
print(final_after < before)
while final_after < before:
    time.sleep(1)
    # new_url = url.format(subreddit, before, after)
    new_url = url.format(after, before, subreddit)
    print(new_url)
    # new_url = url.format(before, subreddit)
    res = requests.get(new_url, headers=headers)
    print(res)
    
    reddit_res = res.json()
    keys_interest = ['title', 'author', 'created_utc', 'selftext', 'full_link']
    data_dict = {}
    for i, d in enumerate(reddit_res['data']):
        data_dict[i] = {}
        for k in keys_interest:
            try:
                data_dict[i][k] = d[k]
            except:
                data_dict[i][k] = None
        # data_dict[i] = {k: d[k] for k in keys_interest}
    data_df = pd.DataFrame(data_dict).T
    # break
    data_df["created_utc"] = pd.to_datetime(data_df["created_utc"], utc=True, unit='s')
    res_df = pd.concat([res_df, data_df])
    res_df.reset_index(inplace=True)
    res_df.sort_values('created_utc')
    
    before = data_df.iloc[-1].created_utc.to_pydatetime()
    before = int(before.timestamp())
    after = before - 100000
    print(before)
    i = i + 1
    if i > 3:
        break
res_df.to_csv("test_output.csv", index=False)
res_df

True
1667286000
0
https://api.pushshift.io/reddit/submission/search?limit=1000&?after=1667276000&before=1667286000&sort_type=score&sort=desc&subreddit=SanDiego
<Response [200]>
1561748408
