# Project 3: Web APIs and NLP <br>

#### Brandie Hatch

## Preparation and Get Data

In [1]:
# imports
import pandas as pd

pd.options.display.max_columns =999

import requests
import time
import re
import nltk
from datetime import datetime

In [2]:
def get_posts_batch(subreddit, before):
    
    url = 'https://api.pushshift.io/reddit/search/submission'
    
    params = {
        'subreddit': subreddit,
        'size': 100
    }
    if before is not None: 
        params['before'] = before
        
    res = requests.get(url, params)
    return res.json()

def posts_to_df(posts):
    data = posts['data']
   
    df = pd.DataFrame(data)
    return df[['subreddit', 'id', 'author', 'score', 'title', 'selftext', 'num_comments', 'created_utc']]

def get_posts(subreddit, num_posts):
    
    post_ids = set()
    
    before = None
    
    i = 0
    
    while len(post_ids) < num_posts:
        batch = get_posts_batch(subreddit, before)
        
        df = posts_to_df(batch)
        
        before = df['created_utc'].min()
        
        post_ids.update(df['id'])
        
        if i == 0: 
            header = True
            mode = 'w'
        else: 
            header = False 
            mode = 'a'
        df.to_csv(subreddit + '.csv', mode=mode, header=header, index=False)
        print(f'We have scraped {len(post_ids)} posts.')
        i += 1
        time.sleep(2)

In [5]:
get_posts_batch('cattraining', '2022-05-14T06:04:21Z')

JSONDecodeError: [Errno Expecting value] <html>
  <head>
    <title>Internal Server Error</title>
  </head>
  <body>
    <h1><p>Internal Server Error</p></h1>
    
  </body>
</html>
: 0

In [32]:
CLIENT_ID = '5aTMVLqpXQvo6vpgNv171w'
SECRET_KEY = 'FAjO19Zyx5W4N9llEeklP8apwz20Zw'

In [33]:
with open('../assets/pw.txt', 'r') as f:
    pw = f.read()

In [36]:
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)

data = {
    'grant_type': 'password',
    'username': 'BrandieHatch',
    'password': pw
}

headers = {'user-agent': 'praw_BH/0.0.1', 'accept': '*/*'}

res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

token = res.json()['access_token']


# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f'bearer {token}'}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [37]:
def get_posts(subreddit):
    
    url = f"https://oauth.reddit.com/r/{subreddit}"

    params = {
        'subreddit': subreddit,
        'size': 100
    }

    res = requests.get(url, headers=headers, params=params)

    return res.json()['data']['children']


In [38]:
# function to convert the responses to a dataframe

def res_to_df(res):
    
    df = []

    for submission in res:
        df.append({
            'subreddit': submission['data']['subreddit'],
            'id': submission['data']['id'],
            'title': submission['data']['title'],
            'selftext': submission['data']['selftext'],
            'author': submission['data']['author'],
            'name': submission['data']['name'],
            'ups': submission['data']['ups'], 
            'downs': submission['data']['downs'],
            'score': submission['data']['score'],
            'num_comments': submission['data']['num_comments'], 
            'created_utc': datetime.fromtimestamp(submission['data']['created_utc']).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'over_18': submission['data']['over_18']
        })
    # print(df)
    return df

In [39]:
df_dogtraining = pd.DataFrame()
params = {'limit': 200}
url = f"https://oauth.reddit.com/r/dogtraining/new"

# loop through to get multiple requests
for i in range(200):
    # make request
    res = requests.get(url,
                       headers=headers,
                       params=params)

    # get dataframe from responses
    new_df = pd.DataFrame(res_to_df(get_posts('dogtraining')))
    # take the final row (which is the oldest entry)
    row = new_df.iloc[len(new_df)-1]
    # create fullname
    fullname = '_' + row['id']
    # add fullname in params
    params['after'] = fullname
    
    # append new_df to data
    df_dogtraining = df_dogtraining.append(new_df, ignore_index=False)
    
    import warnings
    warnings.simplefilter("ignore")

  df_dogtraining = df_dogtraining.append(new_df, ignore_index=False)


In [40]:
df_dogtraining.shape

(5400, 12)

In [41]:
df_dogtraining.head()

Unnamed: 0,subreddit,id,title,selftext,author,name,ups,downs,score,num_comments,created_utc,over_18
0,Dogtraining,uijir1,Trick of the Month - May 2022 - Crawl Backwards,Welcome to the Trick of the Month!\n\nThis mon...,moo6,t3_uijir1,9,0,9,4,2022-05-04T16:25:25Z,False
1,Dogtraining,ujxbsz,Announcement - Puppy Enrichment AMA With Allie...,,Cursethewind,t3_ujxbsz,6,0,6,2,2022-05-06T14:05:34Z,False
2,Dogtraining,ur8mh2,Zeke getting so much better off leash. 1 yr ol...,,Interr0gate,t3_ur8mh2,501,0,501,24,2022-05-16T16:06:13Z,False
3,Dogtraining,urcw3p,Need help with crate training. Also is this cr...,,urrkah,t3_urcw3p,64,0,64,28,2022-05-16T19:58:18Z,False
4,Dogtraining,urma9k,People keep approaching my puppy,"So, background, we recently adopted a 4 month ...",Marv_Harry_TWB15,t3_urma9k,3,0,3,7,2022-05-17T06:19:00Z,False


In [42]:
df_dogtraining.to_csv('../data/dogtraining' + '.csv', header=True, index=False)

In [46]:
df_cattraining = pd.DataFrame()
params = {'limit': 200}
url = f"https://oauth.reddit.com/r/cattraining/new"

# loop through to get multiple requests
for i in range(200):
    # make request
    res = requests.get(url,
                       headers=headers,
                       params=params)

    # get dataframe from response
    new_df = pd.DataFrame(res_to_df(get_posts('cattraining')))
    # take the final row (which is the oldest entry)
    row = new_df.iloc[len(new_df)-1]
    # create fullname
    fullname = '_' + row['id']
    # add fullname in params
    params['after'] = fullname
    
    # add new_df to data
    df_cattraining = df_cattraining.append(new_df, ignore_index=False)
    
    import warnings
    warnings.simplefilter("ignore")

In [47]:
df_cattraining.shape

(5200, 12)

In [48]:
df_cattraining.head()

Unnamed: 0,subreddit,id,title,selftext,author,name,ups,downs,score,num_comments,created_utc,over_18
0,CatTraining,glg2qe,META: Sub Updated,"All,\n\nI've gone through and updated the Rule...",shrttle,t3_glg2qe,13,0,13,11,2020-05-17T07:14:51Z,False
1,CatTraining,ur6a1i,Cat Not Covering Waste,,jZeus__,t3_ur6a1i,34,0,34,12,2022-05-16T14:15:24Z,False
2,CatTraining,uqrytj,How can I train a stray cat to not try to rush...,Hey guys. So a little background; there are a ...,U_hav_2_call_me_drgn,t3_uqrytj,29,0,29,26,2022-05-16T02:33:27Z,False
3,CatTraining,uqxmkm,Cat wants to go outside and never shuts up meo...,Hey so we don’t let my cat outside and all he ...,thugmanthug,t3_uqxmkm,3,0,3,7,2022-05-16T07:46:38Z,False
4,CatTraining,uqum76,What age should I start clicker training my ki...,,Longjumping-Mine-145,t3_uqum76,5,0,5,3,2022-05-16T05:18:16Z,False


In [49]:
df_cattraining.to_csv('../data/cattraining' + '.csv', header=True, index=False)

## Sources:

Initial authorization and PRAW tutorial: https://youtu.be/NRgfgtzIhBQ 

PRAW, Requests, and authorization code assistance to build functions:
https://towardsdatascience.com/how-to-use-the-reddit-api-in-python-5e05ddfd1e5c

Python Requests Library Guide: https://realpython.com/python-requests/ 