# Project 3: Web APIs and NLP <br>

#### Brandie Hatch

## Preparation and Get Data

In [6]:
# imports
import pandas as pd

pd.options.display.max_columns =999

import requests
from datetime import datetime

In [7]:
CLIENT_ID = '5aTMVLqpXQvo6vpgNv171w'
SECRET_KEY = 'FAjO19Zyx5W4N9llEeklP8apwz20Zw'

In [11]:
with open('../assets/pw.txt', 'r') as f:
    pw = f.read()

In [12]:
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)

data = {
    'grant_type': 'password',
    'username': 'BrandieHatch',
    'password': pw
}

headers = {'user-agent': 'praw_BH/0.0.1', 'accept': '*/*'}

res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

token = res.json()['access_token']


# add authorization to our headers dictionary
headers = {**headers, **{'Authorization': f'bearer {token}'}}

# while the token is valid (~2 hours) we just add headers=headers to our requests
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [13]:
def get_posts(subreddit):
    
    url = f"https://oauth.reddit.com/r/{subreddit}"

    params = {
        'subreddit': subreddit,
        'size': 100
    }

    res = requests.get(url, headers=headers, params=params)

    return res.json()['data']['children']


In [14]:
# function to convert the responses to a dataframe

def res_to_df(res):
    
    df = []

    for submission in res:
        df.append({
            'subreddit': submission['data']['subreddit'],
            'id': submission['data']['id'],
            'title': submission['data']['title'],
            'selftext': submission['data']['selftext'],
            'author': submission['data']['author'],
            'name': submission['data']['name'],
            'ups': submission['data']['ups'], 
            'downs': submission['data']['downs'],
            'score': submission['data']['score'],
            'num_comments': submission['data']['num_comments'], 
            'created_utc': datetime.fromtimestamp(submission['data']['created_utc']).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'over_18': submission['data']['over_18']
        })
    # print(df)
    return df

In [26]:
df_dogtraining = pd.DataFrame()
params = {'limit': 200}
url = f"https://oauth.reddit.com/r/dogtraining/new"

# loop through to get multiple requests
for i in range(200):
    # make request
    res = requests.get(url,
                       headers=headers,
                       params=params)

    # get dataframe from responses
    new_df = pd.DataFrame(res_to_df(get_posts('dogtraining')))
    # take the final row (which is the oldest entry)
    row = new_df.iloc[len(new_df)-1]
    # create fullname
    fullname = '_' + row['id']
    # add fullname in params
    params['after'] = fullname
    
    # append new_df to data
    df_dogtraining = df_dogtraining.append(new_df, ignore_index=False)
    
    import warnings
    warnings.simplefilter("ignore")

In [27]:
df_dogtraining.shape

(5400, 12)

In [28]:
df_dogtraining.head()

Unnamed: 0,subreddit,id,title,selftext,author,name,ups,downs,score,num_comments,created_utc,over_18
0,Dogtraining,uijir1,Trick of the Month - May 2022 - Crawl Backwards,Welcome to the Trick of the Month!\n\nThis mon...,moo6,t3_uijir1,8,0,8,4,2022-05-04T16:25:25Z,False
1,Dogtraining,ujxbsz,Announcement - Puppy Enrichment AMA With Allie...,,Cursethewind,t3_ujxbsz,7,0,7,2,2022-05-06T14:05:34Z,False
2,Dogtraining,up52vw,How do I get a cafe/brewery dog?,I am sitting at a brewery right now and all th...,slothsandwhich,t3_up52vw,311,0,311,88,2022-05-13T17:11:13Z,False
3,Dogtraining,upd3e8,"Hi, does anybody know the company that makes t...",,Fluffy_Overlord_1995,t3_upd3e8,59,0,59,6,2022-05-14T01:47:21Z,False
4,Dogtraining,upgv84,Warning: Gross,I noticed my dog trying to eat the poop of oth...,oxabexo,t3_upgv84,19,0,19,40,2022-05-14T06:05:24Z,False


In [29]:
df_dogtraining.to_csv('../data/dogtraining' + '.csv', header=True, index=False)

In [22]:
df_dogcare = pd.DataFrame()
params = {'limit': 200}
url = f"https://oauth.reddit.com/r/dogtcare/new"

# loop through to get multiple requests
for i in range(350):
    # make request
    res = requests.get(url,
                       headers=headers,
                       params=params)

    # get dataframe from response
    new_df = pd.DataFrame(res_to_df(get_posts('dogcare')))
    # take the final row (which is the oldest entry)
    row = new_df.iloc[len(new_df)-1]
    # create fullname
    fullname = '_' + row['id']
    # add fullname in params
    params['after'] = fullname
    
    # add new_df to data
    df_dogcare = df_dogcare.append(new_df, ignore_index=False)
    
    import warnings
    warnings.simplefilter("ignore")

In [23]:
df_dogcare.shape

(8750, 12)

In [24]:
df_dogcare.head()

Unnamed: 0,subreddit,id,title,selftext,author,name,ups,downs,score,num_comments,created_utc,over_18
0,DogCare,upgugb,Massages for hip dysplasia?,"He's a 6 yo boxer/mastiff,135 lbs and in good ...",Flaky_Watch,t3_upgugb,5,0,5,1,2022-05-14T06:04:21Z,False
1,DogCare,uow11b,"My dog has this weird thing on the tail, can a...",,NivTheGever,t3_uow11b,24,0,24,15,2022-05-13T09:50:09Z,False
2,DogCare,upa4qd,Lab/Great Dane puppy leg shattered,So my dog jumped out of my truck and completel...,Boomstick825,t3_upa4qd,1,0,1,4,2022-05-13T22:14:14Z,False
3,DogCare,up7lse,Any ideas what this might be?,I came home from work a couple of days ago and...,ChunkyMonkey3499,t3_up7lse,0,0,0,1,2022-05-13T19:35:38Z,False
4,DogCare,uoyffw,Anyone know what this could be? 7 Yr old Irish...,,Disastrous_Bobcat402,t3_uoyffw,1,0,1,0,2022-05-13T11:44:38Z,False


In [25]:
df_dogcare.to_csv('../data/dogcare' + '.csv', header=True, index=False)

## Sources:

Initial authorization and PRAW tutorial: https://youtu.be/NRgfgtzIhBQ 

PRAW, Requests, and authorization code assistance to build functions:
https://towardsdatascience.com/how-to-use-the-reddit-api-in-python-5e05ddfd1e5c

Python Requests Library Guide: https://realpython.com/python-requests/ 