# Reddit API script

This script uses the **Reddit API** to pull posts from cocktail-related subreddits (currently r/cocktails).

It also provides some data cleaning steps and summary statistics for the data pulled.

In [1]:
# importing libraries
import os

import requests
import pandas as pd

In [34]:
# requesting a temporary OAuth token from Reddit
# Credentials are read from environment variables so real secrets do not have
# to be saved inside the notebook. The '<...>' placeholders are only fallbacks:
# 'CLIENT_ID' and 'SECRET_TOKEN' refer to personal API key parameters and can
# still be filled in by hand when the environment variables are not set.
app_id = os.environ.get('REDDIT_CLIENT_ID', '<CLIENT_ID>')
secret = os.environ.get('REDDIT_SECRET_TOKEN', '<SECRET_TOKEN>')

auth = requests.auth.HTTPBasicAuth(app_id, secret)

# reddit details
reddit_username = os.environ.get('REDDIT_USERNAME', '<USERNAME>')
reddit_password = os.environ.get('REDDIT_PASSWORD', '<PASSWORD>')

# form fields for the password grant
data = {'grant_type' : 'password',
        'username' : reddit_username,
        'password' : reddit_password}

# 'headers' is reused by the later cells, which add the bearer token to it
headers = {'User-Agent': 'MyBot/0.0.1'}

# sending request
# timeout: requests waits forever by default, which would hang the notebook
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers,
                    timeout=30)
# stop here on an HTTP error instead of carrying a failed response forward
res.raise_for_status()
print(res)


<Response [200]>


In [36]:
# response into json to access the token value
# If the token request was rejected, the payload may carry an error instead of
# a token; report that payload rather than failing with an opaque KeyError.
token_payload = res.json()
if 'access_token' not in token_payload:
    raise RuntimeError(
        'Reddit did not return an access token: {}'.format(token_payload))
token = token_payload['access_token']


# adding authorization token to headers dictionary
headers['Authorization'] = 'bearer {}'.format(token)

# checking access token works
# (timeout so an unresponsive server cannot hang the notebook)
requests.get('https://oauth.reddit.com/api/v1/me',
             headers=headers, timeout=30)

<Response [200]>

In [38]:
# api parameter setup
web_url = 'https://oauth.reddit.com/r/cocktails/hot'

# ask for up to 100 posts in a single listing request
params = {'limit' : 100}

# making an api request
# timeout: requests waits forever by default, which would hang the notebook
res = requests.get(web_url, headers=headers, params=params, timeout=30)
# fail early on an HTTP error instead of trying to decode an error page
res.raise_for_status()
res.json()

{'kind': 'Listing',
 'data': {'after': 't3_124b6th',
  'dist': 101,
  'modhash': None,
  'geo_filter': None,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'cocktails',
     'selftext': "**This month's ingredients: Radish &amp; Lemon**\n\n---\n\n**Next month's ingredients: Gin &amp; Egg** \n\n---\n\nHello mixologists and liquor enthusiasts. Welcome to the monthly original cocktail competition.\n\nFor those looking to participate, here are the rules and guidelines. Any violations of these rules will result in disqualification from this month's competition.\n\n1. You must use both of the listed ingredients, but you can use them in absolutely any way or form (e.g. a liqueur, infusion, syrup, ice, smoke, etc.) you want and in whatever quantities you want. You do not have to make ingredients from scratch. You may also use any other ingredients you want.\n\n2. Your entry must be an original cocktail. Alterations of established cocktails are permitted wi

In [57]:
# function to put the pulled data into a pandas dataframe

def jsonToDataframe(json):
    """Flatten a decoded Reddit listing into a pandas DataFrame.

    Parameters
    ----------
    json : dict
        Decoded listing payload. Posts are read from
        ``json['data']['children']``; each entry must have a ``'data'`` dict
        containing ``'name'``, ``'title'``, ``'selftext'``, ``'score'`` and
        ``'created_utc'``.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``name``, ``title``, ``selftext``,
        ``score`` and ``time`` (``time`` holds the unmodified ``created_utc``
        value). An empty listing gives an empty frame with the same columns.

    Raises
    ------
    KeyError
        If the payload or one of its posts lacks an expected key.
    """
    # output column name -> key inside each post's 'data' dict
    fields = {
        'name': 'name',
        'title': 'title',
        'selftext': 'selftext',
        'score': 'score',
        'time': 'created_utc',
    }

    # one record per post; building the frame once avoids the throwaway empty
    # DataFrame and the five parallel lists
    records = [
        {column: post['data'][key] for column, key in fields.items()}
        for post in json['data']['children']
    ]

    # explicit columns keep the schema and column order even with no posts
    return pd.DataFrame(records, columns=list(fields))

# decode the latest API response and flatten it into a dataframe
listing_payload = res.json()
reddit_df = jsonToDataframe(listing_payload)
reddit_df
        

Unnamed: 0,name,title,selftext,score,time
0,t3_11gthp5,Original Cocktail Competition - March 2023 - R...,**This month's ingredients: Radish &amp; Lemon...,19,1.677825e+09
1,t3_12686te,Chartreuse Subs Experiment,Tested chartreuse subs neat and in a last word,367,1.680142e+09
2,t3_1265ka3,I made a cocktail smoker out of a cigar box an...,,313,1.680135e+09
3,t3_1261fex,The Bloodyless Mary,I was unsurprisingly craving a Bloody Mary one...,306,1.680126e+09
4,t3_12653ko,Trinidad Sour,,42,1.680134e+09
...,...,...,...,...,...
96,t3_124e5dv,How would you go about balancing this cocktail?,* 2 parts vodka\n* 1 part pomegranate juice \n...,5,1.679981e+09
97,t3_124meaf,Help Tweak a Shitty Recipe,"Hi, I'm trying to improve a recipe on Crown Ro...",0,1.680004e+09
98,t3_123vty1,Yuzu Mint Margarita,,25,1.679942e+09
99,t3_1247ch9,Crime and Punish-Mint,,7,1.679965e+09


### Data Cleaning ###

In [63]:
# checking for duplicates
# DataFrame.duplicated() compares row contents. The previous check looked at
# the index labels (the default integer index has no repeated labels) and
# would have failed on row.index(), since a boolean has no such method.
duplicate_rows = reddit_df.index[reddit_df.duplicated()]
for row_number in duplicate_rows:
    print('Row Number: ', row_number, 'has a duplicate. Please remove!')

# removing unnecessary columns - name column gives no insight
# errors='ignore' avoids a KeyError if this cell is re-run after the column
# has already been dropped
reddit_df = reddit_df.drop(columns='name', errors='ignore')

# convert 'time' into readable datetime format
# ('time' holds the created_utc value as seconds, hence unit='s')
reddit_df['time'] = pd.to_datetime(reddit_df['time'], unit='s')

In [71]:
# check the column types after cleaning ('time' should now be datetime64)
reddit_df.dtypes


title               object
selftext            object
score                int64
time        datetime64[ns]
dtype: object

In [67]:
# display the cleaned dataframe
reddit_df

Unnamed: 0,title,selftext,score,time
0,Original Cocktail Competition - March 2023 - R...,**This month's ingredients: Radish &amp; Lemon...,19,2023-03-03 06:27:09
1,Chartreuse Subs Experiment,Tested chartreuse subs neat and in a last word,367,2023-03-30 02:06:06
2,I made a cocktail smoker out of a cigar box an...,,313,2023-03-30 00:13:47
3,The Bloodyless Mary,I was unsurprisingly craving a Bloody Mary one...,306,2023-03-29 21:34:08
4,Trinidad Sour,,42,2023-03-29 23:54:18
...,...,...,...,...
96,How would you go about balancing this cocktail?,* 2 parts vodka\n* 1 part pomegranate juice \n...,5,2023-03-28 05:18:28
97,Help Tweak a Shitty Recipe,"Hi, I'm trying to improve a recipe on Crown Ro...",0,2023-03-28 11:54:43
98,Yuzu Mint Margarita,,25,2023-03-27 18:31:01
99,Crime and Punish-Mint,,7,2023-03-28 00:57:08


### Summary Statistics ###

In [73]:
# number of datapoints - shape is (rows, columns)
reddit_df.shape

(101, 4)

In [74]:
# summary statistics of reddit post scores
# (count, mean, std, min, quartiles and max)
reddit_df['score'].describe()

count    101.000000
mean      51.980198
std      122.485018
min        0.000000
25%        1.000000
50%        7.000000
75%       25.000000
max      686.000000
Name: score, dtype: float64

In [77]:
# saving dataframe as a .csv to the data folder
# NOTE(review): the path is relative to the notebook's working directory, and
# to_csv writes the index as an unnamed first column by default - confirm that
# downstream readers expect it (otherwise pass index=False)
reddit_df.to_csv('../data/reddit.csv')
