# Spiketrap Homework

Build a strategy to download and store all reddit posts and comments (including upvotes and downvotes) for a given subreddit (eg reddit.com/r/sanfrancisco).

Write down an executable script in any language to run your strategy.

Storage of your choice among Redis, MongoDB, or Mysql. Up to you choose which one you think fits best and/or you are more familiar with.

Tutorial from: https://towardsdatascience.com/how-to-use-the-reddit-api-in-python-5e05ddfd1e5c

In [85]:
with open('api_key.txt', 'r') as key_file:
    CLIENT_ID, SECRET_KEY = key_file.read().strip('\n').split('\n')

In [17]:
import requests

In [18]:
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)

In [81]:
with open('secret.txt', 'r') as pw_file:
    user, pw = pw_file.read().strip('\n').split('\n')

In [82]:
pw

'crash99dsa33'

In [83]:
user

'Alepv'

In [94]:
user_data = {
    'grant_type': 'password',
    'username': user,
    'password': pw
}

In [95]:
headers = {'User-Agent': 'HwAPI/0.0.1'}

 ### Important: This access Token will expire after 2 hours (or 1?), a new one has to be requested (permanent?) https://github.com/reddit-archive/reddit/wiki/OAuth2

In [96]:
res = requests.post('https://www.reddit.com/api/v1/access_token', 
                   auth=auth, data=user_data, headers=headers)

In [97]:
res.json()

{'access_token': '187569803486-pSfTdctngpij-xco_pHIOBErpoCYcg',
 'token_type': 'bearer',
 'expires_in': 3600,
 'scope': '*'}

In [98]:
TOKEN = res.json()['access_token']

In [99]:
headers['Authorization'] = f'bearer {TOKEN}'

### Important: The API has a limit of requests per minute, monitor the usage. https://github.com/reddit-archive/reddit/wiki/API

In [100]:
res = requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

In [101]:
res.json()

{'is_employee': False,
 'seen_layout_switch': True,
 'has_visited_new_profile': False,
 'pref_no_profanity': True,
 'has_external_account': False,
 'pref_geopopular': '',
 'seen_redesign_modal': True,
 'pref_show_trending': True,
 'subreddit': {'default_set': True,
  'user_is_contributor': False,
  'banner_img': 'https://styles.redditmedia.com/t5_pra4k/styles/profileBanner_nxyt86yz98031.jpg?width=1280&amp;height=384&amp;crop=1280:384,smart&amp;s=ccbe7c186fd7e0e09acca5287daa3a4d8abd0d4f',
  'restrict_posting': True,
  'user_is_banned': False,
  'free_form_reports': True,
  'community_icon': None,
  'show_media': True,
  'icon_color': '',
  'user_is_muted': False,
  'display_name': 'u_Alepv',
  'header_img': None,
  'title': 'Alex',
  'coins': 0,
  'previous_names': [],
  'over_18': False,
  'icon_size': [256, 256],
  'primary_color': '',
  'icon_img': 'https://styles.redditmedia.com/t5_pra4k/styles/profileIcon_676bsq1r18031.jpeg?width=256&amp;height=256&amp;crop=256:256,smart&amp;s=43d1

In [102]:
sf_hot = requests.get('https://oauth.reddit.com/r/haxball/hot', headers=headers)

In [103]:
sf_hot.json()['data']

{'after': 't3_kr5pd4',
 'dist': 25,
 'modhash': None,
 'geo_filter': None,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'haxball',
    'selftext': '',
    'author_fullname': 't2_1djpneug',
    'saved': False,
    'mod_reason_title': None,
    'gilded': 0,
    'clicked': False,
    'title': 'i look at speedtest it says 11 ping 30mb but too much lag in haxball why is it so',
    'link_flair_richtext': [],
    'subreddit_name_prefixed': 'r/haxball',
    'hidden': False,
    'pwls': 6,
    'link_flair_css_class': None,
    'downs': 0,
    'thumbnail_height': None,
    'top_awarded_type': None,
    'hide_score': False,
    'name': 't3_otw4ql',
    'quarantine': False,
    'link_flair_text_color': 'dark',
    'upvote_ratio': 1.0,
    'author_flair_background_color': None,
    'subreddit_type': 'public',
    'ups': 1,
    'total_awards_received': 0,
    'media_embed': {},
    'thumbnail_width': None,
    'author_flair_template_id': None,
    'is_original_

In [119]:
import pandas as pd
from datetime import datetime
from time import sleep

In [124]:
data = pd.DataFrame()  # initialize dataframe
params = {'limit': 100}

In [125]:
# we use this function to convert responses to dataframes
def df_from_response(res):
    # initialize temp dataframe for batch of data in response
    df = pd.DataFrame()

    # loop through each post pulled from res and append to df
    for post in res.json()['data']['children']:
        df = df.append({
            'subreddit': post['data']['subreddit'],
            'title': post['data']['title'],
            'selftext': post['data']['selftext'],
            'upvote_ratio': post['data']['upvote_ratio'],
            'ups': post['data']['ups'],
            'downs': post['data']['downs'],
            'score': post['data']['score'],
            'link_flair_css_class': post['data']['link_flair_css_class'],
            'created_utc': datetime.fromtimestamp(post['data']['created_utc']).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'id': post['data']['id'],
            'kind': post['kind']
        }, ignore_index=True)

    return df

In [126]:
# loop through 10 times (returning 1K posts)
for i in range(3):
    # make request
    res = requests.get("https://oauth.reddit.com/r/haxball/new",
                       headers=headers,
                       params=params)

    # get dataframe from response
    new_df = df_from_response(res)
    # take the final row (oldest entry)
    row = new_df.iloc[len(new_df)-1]
    # create fullname
    fullname = row['kind'] + '_' + row['id']
    # add/update fullname in params
    params['after'] = fullname
    
    # append new_df to data
    data = data.append(new_df, ignore_index=True)
    
    sleep(1)

In [127]:
data

Unnamed: 0,created_utc,downs,id,kind,link_flair_css_class,score,selftext,subreddit,title,ups,upvote_ratio
0,2021-07-29T14:02:32Z,0.0,otw4ql,t3,,1.0,,haxball,i look at speedtest it says 11 ping 30mb but t...,1.0,1.00
1,2021-07-27T03:30:22Z,0.0,osd29g,t3,,3.0,Hi folks! I'm working on a project to run HaxB...,haxball,Framework to run HaxBall in a true headless en...,3.0,1.00
2,2021-07-16T02:21:13Z,0.0,ol5i3e,t3,,0.0,can I,haxball,Can I be a haxball superstar using macro,0.0,0.33
3,2021-07-06T22:22:21Z,0.0,of39bx,t3,,9.0,"Do you want haxball to be fixed and optimized,...",haxball,Petition to make a standalone Haxball App,9.0,0.91
4,2021-06-15T21:41:25Z,0.0,o0mrgw,t3,,5.0,5v5 or 7v7\n\nI would like to play organized s...,haxball,Anyone wanna play haxball soccer,5.0,1.00
...,...,...,...,...,...,...,...,...,...,...,...
295,2015-05-25T17:07:30Z,0.0,377kf7,t3,,6.0,"We could try reviving this and make it more, a...",haxball,Is this entire subreddit dead?,6.0,0.76
296,2015-05-07T18:56:36Z,0.0,356yzx,t3,,4.0,"Hello,\nWhat do you think of the idea to split...",haxball,Splitting direction keys,4.0,1.00
297,2015-05-06T00:36:42Z,0.0,34zsob,t3,,7.0,,haxball,New to Haxball what do i need to know,7.0,0.89
298,2015-04-07T19:30:00Z,0.0,31rqsi,t3,,2.0,,haxball,We need login system to avoid fakers,2.0,0.75
