# Project 3 - Reddit API and Classification

## Scraping EatCheapAndHealthy subreddit

Using Reddit's API, posts were collected from EatCheapAndHealthy subreddit.

In [1]:
import requests
import time
import pandas as pd

In [3]:
url = 'https://www.reddit.com/r/EatCheapAndHealthy.json'

In [4]:
# Creating custom user agent.
headers = {'User-agent': 'jason_bourne_0023'}

In [5]:
res = requests.get(url, headers=headers)

In [6]:
res.status_code

200

In [7]:
healthy = res.json()

In [8]:
sorted(healthy.keys())

['data', 'kind']

In [9]:
sorted(healthy['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [12]:
len(healthy['data']['children'])

26

In [13]:
# Name of the last post.
healthy['data']['after']

't3_cf6e6k'

In [15]:
[post['data']['name'] for post in  healthy['data']['children']]

['t3_8nmhix',
 't3_cfjthu',
 't3_cfv9y1',
 't3_cfs3gb',
 't3_cf7sa9',
 't3_cfwju4',
 't3_cfs7hh',
 't3_cfunv0',
 't3_cfrbh1',
 't3_cfowef',
 't3_cfpes4',
 't3_cfpn1q',
 't3_cfo68m',
 't3_cfreh4',
 't3_cfnrzq',
 't3_cfok1o',
 't3_cfoeh1',
 't3_cfocjj',
 't3_cfo328',
 't3_cfpl1z',
 't3_cfpc96',
 't3_cfmzu6',
 't3_cfgsf9',
 't3_cfjse4',
 't3_cfo5du',
 't3_cf6e6k']

In [16]:
param = {'after': 't3_cf6e6k'}

In [17]:
requests.get(url, params=param, headers=headers)

<Response [200]>

In [18]:
posts = []
after = None
for i in range(100):
    print(i)
    if after == None:
        params = {}
    else:
        params = {'after': after}
    url = 'https://www.reddit.com/r/EatCheapAndHealthy.json'
    res = requests.get(url, params=params, headers=headers)
    if res.status_code == 200:
        healthy = res.json()
        posts.extend(healthy['data']['children'])
        after = healthy['data']['after']
    else:
        print('Status code: {}'.format(res.status_code))
        break
    # allow for a break in between requests.
    time.sleep(3)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [19]:
len(set([p['data']['name'] for p in posts]))

756

In [23]:
posts[0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'EatCheapAndHealthy',
  'selftext': "For example:\n\n1. No fridge, microwave only: [SEARCH RESULTS](https://www.reddit.com/r/EatCheapAndHealthy/search?q=No+fridge%2C+microwave+only&amp;restrict_sr=on&amp;sort=relevance&amp;t=all)\n\n2. Student, need help with recipes:  [SEARCH RESULTS](https://www.reddit.com/r/EatCheapAndHealthy/search?q=Student%2C+need+help+with+recipes&amp;restrict_sr=on&amp;sort=relevance&amp;t=all)\n\n\n\n3. no oven, traveling : [SEARCH RESULTS](https://www.reddit.com/r/EatCheapAndHealthy/search?q=no+oven&amp;restrict_sr=on&amp;sort=relevance&amp;t=all)\n\nThese are three examples. Just keep entering keywords until you get a match for what you need. Please do this so we don't have to keep removing repeat links. Our database is quite large enough as is.",
  'author_fullname': 't2_4itpf',
  'saved': False,
  'mod_reason_title': None,
  'gilded': 0,
  'clicked': False,
  'title': '[MOD POST] Before you p

In [24]:
title = []
text = []
subreddit = []

for post in posts:
    title.append(post['data']['title'])
    text.append(post['data']['selftext'])
    subreddit.append(post['data']['subreddit'])

In [27]:
df = pd.DataFrame([title, text, subreddit]).T
df.columns = ['title', 'text', 'subreddit']
df.head()

Unnamed: 0,title,text,subreddit
0,"[MOD POST] Before you post, asking questions f...","For example:\n\n1. No fridge, microwave only: ...",EatCheapAndHealthy
1,Cucumber Salad,,EatCheapAndHealthy
2,Kala Chana Cheap and Very Healthy Option,Kala chana is also known as black chickpeas or...,EatCheapAndHealthy
3,You should try this snack,So heres what you need to do\n\nCut up some ca...,EatCheapAndHealthy
4,Watermelon and Whole Fruit Popsicle,,EatCheapAndHealthy


In [28]:
df.shape

(2444, 3)

In [29]:
# drop duplicate posts.
df = df.drop_duplicates()

In [30]:
df.shape

(750, 3)

In [31]:
df.to_csv('./data/healthy.csv', index=False)