In [1]:
# ETL - Extract, Transform, Load
# Get data from multiple sources (Databases, files, API, etc.)
# Transform the data to be usable (weather example: "T" for rain means "trace", i.e. next to nothing).
# Load data - store / save our data, often to a database or a file.

# Your company wants you to get the relevant data from the Reddit endpoint below and ensure that the data is easily usable.
# Make a CSV file with all relevant data.

In [2]:
import json

import pandas as pd
from requests import ConnectionError, HTTPError, Timeout, get

In [3]:
# Extract: download the subreddit listing as JSON.
reddit_endpoint = 'https://www.reddit.com/r/machinelearning.json'
json_data = None  # stays None when the request or the JSON decoding fails
try:
    # The timeout keeps a stalled connection from blocking the kernel indefinitely.
    req = get(
        reddit_endpoint,
        headers={"User-agent": "max-etl-pipeline"},
        timeout=10,
    )
    # requests only raises HTTPError when asked to; without this call a 4xx/5xx
    # response body would be parsed and treated as if it were the listing.
    req.raise_for_status()
    json_data = req.json()
except (HTTPError, ConnectionError, Timeout) as err:
    print(err)
except ValueError as err:
    # req.json() raises a ValueError subclass when the body is not valid JSON.
    print(f"Response body was not valid JSON: {err}")

In [32]:
# Transform + Load: flatten the listing into a table and export the columns of interest.

# Fail fast with a clear message when the fetch cell produced no payload.
# Otherwise "null" is written to data.json and the code below dies with an
# opaque AttributeError on None.
if json_data is None:
    raise RuntimeError(
        "json_data is None - re-run the fetch cell and check the error it printed"
    )

# Keep a pretty-printed copy of the raw response for inspection.
with open("data.json", "w", encoding="utf-8") as outfile:
    json.dump(json_data, outfile, indent=4)

# Drill down: top-level 'data' -> list 'children' -> each child's own 'data' dict.
# .get() with defaults tolerates a missing level, whereas direct indexing
# (json_data["data"]["children"], child["data"]) would raise KeyError.
children = json_data.get('data', {}).get('children', [])
d = [child.get('data', {}) for child in children]
if not d:
    # Without any records the frame has no columns and the column lookups below
    # would fail with a bare KeyError.
    raise ValueError("Response contained no posts under data -> children")
posts = pd.DataFrame(d)

# 'created_utc' is converted from seconds since the Unix epoch (unit='s').
posts["created_utc"] = pd.to_datetime(posts["created_utc"], unit='s')
# If extraneous spaces: e.g. posts["title"].str.strip()

# Harvesting: created_utc, title, ups, upvote_ratio, num_comments
posts_sorted = posts.sort_values(by='created_utc')
harvest = posts_sorted[['created_utc', 'title', 'ups', 'upvote_ratio', 'num_comments']]

harvest.to_csv('reddit_data.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
