# ETL
* Extract, transform, load
* Get data from multiple sources (DB, files, API, images)
* Transform into usable format
* Load data into storable form (DB, file)

In [30]:
import pandas as pd 
from requests import get, HTTPError, ConnectionError
import json
import datetime

In [62]:
reddit_endpoint = 'https://www.reddit.com/r/machinelearning/.json'
json_data = None
try: 
    req = get(reddit_endpoint, headers = {"User-agent": "max-etl-pipeline"})
    json_data = req.json()
except (HTTPError, ConnectionError) as err:
    print(err)

with open("data.json", "w") as outfile:
    json_str = json.dump(json_data, outfile, indent = 4)

with open("data.json", 'r') as jf: 
    data = json.load(jf)

post_data = data["data"]["children"]
df_rows = []

for post in post_data:
    pdata =  post['data']
    post_title =pdata["title"]
    post_text = pdata["selftext"]
    post_author = pdata["author"]
    post_upvote_ratio = pdata["upvote_ratio"]
    post_score = pdata["score"]
    post_flair = pdata["link_flair_text"]
    post_whitelist = pdata["parent_whitelist_status"]
    post_num_comments = pdata["num_comments"]
    post_permalink = pdata["permalink"]
    post_date = datetime.datetime.fromtimestamp(pdata["created_utc"])      #FIX

    #add each data item to appropriate list
    datalst = [post_title, post_permalink, post_text, post_author, post_upvote_ratio, post_score, post_num_comments, post_flair, post_whitelist,post_date]
    df_rows.append(datalst)

colnames = ["title", "permalink" , "text", "author", "upvote_ratio", "post_score", "number_comments", "flair", "whitelist_status", "date"]

df = pd.DataFrame(df_rows, columns = colnames)


df.to_csv('ml_reddit_data.csv')

df.head()





#title, author fullname, clicked

    

Unnamed: 0,title,permalink,text,author,upvote_ratio,post_score,number_comments,flair,whitelist_status,date
0,[D] Simple Questions Thread,/r/MachineLearning/comments/18kkdbb/d_simple_q...,Please post your questions here instead of cre...,AutoModerator,0.75,4,22,Discussion,all_ads,2023-12-17 10:00:19
1,"[P] the Decimator, or how to plot a lot of points",/r/MachineLearning/comments/18nq5p6/p_the_deci...,The decimator is a function that removes point...,quicklyalienated76,0.85,14,1,Project,all_ads,2023-12-21 09:54:54
2,"[D] Deep dive into the MMLU (""Are you smarter ...",/r/MachineLearning/comments/18ntia7/d_deep_div...,After all the hubbub around the MMLU (for exam...,brokensegue,1.0,7,4,Discussion,all_ads,2023-12-21 12:21:28
3,[P] I built an open SotA image tagging model t...,/r/MachineLearning/comments/18nb15l/p_i_built_...,"I'm a hobbyist ML researcher and finally, afte...",fpgaminer,0.95,154,48,Project,all_ads,2023-12-20 19:34:39
4,Meta AI Residency Interview Question [D],/r/MachineLearning/comments/18nio9k/meta_ai_re...,Was curious about this coding question that I ...,Immediate-Tailor-275,0.83,27,23,Discussion,all_ads,2023-12-21 02:58:28


In [8]:
print(json_data)

{'kind': 'Listing', 'data': {'after': 't3_18na690', 'dist': 27, 'modhash': '', 'geo_filter': None, 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'MachineLearning', 'selftext': 'Please post your questions here instead of creating a new thread. Encourage others who create new posts for questions to post here instead!\n\nThread will stay alive until next one so keep posting after the date in the title.\n\nThanks to everyone for answering questions in the previous thread!', 'author_fullname': 't2_6l4z3', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': '[D] Simple Questions Thread', 'link_flair_richtext': [], 'subreddit_name_prefixed': 'r/MachineLearning', 'hidden': False, 'pwls': 6, 'link_flair_css_class': 'one', 'downs': 0, 'thumbnail_height': None, 'top_awarded_type': None, 'hide_score': False, 'name': 't3_18kkdbb', 'quarantine': False, 'link_flair_text_color': None, 'upvote_ratio': 0.63, 'author_flair_background_color': None

In [None]:
# Your company wants you to get relevant data from that endpoint - ensure that data is easily useable.
# Make a CSV file with all relevant data.
