# ETL
* Extract, transform, load
* Get data from multiple sources (DB, files, API, images)
* Transform into usable format
* Load data into storable form (DB, file)

In [30]:
import pandas as pd 
from requests import get, HTTPError, ConnectionError
import json
import datetime

In [64]:
reddit_endpoint = 'https://www.reddit.com/r/machinelearning/.json'
json_data = None
try: 
    req = get(reddit_endpoint, headers = {"User-agent": "max-etl-pipeline"})
    json_data = req.json()
except (HTTPError, ConnectionError) as err:
    print(err)

with open("data.json", "w") as outfile:
    json_str = json.dump(json_data, outfile, indent = 4)

with open("data.json", 'r') as jf: 
    data = json.load(jf)

post_data = data["data"]["children"]
df_rows = []

for post in post_data:
    pdata =  post['data']
    post_title =pdata["title"]
    post_text = pdata["selftext"]
    post_author = pdata["author"]
    post_upvote_ratio = pdata["upvote_ratio"]
    post_score = pdata["score"]
    post_flair = pdata["link_flair_text"]
    post_whitelist = pdata["parent_whitelist_status"]
    post_num_comments = pdata["num_comments"]
    post_permalink = pdata["permalink"]
    post_date = datetime.datetime.fromtimestamp(pdata["created_utc"])      #FIX

    #add each data item to appropriate list
    datalst = [post_title, post_permalink, post_text, post_author, post_upvote_ratio, post_score, post_num_comments, post_flair, post_whitelist,post_date]
    df_rows.append(datalst)

colnames = ["title", "permalink" , "text", "author", "upvote_ratio", "post_score", "number_comments", "flair", "whitelist_status", "date"]

df = pd.DataFrame(df_rows, columns = colnames)


df.head()

# df2 = pd.DataFrame(post_data)
# df2.head()


    

Unnamed: 0,title,permalink,text,author,upvote_ratio,post_score,number_comments,flair,whitelist_status,date
0,[D] Simple Questions Thread,/r/MachineLearning/comments/18kkdbb/d_simple_q...,Please post your questions here instead of cre...,AutoModerator,0.75,4,22,Discussion,all_ads,2023-12-17 10:00:19
1,"[D] Deep dive into the MMLU (""Are you smarter ...",/r/MachineLearning/comments/18ntia7/d_deep_div...,After all the hubbub around the MMLU (for exam...,brokensegue,1.0,13,4,Discussion,all_ads,2023-12-21 12:21:28
2,"[P] the Decimator, or how to plot a lot of points",/r/MachineLearning/comments/18nq5p6/p_the_deci...,The decimator is a function that removes point...,quicklyalienated76,0.83,15,1,Project,all_ads,2023-12-21 09:54:54
3,[P] I built an open SotA image tagging model t...,/r/MachineLearning/comments/18nb15l/p_i_built_...,"I'm a hobbyist ML researcher and finally, afte...",fpgaminer,0.95,152,52,Project,all_ads,2023-12-20 19:34:39
4,Meta AI Residency Interview Question [D],/r/MachineLearning/comments/18nio9k/meta_ai_re...,Was curious about this coding question that I ...,Immediate-Tailor-275,0.81,26,23,Discussion,all_ads,2023-12-21 02:58:28


* Making a DF of all of the data columns 

In [82]:
full_df_rows = []
for post in post_data:
    pdata = post['data']
    datalst = []
    for key in pdata.keys():
        datalst.append(pdata[key])
    full_df_rows.append(datalst)

#why is it making 109 rows when it says each data row is 106 entries long???

df2 = pd.DataFrame(full_df_rows)

df2.head()
    

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99,100,101,102,103,104,105,106,107,108
0,,MachineLearning,Please post your questions here instead of cre...,t2_6l4z3,False,,0,False,[D] Simple Questions Thread,[],...,True,https://www.reddit.com/r/MachineLearning/comme...,2842726,1702828819.0,0,,False,,,
1,,MachineLearning,After all the hubbub around the MMLU (for exam...,t2_pm8ge,False,,0,False,"[D] Deep dive into the MMLU (""Are you smarter ...",[],...,False,https://www.reddit.com/r/MachineLearning/comme...,2842726,1703182888.0,0,,False,,,
2,,MachineLearning,The decimator is a function that removes point...,t2_tfe7ylgn,False,,0,False,"[P] the Decimator, or how to plot a lot of points",[],...,False,https://www.reddit.com/r/MachineLearning/comme...,2842726,1703174094.0,0,,False,,,
3,,MachineLearning,"I'm a hobbyist ML researcher and finally, afte...",t2_bkfa9,False,,0,False,[P] I built an open SotA image tagging model t...,[],...,/r/MachineLearning/comments/18nb15l/p_i_built_...,all_ads,False,https://www.reddit.com/r/MachineLearning/comme...,2842726,1703122000.0,2,,False,
4,,MachineLearning,Was curious about this coding question that I ...,t2_d78s0gv9,False,,0,False,Meta AI Residency Interview Question [D],[],...,False,https://www.reddit.com/r/MachineLearning/comme...,2842726,1703149108.0,0,,False,,,


* Data cleaning