## Data Acquisition

#### Imports

In [1]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
import praw

from credentials import API_KEY, API_SECRET, USER_AGENT

In [2]:
# PRAW client configured with the app credentials imported from credentials.py.
reddit = praw.Reddit(client_id = API_KEY, client_secret = API_SECRET, user_agent = USER_AGENT)

In [3]:
def combine_data(posts, label):
    """Collect selected fields from an iterable of Reddit submissions.

    Parameters
    ----------
    posts : iterable
        Objects exposing ``created_utc``, ``title``, ``selftext``,
        ``upvote_ratio`` and ``num_comments`` attributes. Iterated once.
    label : str
        Name of the listing; only used (upper-cased) in the printed summary.

    Returns
    -------
    list of tuple
        One ``(created_utc, title, selftext, upvote_ratio, num_comments)``
        tuple per post, in iteration order. Empty if ``posts`` is empty.
    """
    data = [
        (p.created_utc, p.title, p.selftext, p.upvote_ratio, p.num_comments)
        for p in posts
    ]
    # Summary line so the number of posts actually returned per listing is visible.
    print(f"{label.upper()} POSTS :: N = {len(data)}")
    return data

In [4]:
# Request up to 1000 posts from each of the four listings of r/explainlikeimfive.
sub1 = reddit.subreddit('explainlikeimfive')

posts_new, posts_hot, posts_top, posts_con = (
    listing(limit = 1000)
    for listing in (sub1.new, sub1.hot, sub1.top, sub1.controversial)
)

In [5]:
# Turn each listing into a list of row tuples; combine_data prints how many
# posts each listing actually yielded (this can be fewer than the requested limit).
data_new = combine_data(posts_new, 'new')
data_hot = combine_data(posts_hot, 'hot')
data_top = combine_data(posts_top, 'top')
data_con = combine_data(posts_con, 'controversial')

NEW POSTS :: N = 987
HOT POSTS :: N = 461
TOP POSTS :: N = 968
CONTROVERSIAL POSTS :: N = 989


In [6]:
# Stack the four listings into one frame. Column order follows the tuples built
# by combine_data: 'upvotes' holds upvote_ratio and 'comments' holds num_comments.
df1 = pd.DataFrame(
    [*data_new, *data_hot, *data_top, *data_con],
    columns = ['time', 'title', 'text', 'upvotes', 'comments'],
)

In [7]:
# Shape after removing exact duplicate rows; df1 itself is not modified here.
df1.drop_duplicates().shape

(3192, 5)

In [8]:
# Drop rows that are identical in every column (the same post can appear in
# several listings). Rebinding instead of inplace=True avoids mutating the
# frame object built earlier.
# NOTE(review): rows are compared on all five columns, so a post whose upvote
# ratio or comment count changed between listing fetches would survive as two
# rows -- consider subset=['time', 'title'] if that matters; confirm.
df1 = df1.drop_duplicates()

In [9]:
# Preview of the de-duplicated frame; the index keeps its pre-dedup labels (it is not reset).
df1

Unnamed: 0,time,title,text,upvotes,comments
0,1.687220e+09,ELI5: Sliver Fill (landfill),I opérate heavy equipment but I’m new to landf...,1.00,0
1,1.687220e+09,ELI5 What’s the difference between MLS and Zil...,Why do realtors give you access to MLS but mos...,1.00,0
2,1.687219e+09,ELI5: How does a videogame know you've done a ...,For example I've been playing GTA San Andreas ...,1.00,3
3,1.687218e+09,ELI5: Why did warfare around the Mediterranean...,"Obviously, not all warfare was heavy infantry ...",1.00,1
4,1.687218e+09,"ELI5: If we can't see 4D beings, what are the ...","The idea of 4th dimension exist , whether they...",0.33,4
...,...,...,...,...,...
3400,1.451167e+09,ELI5: How did the pyramids go missing for so l...,,0.50,13
3401,1.447076e+09,Eli5: What is the point in using turn signals ...,I see everybody turning with signals. I person...,0.50,20
3402,1.444424e+09,ELI5: Why are actors and musicians more famous...,,0.47,5
3403,1.443834e+09,ELI5:Why is [the shadow of this astronaut](htt...,"I'm not a conspiracy theorist, but I thought R...",0.50,8


In [10]:
# Timestamped snapshot so the acquisition date/time is recoverable from the file name.
elif_csv_path = Path("data") / f"ELIF_{datetime.now().strftime('%m-%d-%y_%H-%M')}.csv"
# to_csv does not create missing directories, so make sure data/ exists first.
elif_csv_path.parent.mkdir(parents=True, exist_ok=True)
df1.to_csv(elif_csv_path, index=False)

In [11]:
# Request up to 1000 posts from each of the four listings of r/AskScience.
sub2 = reddit.subreddit('AskScience')

posts_new2, posts_hot2, posts_top2, posts_con2 = (
    listing(limit = 1000)
    for listing in (sub2.new, sub2.hot, sub2.top, sub2.controversial)
)

In [12]:
# Turn each listing into a list of row tuples; combine_data prints how many
# posts each listing actually yielded (this can be fewer than the requested limit).
data_new2 = combine_data(posts_new2, 'new')
data_hot2 = combine_data(posts_hot2, 'hot')
data_top2 = combine_data(posts_top2, 'top')
data_con2 = combine_data(posts_con2, 'controversial')

NEW POSTS :: N = 758
HOT POSTS :: N = 995
TOP POSTS :: N = 998
CONTROVERSIAL POSTS :: N = 999


In [13]:
# Stack the four listings into one frame. Column order follows the tuples built
# by combine_data: 'upvotes' holds upvote_ratio and 'comments' holds num_comments.
df2 = pd.DataFrame(
    [*data_new2, *data_hot2, *data_top2, *data_con2],
    columns = ['time', 'title', 'text', 'upvotes', 'comments'],
)

In [14]:
# Shape after removing exact duplicate rows; df2 itself is not modified here.
df2.drop_duplicates().shape

(3337, 5)

In [15]:
# Drop rows that are identical in every column (the same post can appear in
# several listings). Rebinding instead of inplace=True avoids mutating the
# frame object built earlier.
# NOTE(review): rows are compared on all five columns, so a post whose upvote
# ratio or comment count changed between listing fetches would survive as two
# rows -- consider subset=['time', 'title'] if that matters; confirm.
df2 = df2.drop_duplicates()

In [16]:
# Preview of the de-duplicated frame; the index keeps its pre-dedup labels (it is not reset).
df2

Unnamed: 0,time,title,text,upvotes,comments
0,1.687161e+09,"When making an epigenetic change to DNA, how d...",So I know that noncoding RNA is able to mediat...,0.78,9
1,1.687152e+09,Why are there no longer Yellow Fever outbreaks...,,0.71,13
2,1.687155e+09,Do astronauts loose hair cause problems on the...,Hair comes off everybody. In space of course w...,0.85,203
3,1.687139e+09,How fast does the major axis of the Earth's or...,The Earth orbits the sun in an ellipse with lo...,0.71,17
4,1.687104e+09,We're any dinosaurs frugivores (diet consistin...,,0.72,57
...,...,...,...,...,...
3739,1.406395e+09,"At the center of the earth, wouldn't there exi...",Maybe this should be posted in shower thoughts...,0.47,6
3740,1.401026e+09,[physics] How does an eigenstate related to an...,I'm getting really jumbled up in all this Eige...,0.46,7
3741,1.394904e+09,How do programs like 'iTunes visualiser' work?...,,0.50,3
3742,1.389487e+09,Why are socks so hard to put on with wet feet?,,0.50,1


In [17]:
# Timestamped snapshot so the acquisition date/time is recoverable from the file name.
askscience_csv_path = Path("data") / f"AskScience_{datetime.now().strftime('%m-%d-%y_%H-%M')}.csv"
# to_csv does not create missing directories, so make sure data/ exists first.
askscience_csv_path.parent.mkdir(parents=True, exist_ok=True)
df2.to_csv(askscience_csv_path, index=False)

Using PRAW (Python Reddit API Wrapper), an API wrapper that allows easy access to Reddit information without the need for manual scraping, I acquired thousands of posts from each subreddit and saved them to their own CSV files, whose names include the date and time so that I can see when the data was acquired.