# Getting Reddit Data

There are two options for extracting data from Reddit:

* The `requests` library, which will allow us to interface directly with the Reddit API.

* The PRAW library, which is a wrapper library that adds an extra layer of abstraction in accessing the Reddit API.

Here we will cover the first option, using the `requests` library to interface directly with the API.

The final extraction script will look like this:

In [18]:
import requests
import pandas as pd


class Reddit:
    def __init__(self, client_id, secret_token, username, password):
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # pull auth bearer token from response
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    def get_new(self, subreddit, iters):
        # initialize dataframe to store data
        df = pd.DataFrame()
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for i in range(iters):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            # check that we returned something (if not we reached end)
            if len(res.json()['data']['children']) == 0:
                print('No more found')
                return df
            # iterate through each thread recieved
            for thread in res.json()['data']['children']:
                # add info to dataframe
                df = df.append({
                    'id': thread['data']['name'],
                    'created_utc': int(thread['data']['created_utc']),
                    'subreddit': thread['data']['subreddit'],
                    'title': thread['data']['title'],
                    'selftext': thread['data']['selftext'],
                    'upvote_ratio': thread['data']['upvote_ratio'],
                    'ups': thread['data']['ups'],
                    'downs': thread['data']['downs'],
                    'score': thread['data']['score']
                }, ignore_index=True)
            # get earliest ID
            earliest = df['id'].iloc[len(df)-1]
            # add earliest ID to params
            params['after'] = earliest
        return df

In [19]:
SUB = 'investing'

In [20]:
# Load credentials
import json
f = open('reddit_tkn.json')
tkn = json.load(f)

In [21]:
reddit = Reddit(tkn['client_id'], tkn['secret_token'], tkn['user'], tkn['pwd'])

In [22]:
data = reddit.get_new(SUB, 20)

No more found


In [25]:
data.head()

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio
0,1646144000.0,0.0,t3_t48xbu,1.0,My 9yr old wants to invest his 300$ so he can ...,investing,Investment options for children?,1.0,1.0
1,1646130000.0,0.0,t3_t44pro,597.0,"Again, the world's major chip and semiconducto...",investing,Ukraine supplies 70% of the world's neon. Chip...,597.0,0.95
2,1646129000.0,0.0,t3_t44j5g,2.0,Have a general question? Want to offer some c...,investing,Daily General Discussion and Advice Thread - M...,2.0,0.76
3,1646091000.0,0.0,t3_t3t1i6,15.0,Can someone explain how/why something like the...,investing,How would the RSX get Delisted?,15.0,0.77
4,1646075000.0,0.0,t3_t3n3cz,0.0,Many years ago I had money put in DODBX for me...,investing,trying to figure DODBX out,0.0,0.14


In [23]:
data = data.replace({'|': ''}, regex=True)

In [24]:
data.to_csv(f'./data/reddit_{SUB}.csv', sep='|', index=False)