In [1]:
import json
import lzma
import bz2
import numpy as np
import pandas as pd
import pprint
from collections import defaultdict

In [2]:
def get_subreddit_list():
    # return list of lower case subreddit names
    subs = pd.read_csv('../reference/subreddits.txt', header=None)
    outsubs = list(sub.lower() for sub in subs[0])
    return outsubs

In [None]:
def parse_file(filename):
    # read in local file and parse all submissions for relevant subs
    errors = []
    # Creating a file pointer to read the .bz2 file
    fp = bz2.BZ2File(filename,'r')

    subs = get_subreddit_list()
    
    # initialize dictionary to contain output 
    output = defaultdict(list)
    
    # for all lines in file
    for i,line in enumerate(fp):
        #if i > 200: # Processing only first 3 lines - Remove this while parsing the entire data
            #break
        # read line as JSON dictionary
        try:
            json_dict = json.loads(line)
            subreddit = json_dict['subreddit'].lower()
            if subreddit in subs:
                output[subreddit].append(json_dict)
        except:
            errors.append(filename)
    fp.close()

    with open('../data/submissions/{}.json'.format(filename[:-4]), 'w') as outfile:
        json.dump(output, outfile)

In [None]:
def extract_submissions():
    subs = get_subreddit_list()
    # for each monthly file
    for year in range(2017, 2018):
        for month in range(1,13):
            # retrieve data file to local disc
            if year == 2017 and month > 11: continue
            link = 'http://files.pushshift.io/reddit/submissions/'
            filename = 'RS_{}-{:02}.bz2'.format(year, month)
            get_file = link + filename
            try:
                !wget $get_file
            except:
                # skip month if error with wget
                continue
            # parse file for subreddits matching list of subreddits
            parse_file(filename)

In [None]:
extract_submissions()

In [37]:
def parse_file_lzma(filename):
    # read in local file and parse all submissions for relevant subs
    errors = []
    # Creating a file pointer to read the .bz2 file
    fp = lzma.LZMAFile(filename, mode='r')
    subs = get_subreddit_list()
    
    # initialize dictionary to contain output 
    output = defaultdict(list)
    
    # for all lines in file
    for i,line in enumerate(fp):
        #if i > 50: # Processing only first 3 lines - Remove this while parsing the entire data
        #    break
        # read line as JSON dictionary
        try:
            json_dict = json.loads(line)
            
            subreddit = str(json_dict['subreddit']).lower()
            if subreddit in subs:
                output[subreddit].append(json_dict)
        except:
            continue
    fp.close()
    
    return output
    
    with open('../data/submissions/{}.json'.format(filename[:-3]), 'w') as outfile:
        json.dump(output, outfile)

In [38]:
output = parse_file_lzma('RS_2017-12.xz')

ERROR!
{u'adserver_click_url': None,
 u'adserver_imp_pixel': None,
 u'archived': False,
 u'author': u'AstroglideAds',
 u'author_flair_css_class': None,
 u'author_flair_text': None,
 u'author_id': u't2_15l5zt',
 u'brand_safe': False,
 u'contest_mode': False,
 u'created_utc': 1512086436,
 u'disable_comments': False,
 u'distinguished': None,
 u'domain': u'astroglide.com',
 u'domain_override': None,
 u'edited': False,
 u'embed_type': None,
 u'embed_url': None,
 u'gilded': 0,
 u'hidden': False,
 u'hide_score': False,
 u'href_url': u'https://www.astroglide.com/where-to-buy/?utm_source=reddit&amp;utm_medium=social&amp;utm_campaign=Q22018_routine',
 u'id': u'7gqvgs',
 u'is_crosspostable': False,
 u'is_reddit_media_domain': False,
 u'is_self': False,
 u'is_video': False,
 u'link_flair_css_class': None,
 u'link_flair_text': None,
 u'locked': False,
 u'media': None,
 u'media_embed': {},
 u'mobile_ad_url': u'https://b.thumbs.redditmedia.com/3w7iuQ_ZwI-zCCLz4IDCuypCPPHVq2I6TZO2aPFbqnE.jpg',
 u'num_

In [39]:
len(output['bitcoin'])

385

In [4]:
def extract_submissions_lzma():
    subs = get_subreddit_list()
    # for each monthly file
    for year in range(2017, 2019):
        for month in range(1,13):
            # skip list of those already iterated
            if year == 2017 and month < 12:
                continue
            # retrieve data file to local disc
            #link = 'http://files.pushshift.io/reddit/submissions/'
            filename = 'RS_{}-{:02}.xz'.format(year, month)
            #get_file = link + filename
            #try:
            #    !wget $get_file
            #except:
                # skip month if error with wget
            #    continue
            # parse file for subreddits matching list of subreddits
            print('parsing {}'.format(filename))
            parse_file_lzma(filename)

In [5]:
extract_submissions_lzma()

parsing RS_2017-12.xz
parsing RS_2018-01.xz
parsing RS_2018-02.xz
parsing RS_2018-03.xz
parsing RS_2018-04.xz
parsing RS_2018-05.xz
parsing RS_2018-06.xz


KeyboardInterrupt: 

In [16]:
parse_file('RS_2011-01.bz2')

In [6]:
filename = 'RS_2011-01.bz2'

# Creating a file pointer to read the .bz2 file
fp = bz2.BZ2File(filename,'r')

subs = get_subreddit_list()

# for loop to iterate over every record in the file and parse them
# this for loop reads one line in the file at a time
output = defaultdict(list)
for i,line in enumerate(fp):
    if i > 200: # Processing only first 3 lines - Remove this while parsing the entire data
        break
    json_dict = json.loads(line) # Converting the json object(line) into a dictionary
    print(json_dict['subreddit_id']) # Prints the subreddit_id
    if json_dict['subreddit'] not in subs:
        output[json_dict['subreddit']].append(json_dict['subreddit_id'])
    
    
fp.close()

with open('{}.json'.format(filename[:-4]), 'w') as outfile:
    json.dump(output, outfile)

t5_2qh0u
t5_2s3uz
t5_2qh68
t5_2qh33
t5_2qh33
t5_2qhlm
t5_2s7d1
t5_2qhix
t5_2qh2z
t5_2qh03
t5_2qh61
t5_2r5vt
t5_2qh1i
t5_2qj8f
t5_2qh33
t5_2qh1u
t5_2qh61
t5_2qh16
t5_2qh1i
t5_2r9vp
t5_2qh1i
t5_2qh2b
t5_6
t5_2qh7q
t5_6
t5_2qh6c
t5_2qpol
t5_2qgzt
t5_2qh03
t5_2qzb6
t5_2qh33
t5_2qh61
t5_2qh33
t5_2qtnt
t5_6
t5_6
t5_2qh03
t5_6
t5_2qtp5
t5_2r05i
t5_2s4lk
t5_2qh03
t5_2qh1i
t5_2qh63
t5_2qh6z
t5_2r5vt
t5_2qh0u
t5_2qqlo
t5_2qhk3
t5_2qh61
t5_2qpp6
t5_2qio8
t5_2s7tt
t5_2qh1i
t5_2r5vt
t5_6
t5_2qp30
t5_2qh1i
t5_6
t5_2s8wf
t5_2qkeh
t5_2qh61
t5_6
t5_2s7tt
t5_2qh1u
t5_2rp0r
t5_2qhfi
t5_2s7po
t5_6
t5_2qio8
t5_6
t5_2qh61
t5_2r7eu
t5_6
t5_6
t5_6
t5_6
t5_2qh1i
t5_2qh1i
t5_2s67l
t5_2qh6c
t5_2qy7e
t5_2r0gj
t5_2qpol
t5_2qq5c
t5_2r5vt
t5_2qh1i
t5_2qh1x
t5_6
t5_2qh2a
t5_2r05i
t5_2qpp6
t5_6
t5_2qm4e
t5_2qhud
t5_2qqlo
t5_2qhhw
t5_2qh2n
t5_2qh0u
t5_2qpp6
t5_2qh03
t5_2qh53
t5_6
t5_2qh0f
t5_2qhjq
t5_2s1g4
t5_2qjvn
t5_2s93y
t5_6
t5_6
t5_2r367
t5_2qhrv
t5_6
t5_2qh0f
t5_2r5vt
t5_6
t5_2qht2
t5_6
t5_2qhx4
t5_2qh0u
t5_6
t5_

In [16]:
filename = 'RS_2011-01.bz2'

# Creating a file pointer to read the .bz2 file
fp = bz2.BZ2File(filename,'r')

# for loop to iterate over every record in the file and parse them
# this for loop reads one line in the file at a time
output = dict()

data = json.load(fp)


KeyboardInterrupt: 

In [10]:
json_dict.keys()

[u'domain',
 u'banned_by',
 u'media_embed',
 u'subreddit',
 u'selftext_html',
 u'selftext',
 u'likes',
 u'link_flair_text',
 u'id',
 u'clicked',
 u'title',
 u'media',
 u'score',
 u'approved_by',
 u'over_18',
 u'hidden',
 u'thumbnail',
 u'subreddit_id',
 u'edited',
 u'link_flair_css_class',
 u'author_flair_css_class',
 u'downs',
 u'saved',
 u'is_self',
 u'permalink',
 u'name',
 u'created',
 u'url',
 u'author_flair_text',
 u'author',
 u'created_utc',
 u'ups',
 u'num_comments',
 u'num_reports',
 u'distinguished']

In [11]:
filename = 'RS_2011-01.bz2'
print(filename[:-4])

RS_2011-01


In [5]:
!wget 'http://files.pushshift.io/reddit/submissions/RS_2011-01.bz2'

--2018-11-23 18:49:00--  http://files.pushshift.io/reddit/submissions/RS_2011-01.bz2
Resolving files.pushshift.io (files.pushshift.io)... 104.27.134.62, 104.27.135.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.134.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 151006946 (144M) [application/octet-stream]
Saving to: ‘RS_2011-01.bz2’


2018-11-23 18:49:30 (4.87 MB/s) - ‘RS_2011-01.bz2’ saved [151006946/151006946]



In [26]:
teststring = 'this is a test string'
letters = 'abcdg'
output = defaultdict(list)
for letter in teststring:
    if letter in letters:
        output[letter].append(1)

In [27]:
output

defaultdict(list, {'a': [1], 'g': [1]})

In [8]:
subs = get_subreddit_list()

In [9]:
subs

['bitcoin',
 'bitcoinbeginners',
 'bitcoincash',
 'bitcoindiscussion',
 'bitcoinmarkets',
 'bitcoinmining',
 'bitcointechnology',
 'bitcointrading',
 'btc',
 'cryptocurrency',
 'cryptomarkets',
 'cryptotrade',
 'ethanalysis',
 'ethdapps',
 'ethdev',
 'ethereum',
 'ethermining',
 'ethinvestor',
 'ethereumcommunity',
 'ethereumnoobies',
 'ethinsider',
 'ethtrader',
 'ethtraderpro',
 'gpumining']

In [8]:
for year in range(2012, 2019):
    for month in range(1,13):
        print('http://files.pushshift.io/reddit/submissions/RS_{}-{:02}.bz2/'.format(year, month))

http://files.pushshift.io/reddit/submissions/RS_2012-01.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-02.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-03.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-04.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-05.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-06.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-07.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-08.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-09.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-10.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-11.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-12.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-01.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-02.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-03.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-04.bz2/
http://files.pushshift.i

In [9]:
for _ in range(2):
    !wget 'http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv'

--2018-11-23 17:46:27--  http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv
Resolving samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)... 52.216.81.192
Connecting to samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)|52.216.81.192|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113183 (111K) [application/x-csv]
Saving to: ‘Sacramentorealestatetransactions.csv.1’


2018-11-23 17:46:27 (19.3 MB/s) - ‘Sacramentorealestatetransactions.csv.1’ saved [113183/113183]

--2018-11-23 17:46:28--  http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv
Resolving samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)... 52.216.225.184
Connecting to samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)|52.216.225.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113183 (111K) [application/x-csv]
Saving to: ‘Sacramentorealestatetransactions.csv.2’


2018-11-23 17:46:28 (2.12 MB/s) - ‘Sacrame