In [49]:
import json
import lzma
import os
import bz2
import pprint
import numpy as np
import pandas as pd
from collections import defaultdict

In [50]:
def get_subreddit_list():
    # return list of lower case subreddit names
    subs = pd.read_csv('../reference/subreddits.txt', header=None)
    outsubs = list(sub.lower() for sub in subs[0])
    return outsubs

In [51]:
def parse_file(filename):
    # read in local file and parse all submissions for relevant subs
    errors = []
    # Creating a file pointer to read the .bz2 file
    fp = bz2.BZ2File(filename,'r')

    subs = get_subreddit_list()
    
    # initialize dictionary to contain output 
    output = defaultdict(list)
    
    # for all lines in file
    for i,line in enumerate(fp):
        if i > 1: # Processing only first 3 lines - Remove this while parsing the entire data
            break
        # read line as JSON dictionary
        try:
            json_dict = json.loads(line)
            print(json_dict)
            subreddit = json_dict['subreddit'].lower()
            if subreddit in subs:
                output[subreddit].append(json_dict)
        except:
            errors.append(filename)
    fp.close()

    with open('../data/comments/{}.json'.format(filename[:-4]), 'w') as outfile:
        json.dump(output, outfile)

In [4]:
parse_file('RC_2017-12.xz')

IOError: invalid data stream

In [4]:
def extract_comments():
    subs = get_subreddit_list()
    # for each monthly file
    for year in range(2012, 2019):
        for month in range(1,13):
            # retrieve data file to local disc
            link = 'http://files.pushshift.io/reddit/comments/'
            filename = 'RC_{}-{:02}.bz2'.format(year, month)
            get_file = link + filename
            try:
                !wget $get_file
            except:
                # skip month if error with wget
                continue
            # parse file for subreddits matching list of subreddits
            parse_file(filename)
            try:
                !rm $filename
            except:
                pass

In [None]:
extract_comments()

--2018-11-24 10:48:18--  http://files.pushshift.io/reddit/comments/RC_2012-01.bz2
Resolving files.pushshift.io (files.pushshift.io)... 104.27.135.62, 104.27.134.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.135.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1514838863 (1.4G) [application/octet-stream]
Saving to: ‘RC_2012-01.bz2’


2018-11-24 10:53:15 (4.86 MB/s) - ‘RC_2012-01.bz2’ saved [1514838863/1514838863]

--2018-11-24 11:04:05--  http://files.pushshift.io/reddit/comments/RC_2012-02.bz2
Resolving files.pushshift.io (files.pushshift.io)... 104.27.135.62, 104.27.134.62, 2606:4700:30::681b:873e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.135.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1505314039 (1.4G) [application/octet-stream]
Saving to: ‘RC_2012-02.bz2’


2018-11-24 11:08:59 (4.89 MB/s) - ‘RC_2012-02.bz2’ saved [1505314039/1505314039]

--2018-11-24 11:

In [9]:
def extract_comments_two():
    subs = get_subreddit_list()
    # for each monthly file
    for year in range(2014, 2019):
        for month in range(1,13):
            if year == 2014 and month < 12:
                continue
            # retrieve data file to local disc
            link = 'http://files.pushshift.io/reddit/comments/'
            filename = 'RC_{}-{:02}.bz2'.format(year, month)
            get_file = link + filename
            try:
                !wget $get_file
            except:
                # skip month if error with wget
                continue
            # parse file for subreddits matching list of subreddits
            parse_file(filename)
            try:
                !rm $filename
            except:
                pass

In [10]:
extract_comments_two()

--2018-11-25 18:58:29--  http://files.pushshift.io/reddit/comments/RC_2014-12.bz2
Resolving files.pushshift.io (files.pushshift.io)... 104.27.135.62, 104.27.134.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.135.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4889955672 (4.6G) [application/octet-stream]
Saving to: ‘RC_2014-12.bz2’


2018-11-25 19:14:36 (4.82 MB/s) - ‘RC_2014-12.bz2’ saved [4889955672/4889955672]

--2018-11-25 19:43:51--  http://files.pushshift.io/reddit/comments/RC_2015-01.bz2
Resolving files.pushshift.io (files.pushshift.io)... 104.27.135.62, 104.27.134.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.135.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5452413560 (5.1G) [application/octet-stream]
Saving to: ‘RC_2015-01.bz2’


2018-11-25 20:01:37 (4.88 MB/s) - ‘RC_2015-01.bz2’ saved [5452413560/5452413560]

--2018-11-25 20:

IOError: invalid data stream

In [54]:
def parse_file_lzma(filename):
    # read in local file and parse all submissions for relevant subs
    errors = []
    # Creating a file pointer to read the .bz2 file
    fp = lzma.LZMAFile(filename, mode='r')

    subs = get_subreddit_list()
    
    # initialize dictionary to contain output 
    output = defaultdict(list)
    
    for sub in subs:
        output[sub] = []
    
    # for all lines in file
    counter = 0
    for i,line in enumerate(fp):
        counter += 1
        #if i > 0: # Processing only first 3 lines - Remove this while parsing the entire data
        #    break
        # read line as JSON dictionary
        try:
            json_dict = json.loads(line)

            subreddit = str(json_dict['subreddit']).lower()
            if subreddit in subs:
                #pprint.pprint(json_dict)
                output[subreddit].append(json_dict)
        except:
            errors.append(json_dict)
            continue
    fp.close()
    
    print("number of errors: {}".format(len(errors)))
    print('number of lines: {}'.format(i))
    pprint.pprint(errors)
    #with open('../data/comments/{}.json'.format(filename[:-3]), 'w') as outfile:
    #    json.dump(output, outfile)

In [55]:
parse_file_lzma('RC_2017-12.xz')

number of errors: 1
number of lines: 296751
[{u'author': u'FourDM',
  u'author_flair_css_class': None,
  u'author_flair_text': None,
  u'body': u'Dude, chill the fuck out.\n\nPeople with a "there ought to be a law" disposition (such are yourself) are a far greater menace to society than senior citizens.',
  u'can_gild': True,
  u'controversiality': 0,
  u'created_utc': 1512093960,
  u'distinguished': None,
  u'edited': False,
  u'gilded': 0,
  u'id': u'dql7su4',
  u'is_submitter': False,
  u'link_id': u't3_7go19o',
  u'parent_id': u't1_dql26zr',
  u'permalink': u'/r/Justrolledintotheshop/comments/7go19o/72_year_old_woman_drove_onto_a_curb_and_lost_oil/dql7su4/',
  u'retrieved_on': 1514216296,
  u'score': -23,
  u'stickied': False,
  u'subreddit': u'Justrolledintotheshop',
  u'subreddit_id': u't5_2tteh',
  u'subreddit_type': u'public'}]


In [31]:
def extract_comments_lzma():
    subs = get_subreddit_list()
    # for each monthly file
    for year in range(2017, 2019):
        for month in range(1,13):
            # skip list of those already iterated
            if year == 2017 and month < 12:
                continue
            if year==2018 and month > 10: break
            # retrieve data file to local disc
            link = 'http://files.pushshift.io/reddit/comments/'
            filename = 'RC_{}-{:02}.xz'.format(year, month)
            get_file = link + filename
            try:
                !wget $get_file
            except:
                # skip month if error with wget
                continue
            # parse file for subreddits matching list of subreddits
            print('parsing {}'.format(filename))
            parse_file_lzma(filename)

In [8]:
extract_comments_lzma()

--2018-11-28 23:33:06--  http://files.pushshift.io/reddit/comments/RC_2017-12.xz
Resolving files.pushshift.io (files.pushshift.io)... 104.27.135.62, 104.27.134.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.135.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7635154760 (7.1G) [application/octet-stream]
Saving to: ‘RC_2017-12.xz’


2018-11-28 23:58:10 (4.84 MB/s) - ‘RC_2017-12.xz’ saved [7635154760/7635154760]

parsing RC_2017-12.xz
--2018-11-28 23:58:17--  http://files.pushshift.io/reddit/comments/RC_2018-01.xz
Resolving files.pushshift.io (files.pushshift.io)... 104.27.134.62, 104.27.135.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.134.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8228348756 (7.7G) [application/octet-stream]
Saving to: ‘RC_2018-01.xz’


2018-11-29 00:25:10 (4.87 MB/s) - ‘RC_2018-01.xz’ saved [8228348756/8228348756]



In [9]:
# iterate through all subreddits
for sub in get_subreddit_list():
    print('extracting {}...'.format(sub))
    output = dict()
    i = 0
    # iterate through all submission files
    for sub_file in os.listdir('../data/comments'):
        if sub_file[:2] != 'RC': continue
        with open('../data/comments/' + sub_file) as f:
            print('parsing {}'.format(str(sub_file)))
            data = json.load(f)
            # skip if subreddit not in data file
            if sub not in data.keys(): continue
            # for each dictionary in list of submissions
            for comment in data[sub]:
                com_data = dict()
                try:
                    com_data['author'] = comment['author']
                except:
                    pass
                try:
                    com_data['id'] = comment['id']
                except:
                    pass
                try:
                    com_data['time'] = comment['created_utc']
                except:
                    pass
                try:
                    com_data['downs'] = comment['downs']
                except:
                    pass
                try:
                    com_data['ups'] = comment['ups']
                except:
                    pass
                try:
                    com_data['body'] = comment['body']
                except:
                    pass
                try:
                    com_data['parent_id'] = comment['parent_id']
                except:
                    pass
                try:
                    com_data['link_id'] = comment['link_id']
                except:
                    pass
                output[i] = com_data
                i += 1
    print('writing {} file...'.format(sub))
    with open('../data/final_comments/{}.json'.format(sub), 'w') as outfile:
        json.dump(output, outfile)

extracting bitcoin...
parsing RC_2012-01.json
parsing RC_2012-02.json
parsing RC_2012-03.json
parsing RC_2012-04.json
parsing RC_2012-05.json
parsing RC_2012-06.json
parsing RC_2012-07.json
parsing RC_2012-08.json
parsing RC_2012-09.json
parsing RC_2012-10.json
parsing RC_2012-11.json
parsing RC_2012-12.json
parsing RC_2013-01.json
parsing RC_2013-02.json
parsing RC_2013-03.json
parsing RC_2013-04.json
parsing RC_2013-05.json
parsing RC_2013-06.json
parsing RC_2013-07.json
parsing RC_2013-08.json
parsing RC_2013-09.json
parsing RC_2013-10.json
parsing RC_2013-11.json
parsing RC_2013-12.json
parsing RC_2014-01.json
parsing RC_2014-02.json
parsing RC_2014-03.json
parsing RC_2014-04.json
parsing RC_2014-05.json
parsing RC_2014-06.json
parsing RC_2014-07.json
parsing RC_2014-08.json
parsing RC_2014-09.json
parsing RC_2014-10.json
parsing RC_2014-11.json
parsing RC_2014-12.json
parsing RC_2015-01.json
parsing RC_2015-02.json
parsing RC_2015-03.json
parsing RC_2015-04.json
parsing RC_2015-05

In [16]:
for sub_file in os.listdir('../data/comments'):
    if sub_file[:2] != 'RC': continue
    print(str(sub_file))

RC_2012-01.json
RC_2012-02.json
RC_2012-03.json
RC_2012-04.json
RC_2012-05.json
RC_2012-06.json
RC_2012-07.json
RC_2012-08.json
RC_2012-09.json
RC_2012-10.json
RC_2012-11.json
RC_2012-12.json
RC_2013-01.json
RC_2013-02.json
RC_2013-03.json
RC_2013-04.json
RC_2013-05.json
RC_2013-06.json
RC_2013-07.json
RC_2013-08.json
RC_2013-09.json
RC_2013-10.json
RC_2013-11.json
RC_2013-12.json
RC_2014-01.json
RC_2014-02.json
RC_2014-03.json
RC_2014-04.json
RC_2014-05.json
RC_2014-06.json
RC_2014-07.json
RC_2014-08.json
RC_2014-09.json
RC_2014-10.json
RC_2014-11.json
RC_2014-12.json
RC_2015-01.json
RC_2015-02.json
RC_2015-03.json
RC_2015-04.json
RC_2015-05.json
RC_2015-06.json
RC_2015-07.json
RC_2015-08.json
RC_2015-09.json
RC_2015-10.json
RC_2015-11.json
RC_2015-12.json
RC_2017-12.json
RC_2018-01.json
RC_2018-02.json
RC_2018-03.json
RC_2018-04.json
RC_2018-05.json
RC_2018-06.json
RC_2018-07.json
RC_2018-08.json
RC_2018-09.json
RC_2018-10.json
RC_2016-01.json
RC_2016-02.json
RC_2016-03.json
RC_2016-

In [16]:
parse_file('RS_2011-01.bz2')

In [6]:
filename = 'RS_2011-01.bz2'

# Creating a file pointer to read the .bz2 file
fp = bz2.BZ2File(filename,'r')

subs = get_subreddit_list()

# for loop to iterate over every record in the file and parse them
# this for loop reads one line in the file at a time
output = defaultdict(list)
for i,line in enumerate(fp):
    if i > 200: # Processing only first 3 lines - Remove this while parsing the entire data
        break
    json_dict = json.loads(line) # Converting the json object(line) into a dictionary
    print(json_dict['subreddit_id']) # Prints the subreddit_id
    if json_dict['subreddit'] not in subs:
        output[json_dict['subreddit']].append(json_dict['subreddit_id'])
    
    
fp.close()

with open('{}.json'.format(filename[:-4]), 'w') as outfile:
    json.dump(output, outfile)

t5_2qh0u
t5_2s3uz
t5_2qh68
t5_2qh33
t5_2qh33
t5_2qhlm
t5_2s7d1
t5_2qhix
t5_2qh2z
t5_2qh03
t5_2qh61
t5_2r5vt
t5_2qh1i
t5_2qj8f
t5_2qh33
t5_2qh1u
t5_2qh61
t5_2qh16
t5_2qh1i
t5_2r9vp
t5_2qh1i
t5_2qh2b
t5_6
t5_2qh7q
t5_6
t5_2qh6c
t5_2qpol
t5_2qgzt
t5_2qh03
t5_2qzb6
t5_2qh33
t5_2qh61
t5_2qh33
t5_2qtnt
t5_6
t5_6
t5_2qh03
t5_6
t5_2qtp5
t5_2r05i
t5_2s4lk
t5_2qh03
t5_2qh1i
t5_2qh63
t5_2qh6z
t5_2r5vt
t5_2qh0u
t5_2qqlo
t5_2qhk3
t5_2qh61
t5_2qpp6
t5_2qio8
t5_2s7tt
t5_2qh1i
t5_2r5vt
t5_6
t5_2qp30
t5_2qh1i
t5_6
t5_2s8wf
t5_2qkeh
t5_2qh61
t5_6
t5_2s7tt
t5_2qh1u
t5_2rp0r
t5_2qhfi
t5_2s7po
t5_6
t5_2qio8
t5_6
t5_2qh61
t5_2r7eu
t5_6
t5_6
t5_6
t5_6
t5_2qh1i
t5_2qh1i
t5_2s67l
t5_2qh6c
t5_2qy7e
t5_2r0gj
t5_2qpol
t5_2qq5c
t5_2r5vt
t5_2qh1i
t5_2qh1x
t5_6
t5_2qh2a
t5_2r05i
t5_2qpp6
t5_6
t5_2qm4e
t5_2qhud
t5_2qqlo
t5_2qhhw
t5_2qh2n
t5_2qh0u
t5_2qpp6
t5_2qh03
t5_2qh53
t5_6
t5_2qh0f
t5_2qhjq
t5_2s1g4
t5_2qjvn
t5_2s93y
t5_6
t5_6
t5_2r367
t5_2qhrv
t5_6
t5_2qh0f
t5_2r5vt
t5_6
t5_2qht2
t5_6
t5_2qhx4
t5_2qh0u
t5_6
t5_

In [16]:
filename = 'RS_2011-01.bz2'

# Creating a file pointer to read the .bz2 file
fp = bz2.BZ2File(filename,'r')

# for loop to iterate over every record in the file and parse them
# this for loop reads one line in the file at a time
output = dict()

data = json.load(fp)


KeyboardInterrupt: 

In [10]:
json_dict.keys()

[u'domain',
 u'banned_by',
 u'media_embed',
 u'subreddit',
 u'selftext_html',
 u'selftext',
 u'likes',
 u'link_flair_text',
 u'id',
 u'clicked',
 u'title',
 u'media',
 u'score',
 u'approved_by',
 u'over_18',
 u'hidden',
 u'thumbnail',
 u'subreddit_id',
 u'edited',
 u'link_flair_css_class',
 u'author_flair_css_class',
 u'downs',
 u'saved',
 u'is_self',
 u'permalink',
 u'name',
 u'created',
 u'url',
 u'author_flair_text',
 u'author',
 u'created_utc',
 u'ups',
 u'num_comments',
 u'num_reports',
 u'distinguished']

In [11]:
filename = 'RS_2011-01.bz2'
print(filename[:-4])

RS_2011-01


In [5]:
!wget 'http://files.pushshift.io/reddit/submissions/RS_2011-01.bz2'

--2018-11-23 18:49:00--  http://files.pushshift.io/reddit/submissions/RS_2011-01.bz2
Resolving files.pushshift.io (files.pushshift.io)... 104.27.134.62, 104.27.135.62, 2606:4700:30::681b:863e, ...
Connecting to files.pushshift.io (files.pushshift.io)|104.27.134.62|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 151006946 (144M) [application/octet-stream]
Saving to: ‘RS_2011-01.bz2’


2018-11-23 18:49:30 (4.87 MB/s) - ‘RS_2011-01.bz2’ saved [151006946/151006946]



In [26]:
teststring = 'this is a test string'
letters = 'abcdg'
output = defaultdict(list)
for letter in teststring:
    if letter in letters:
        output[letter].append(1)

In [27]:
output

defaultdict(list, {'a': [1], 'g': [1]})

In [8]:
subs = get_subreddit_list()

In [9]:
subs

['bitcoin',
 'bitcoinbeginners',
 'bitcoincash',
 'bitcoindiscussion',
 'bitcoinmarkets',
 'bitcoinmining',
 'bitcointechnology',
 'bitcointrading',
 'btc',
 'cryptocurrency',
 'cryptomarkets',
 'cryptotrade',
 'ethanalysis',
 'ethdapps',
 'ethdev',
 'ethereum',
 'ethermining',
 'ethinvestor',
 'ethereumcommunity',
 'ethereumnoobies',
 'ethinsider',
 'ethtrader',
 'ethtraderpro',
 'gpumining']

In [8]:
for year in range(2012, 2019):
    for month in range(1,13):
        print('http://files.pushshift.io/reddit/submissions/RS_{}-{:02}.bz2/'.format(year, month))

http://files.pushshift.io/reddit/submissions/RS_2012-01.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-02.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-03.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-04.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-05.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-06.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-07.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-08.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-09.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-10.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-11.bz2/
http://files.pushshift.io/reddit/submissions/RS_2012-12.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-01.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-02.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-03.bz2/
http://files.pushshift.io/reddit/submissions/RS_2013-04.bz2/
http://files.pushshift.i

In [9]:
for _ in range(2):
    !wget 'http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv'

--2018-11-23 17:46:27--  http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv
Resolving samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)... 52.216.81.192
Connecting to samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)|52.216.81.192|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113183 (111K) [application/x-csv]
Saving to: ‘Sacramentorealestatetransactions.csv.1’


2018-11-23 17:46:27 (19.3 MB/s) - ‘Sacramentorealestatetransactions.csv.1’ saved [113183/113183]

--2018-11-23 17:46:28--  http://samplecsvs.s3.amazonaws.com/Sacramentorealestatetransactions.csv
Resolving samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)... 52.216.225.184
Connecting to samplecsvs.s3.amazonaws.com (samplecsvs.s3.amazonaws.com)|52.216.225.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 113183 (111K) [application/x-csv]
Saving to: ‘Sacramentorealestatetransactions.csv.2’


2018-11-23 17:46:28 (2.12 MB/s) - ‘Sacrame