In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
import os
from pprint import pprint
from collections import defaultdict

In [2]:
DATEFORMAT = "%Y-%m-%d %H:%M"

In [3]:
def convert_from_utc(utc_time):
    """
    Convert from UTC to datetime object
    """
    return datetime.utcfromtimestamp(utc_time)

def convert_to_datetime(inputdt):
    """
    Return datetime object with date for parameter
    """
    return datetime.strftime(inputdt, DATEFORMAT)

def convert_time(utc_time):
    return convert_to_datetime(convert_from_utc(utc_time))

In [14]:
def get_comment_dictionary(body=False):
    # read in comment dictionary from file
    if body:
        comment_dict_dir = '../../data/final/dictionaries/comments.json'
    else:
        comment_dict_dir = '../../data/final/dictionaries/comments-no-body.json'
        
    with open(comment_dict_dir,'r') as f:
            # load data
            file_data = json.load(f)
    
    return file_data   

In [None]:
test = 1386623654

In [None]:
test_dt = convert_from_utc(test)

In [None]:
convert_time(test)

In [None]:
comments = get_comment_dictionary()

In [13]:
def parse_repo_files(output):
    # read in all submission files from submission folder
    file_dir = '../../data/final/submissions'
    # directory of submissions
        
    # extract data from repo files
    print('Getting REPO Data...')
    for sub_file in os.listdir(file_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = file_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            pprint(file_data['1'])
            
            # for each subreddit and list of submissions for that sub
            for subreddit, submissions in file_data.items():
                # for each submission in list of submissions
                for submission in submissions:
                    if type(submission) != dict: continue
                    pprint(submission)
                    pprint(submissions)
                    pprint(subreddit)

                    submission_dict = dict()

                    submission_dict['author_id'] = submission['author']
                    submission_dict['time'] = datetime.utcfromtimestamp(int(submission["time"])).strftime(DATEFORMAT)
                    submission_dict['subreddit'] = subreddit
                    submission_dict['title'] = submission['title']
                    submission_dict['body'] = submission['body']
                    submission_dict['score'] = submission['score']
                    
                    output[submission['id']] = submission_dict
                    
    return output

In [12]:
output = dict()
output = parse_repo_files(output)

Getting REPO Data...
parsing bitcoin.json...
{u'author': u'bilotrace',
 u'body': u'',
 u'downs': 8,
 u'id': u'nyk7m',
 u'num_comments': 15,
 u'time': 1325425821,
 u'title': u'This is why we need bitcoins',
 u'ups': 87,
 u'url': u'http://www.youtube.com/watch?feature=player_embedded&amp;v=xv3AhTL13BU'}
u'body'
{u'author': u'shortbitcoin',
 u'body': u'',
 u'downs': 0,
 u'id': u'3diyjz',
 u'num_comments': 30,
 u'time': u'1437067578',
 u'title': u'Bitcoin Price Collapses; What Now?',
 u'ups': 0,
 u'url': u'http://bitcoinwarrior.net/2015/07/bitcoin-price-collapses-what-now/'}
u'287144'
body


In [None]:
def parse_api_files(output):
    # read in all comment files from comment folder
    api_dir = '../../data/final/api_data'

    # directory of comments
    
    # extract data from reddit API
    print('Getting API Data...')
    for sub_file in os.listdir(api_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = api_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            #pprint(file_data['1'])
            #break
            # for each entry
            for entry in file_data.keys():

                # load comments list into comment variable
                comments = file_data[entry]['comments']
                
                # iterate through each comment
                for comment in comments:
                    # create dict to hold comment dictionary info
                    comment_dict = dict()

                    # error handle to ensure all are parsed
                    try:
                        if comment[2] == '' or comment[2] == ' ':
                            comment_dict['author_id'] = '[deleted]'
                        else:
                            comment_dict['author_id'] = comment[2]
                    except:
                        pass
                    try:
                        comment_dict['subreddit'] = filename
                    except:
                        pass
                    try:
                        comment_dict['time'] = comment[0][:-3]
                    except:
                        pass
                    try:
                        comment_dict['link_id'] = comment[6]
                    except:
                        pass
                    try:
                        comment_dict['parent_id'] = comment[7]
                    except:
                        pass
                    try:
                        comment_dict['score'] = comment[4]
                    except:
                        pass
                    try:
                        comment_dict['body'] =  comment[3]
                    except:
                        pass
                    output[comment[1]] = comment_dict
    
    return output

In [None]:
def write_output(output):
    # write data out to disk
    print('writing data to disk...')
    
    out_path = '../../data/final/output/submissions.json'
    with open(out_path,'w') as f:
        json.dump(output, f)

In [None]:
def parse_comment_files():
    output = dict()
    
    output = parse_repo_files(output)
    output = parse_api_files(output)
    write_output(output)

In [None]:
parse_comment_files(True)