In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
import os
from pprint import pprint
from collections import defaultdict

In [2]:
DATEFORMAT = "%Y-%m-%d %H:%M"

In [3]:
def convert_from_utc(utc_time):
    """
    Convert from UTC to datetime object
    """
    return datetime.utcfromtimestamp(utc_time)

def convert_to_datetime(inputdt):
    """
    Return datetime object with date for parameter
    """
    return datetime.strftime(inputdt, DATEFORMAT)

def convert_time(utc_time):
    return convert_to_datetime(convert_from_utc(utc_time))

In [4]:
test = 1386623654

In [5]:
test_dt = convert_from_utc(test)

In [6]:
convert_time(test)

'2013-12-09 21:14'

In [7]:
def parse_repo_files(output, body=True):
    # read in all comment files from comment folder
    file_dir = '../../data/final/comments'
    # directory of comments
        
    # extract data from repo files
    print('Getting REPO Data...')
    for sub_file in os.listdir(file_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = file_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            #pprint(file_data['1'])
            
            # for each entry
            for entry in file_data.keys():
                # load comment data into comment variable
                comment = file_data[entry]
                #pprint(comment)
                
                # create dict to hold comment dictionary info
                comment_dict = dict()
                
                #pprint(comment['author'])
                
                # error handle to ensure all are parsed
                try:
                    comment_dict['author_id'] = str(comment['author'])
                except:
                    pass
                try:
                    comment_dict['subreddit'] = filename
                except:
                    pass
                try:
                    comment_dict['time'] = str(convert_time(comment['time']))
                except:
                    pass
                try:
                    comment_dict['link_id'] = str(comment['link_id'])
                except:
                    pass
                try:
                    comment_dict['parent_id'] = str(comment['parent_id'])
                except:
                    pass
                try:
                    comment_dict['score'] = str(int(comment['ups']) - int(comment['downs']))
                except:
                    pass
                if body:
                    try:
                        comment_dict['body'] = str(comment['body'])
                    except:
                        pass
                
                #print(comment_dict)
                #print('\n')
                
                output[comment['id']] = comment_dict
    
    return output

In [8]:
def parse_api_files(output, body=True):
    # read in all comment files from comment folder
    api_dir = '../../data/final/api_data'

    # directory of comments
    
    # extract data from reddit API
    print('Getting API Data...')
    for sub_file in os.listdir(api_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = api_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            #pprint(file_data['1'])
            #break
            # for each entry
            for entry in file_data.keys():

                # load comments list into comment variable
                comments = file_data[entry]['comments']
                
                # iterate through each comment
                for comment in comments:
                    # create dict to hold comment dictionary info
                    comment_dict = dict()

                    # error handle to ensure all are parsed
                    try:
                        if comment[2] == '' or comment[2] == ' ':
                            comment_dict['author_id'] = '[deleted]'
                        else:
                            comment_dict['author_id'] = comment[2]
                    except:
                        pass
                    try:
                        comment_dict['subreddit'] = filename
                    except:
                        pass
                    try:
                        comment_dict['time'] = comment[0][:-3]
                    except:
                        pass
                    try:
                        comment_dict['link_id'] = comment[6]
                    except:
                        pass
                    try:
                        comment_dict['parent_id'] = comment[7]
                    except:
                        pass
                    try:
                        comment_dict['score'] = comment[4]
                    except:
                        pass
                    if body:
                        try:
                            comment_dict['body'] =  comment[3]
                        except:
                            pass
                    output[comment[1]] = comment_dict
    
    return output

In [9]:
def write_output(output, body=True):
    # write data out to disk
    print('writing data to disk...')
    
    if body:
        out_path = '../../data/final/output/comments.json'
    else:
        out_path = '../../data/final/output/comments-no-body.json'
    with open(out_path,'w') as f:
        json.dump(output, f)

In [10]:
def parse_comment_files(body=True):
    output = dict()
    
    output = parse_repo_files(output,body)
    output = parse_api_files(output,body)
    write_output(output,body)

In [11]:
parse_comment_files(True)

Getting REPO Data...
parsing bitcoinbeginners.json...
parsing bitcoincash.json...
parsing bitcoindiscussion.json...
parsing bitcoinmarkets.json...
parsing bitcoinmining.json...
parsing bitcointechnology.json...
parsing bitcointrading.json...
parsing btc.json...
parsing cryptocurrency.json...
parsing cryptomarkets.json...
parsing cryptotrade.json...
parsing ethanalysis.json...
parsing ethdapps.json...
parsing ethdev.json...
parsing ethereum.json...
parsing ethereumcommunity.json...
parsing ethereumnoobies.json...
parsing ethermining.json...
parsing ethinsider.json...
parsing ethinvestor.json...
parsing ethtrader.json...
parsing ethtraderpro.json...
parsing gpumining.json...
parsing bitcoin.json...
Getting API Data...
parsing ethereumcommunity.json...
parsing bitcointrading.json...
parsing btc.json...
parsing cryptocurrency.json...
parsing cryptomarkets.json...
parsing cryptotrade.json...
parsing ethanalysis.json...
parsing ethdapps.json...
parsing ethdev.json...
parsing ethereum.json...

In [12]:
parse_comment_files(False)

Getting REPO Data...
parsing bitcoinbeginners.json...
parsing bitcoincash.json...
parsing bitcoindiscussion.json...
parsing bitcoinmarkets.json...
parsing bitcoinmining.json...
parsing bitcointechnology.json...
parsing bitcointrading.json...
parsing btc.json...
parsing cryptocurrency.json...
parsing cryptomarkets.json...
parsing cryptotrade.json...
parsing ethanalysis.json...
parsing ethdapps.json...
parsing ethdev.json...
parsing ethereum.json...
parsing ethereumcommunity.json...
parsing ethereumnoobies.json...
parsing ethermining.json...
parsing ethinsider.json...
parsing ethinvestor.json...
parsing ethtrader.json...
parsing ethtraderpro.json...
parsing gpumining.json...
parsing bitcoin.json...
Getting API Data...
parsing ethereumcommunity.json...
parsing bitcointrading.json...
parsing btc.json...
parsing cryptocurrency.json...
parsing cryptomarkets.json...
parsing cryptotrade.json...
parsing ethanalysis.json...
parsing ethdapps.json...
parsing ethdev.json...
parsing ethereum.json...

In [None]:
def parse_comment_files_old():
    # read in all comment files from comment folder
    file_dir = '../../data/final/comments'
    api_dir = '../../data/final/api_data'
    out_path = '../../data/final/output'
    # directory of comments
    
    output = dict()
    
    # extract data from repo files
    for sub_file in os.listdir(file_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = file_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            pprint(file_data['1'])
            
            # for each entry
            for entry in file_data.keys():
                # load comment data into comment variable
                comment = file_data[entry]
                # create dict to hold comment dictionary info
                comment_dict = dict()

                # error handle to ensure all are parsed
                try:
                    comment_dict['author_id'] = entry_data['author']
                except:
                    pass
                try:
                    comment_dict['subreddit'] = filename
                except:
                    pass
                try:
                    comment_dict['time'] = convert_time(entry_data['time'])
                except:
                    pass
                try:
                    comment_dict['link_id'] = entry_data['link_id']
                except:
                    pass
                try:
                    comment_dict['parent_id'] = entry_data['parent_id']
                except:
                    pass
                try:
                    comment_dict['score'] = entry_data['ups'] - entry_data['downs']
                except:
                    pass
                try:
                    comment_dict['body'] = entry_data['body']
                except:
                    pass
                
                output[comment['id']] = comment_dict

    # extract data from reddit API
    print('\nGetting API Data...')
    for sub_file in os.listdir(api_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = api_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            #pprint(file_data['1'])
            #break
            # for each entry
            for entry in file_data.keys():

                # load comments list into comment variable
                comments = file_data[entry]['comments']
                i = 0
                # iterate through each comment
                for comment in comments:
                    # create dict to hold comment dictionary info
                    comment_dict = dict()

                    # error handle to ensure all are parsed
                    try:
                        comment_dict['author_id'] = comment[2]
                    except:
                        pass
                    try:
                        comment_dict['subreddit'] = filename
                    except:
                        pass
                    try:
                        comment_dict['time'] = comment[0][:-3]
                    except:
                        pass
                    try:
                        comment_dict['link_id'] = comment[6]
                    except:
                        pass
                    try:
                        comment_dict['parent_id'] = comment[7]
                    except:
                        pass
                    try:
                        comment_dict['score'] = comment[4]
                    except:
                        pass
                    try:
                        comment_dict['body'] =  comment[3]
                    except:
                        pass
                    output[comment[1]] = comment_dict
    
    # write data out to disk
    print('writing data to disk...')
    with open(out_path + '/comments2.json','w') as f:
        json.dump(output, f)

In [8]:
!ls '../../data/final/comments'

bitcoinbeginners.json	btc.json		ethereum.json
bitcoincash.json	cryptocurrency.json	ethereumnoobies.json
bitcoindiscussion.json	cryptomarkets.json	ethermining.json
bitcoin.json		cryptotrade.json	ethinsider.json
bitcoinmarkets.json	ethanalysis.json	ethinvestor.json
bitcoinmining.json	ethdapps.json		ethtrader.json
bitcointechnology.json	ethdev.json		ethtraderpro.json
bitcointrading.json	ethereumcommunity.json	gpumining.json


In [7]:
def parse_comment_files():
    # read in all comment files from comment folder
    file_dir = '../../data/final/comments'
    api_dir = '../../data/final/api_data'
    out_path = '../../data/final/output'
    # directory of comments
    
    output = dict()
    
    # extract data from repo files
    for sub_file in os.listdir(file_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = file_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            #pprint(file_data['1'])
            
            # for each entry
            for entry in file_data.keys():
                # load comment data into comment variable
                comment = file_data[entry]
                # create dict to hold comment dictionary info
                comment_dict = dict()

                # error handle to ensure all are parsed
                try:
                    comment_dict['author_id'] = entry_data['author']
                except:
                    pass
                try:
                    comment_dict['subreddit'] = filename
                except:
                    pass
                try:
                    comment_dict['time'] = convert_time(entry_data['time'])
                except:
                    pass
                try:
                    comment_dict['link_id'] = entry_data['link_id']
                except:
                    pass
                try:
                    comment_dict['parent_id'] = entry_data['parent_id']
                except:
                    pass
                try:
                    comment_dict['score'] = entry_data['ups'] - entry_data['downs']
                except:
                    pass
                try:
                    comment_dict['body'] = entry_data['body']
                except:
                    pass
                
                output[comment['id']] = comment_dict
    
    # write data out to disk
    print('writing data to disk...')
    with open(out_path + '/comments.json','w') as f:
        json.dump(output, f)

In [36]:
def parse_comment_files_test():
    # read in all comment files from comment folder
    file_dir = '../../data/final/comments'
    api_dir = '../../data/final/api_data'
    out_path = '../../data/final/output'
    # directory of comments
    
    output = dict()


    # extract data from reddit API
    print('\nGetting API Data...')
    for sub_file in os.listdir(api_dir):
        filename = sub_file[:-5]
        print('parsing {}...'.format(sub_file))
        
        file_path = api_dir + '/' + sub_file
        
        # open each file and parse
        with open(file_path,'r') as f:
            # load data
            file_data = json.load(f)
            
            # output print example if needed
            #pprint(file_data['1'])
            #break
            # for each entry
            for entry in file_data.keys():

                # load comments list into comment variable
                comments = file_data[entry]['comments']

                # iterate through each comment
                for comment in comments:
                    pprint(comment)
                    # create dict to hold comment dictionary info
                    comment_dict = dict()

                    # error handle to ensure all are parsed
                    try:
                        comment_dict['author_id'] = comment[2]
                    except:
                        pass
                    try:
                        comment_dict['subreddit'] = filename
                    except:
                        pass
                    try:
                        comment_dict['time'] = comment[0][:-3]
                    except:
                        pass
                    try:
                        comment_dict['link_id'] = comment[6]
                    except:
                        pass
                    try:
                        comment_dict['parent_id'] = comment[7]
                    except:
                        pass
                    try:
                        comment_dict['score'] = comment[4]
                    except:
                        pass
                    try:
                        comment_dict['body'] =  comment[3]
                    except:
                        pass

                    output[comment[1]] = comment_dict  
    
   # with open(out_path + '/comments.json','w') as f:
    #    json.dump(output, f)