In [5]:
from __future__ import division, print_function
from bz2 import BZ2File
import ujson

# Peek at the first record of the compressed dump to see what fields it has.
with BZ2File('./RC_2015-01.bz2') as f:
    # Only the first line is read; nothing else is pulled from the archive here.
    line = f.readline()
# Bare last expression so the notebook displays the parsed record.
ujson.loads(line)

{'archived': False,
 'author': 'YoungModern',
 'author_flair_css_class': None,
 'author_flair_text': None,
 'body': 'Most of us have some family members like this. *Most* of my family is like this. ',
 'controversiality': 0,
 'created_utc': '1420070400',
 'distinguished': None,
 'downs': 0,
 'edited': False,
 'gilded': 0,
 'id': 'cnas8zv',
 'link_id': 't3_2qyr1a',
 'name': 't1_cnas8zv',
 'parent_id': 't3_2qyr1a',
 'retrieved_on': 1425124282,
 'score': 14,
 'score_hidden': False,
 'subreddit': 'exmormon',
 'subreddit_id': 't5_2r0gj',
 'ups': 14}

In [6]:
from pandas import Timestamp, NaT, DataFrame
from toolz import dissoc


def to_json(line):
    """Parse one JSON-encoded comment record into a cleaned-up dict.

    Parameters
    ----------
    line : str or bytes
        A single line holding one JSON object.

    Returns
    -------
    dict
        A new dict in which:

        * ``created_utc`` is converted from epoch seconds to a timezone-naive
          ``Timestamp`` expressed in UTC;
        * ``edited`` is a timezone-naive UTC ``Timestamp`` when the record
          carries an edit time, and ``NaT`` otherwise;
        * ``author`` and ``body`` are ``None`` when they hold the
          ``'[deleted]'`` placeholder;
        * the ``id``, ``subreddit_id`` and ``retrieved_on`` keys are removed.

    Raises
    ------
    KeyError
        If ``created_utc``, ``edited``, ``author`` or ``body`` is missing.
    ValueError
        If ``created_utc`` (or a truthy ``edited``) cannot be converted
        with ``int``.
    """
    blob = ujson.loads(line)

    # Convert epoch seconds into Timestamp objects. `Timestamp(..., unit='s')`
    # is used instead of `Timestamp.utcfromtimestamp`, which is deprecated in
    # recent pandas and whose result is naive or tz-aware depending on the
    # pandas version; this form is naive UTC everywhere.
    blob['created_utc'] = Timestamp(int(blob['created_utc']), unit='s')

    edited = blob['edited']
    # `edited` is falsy when the comment was never edited. A literal `True`
    # (edited, but no edit time recorded) must not go through `int()`:
    # int(True) == 1 would yield a bogus 1970-01-01 00:00:01 timestamp.
    if edited and not isinstance(edited, bool):
        blob['edited'] = Timestamp(int(edited), unit='s')
    else:
        blob['edited'] = NaT

    # Convert deleted posts into `None`s (missing text data)
    for field in ('author', 'body'):
        if blob[field] == '[deleted]':
            blob[field] = None

    # Remove 'id', and 'subreddit_id' as they're redundant
    # Remove 'retrieved_on' as it's irrelevant
    return dissoc(blob, 'id', 'subreddit_id', 'retrieved_on')


# Column names (and their order) requested when cleaned records are loaded
# into a DataFrame. Note that 'removal_reason' does not appear in the sample
# record displayed above; presumably only some records carry it.
columns = [
    'archived',
    'author',
    'author_flair_css_class',
    'author_flair_text',
    'body',
    'controversiality',
    'created_utc',
    'distinguished',
    'downs',
    'edited',
    'gilded',
    'link_id',
    'name',
    'parent_id',
    'removal_reason',
    'score',
    'score_hidden',
    'subreddit',
    'ups',
]


def to_df(batch):
    """Build a DataFrame from a batch of JSON-encoded lines.

    Each element of ``batch`` is cleaned with ``to_json``; the resulting
    records are laid out according to the module-level ``columns`` list and
    the frame is returned indexed by ``created_utc``.
    """
    records = map(to_json, batch)
    return DataFrame.from_records(records, columns=columns).set_index('created_utc')