# Reddit Comment Database

In [1]:
import os
import glob
import json
import sqlite3
import sqlalchemy
import numpy as np
import pandas as pd
from time import time
from os import listdir
from os.path import isfile, join
from sqlalchemy import create_engine

pd.set_option('display.max_columns', None)

In [2]:
dtype_dict = {
    "author": str,
    "author_fullname": str,
    "awarders": str,
    "body": str,
    "id": str,
    "link_id": str,
    "subreddit": str,
    "subreddit_id": str,
    "subreddit_type": str,
}



def count_objects_in_file(filepath):
    """Count how many comments are in a .json file"""
    idx = 0
    with open(filepath) as fp:
        for line in fp:
            idx+=1
    return idx


def extract_subset(filepath, start=0, end=10):
    """Extract a subset of raw comment data directly from .json file"""
    comments = []
    with open(filepath) as fp:
        for idx, line in enumerate(fp):
            if (idx >= start) and (idx < end):
                comment = json.loads(line)
                comments.append(comment)
            elif idx >= end:
                break                
        return comments
        
    
def write_to_database(db_conn, json_fp, chunk_size):
    """Write the contents of temporary .json file to SQLite database"""
    batch_no=1
    for chunk in pd.read_json(json_fp, chunksize=chunk_size, lines=True):
        try: 
            chunk.to_sql('reddit_comments', db_conn, if_exists='append')
        except sqlalchemy.exc.SQLAlchemyError as e: 
            print("\n  {}".format(e.orig))
        batch_no+=1
        
        
def drop_additional_columns(df):
    """Drops specific columns from the dataframe if they exist. This is necessay because some of the 
       comment archives contain additional columns."""
    if "author_cakeday" in df.columns:
        df.drop(columns="author_cakeday", inplace=True)
    if "comment_type" in df.columns:
        df.drop(columns="comment_type", inplace=True)
    if "media_metadata" in df.columns:
        df.drop(columns="media_metadata", inplace=True)
    if "editable" in df.columns:
        df.drop(columns="editable", inplace=True)
    return df
    
        
        
def create_database(database, json_fp, comments_per_chunk, chunk_size, columns_to_drop):
    """
    args:
        database            - sqlite databse object
        json_fp             - filepath to .json raw comments file
        comments_per_chunk  - number of comments to store in temporary .json files 
        chunk_size          - size of chunks for the pd.read_json() function
        columns_to_drop     - columns to drop 
    """
    print("\n######## File: {}".format(json_fp))
    n_comments_total = count_objects_in_file(filepath=json_fp)
    n_chunks = (n_comments_total // comments_per_chunk) + 1
    print("Contains {} comments - dividing into {} chunks".format(n_comments_total, n_chunks))
    
    for idx in range(0, n_chunks):
        start = int(idx * comments_per_chunk)
        end = int(start + comments_per_chunk)
        print("\r   File chunk {}: Extracting and writing comments {} - {}".format(idx+1, start, end), end='')
        comments = extract_subset(filepath=json_fp, start=start, end=end)
        df = pd.DataFrame(comments)
        df = df.drop(columns=columns_to_drop)
        df = drop_additional_columns(df=df)
        df = df.astype(dtype_dict)
        # Load the chunk into temporary .json file
        df.to_json("data/db_chunk.json", orient='records', lines=True)
        write_to_database(
            db_conn=database, 
            json_fp="data/db_chunk.json", 
            chunk_size=chunk_size
        )
        
        
def run_database_builder(input_path, db_path, drop_cols, comments_per_chunk=500000, chunk_size=100000):
    """
    args:
        input_path          - filepath to .json raw comments file
        db_path             - filepath to database file
        drop_cols           - columns to drop 
        comments_per_chunk  - number of comments to store in temporary .json files 
        chunk_size          - size of chunks for the pd.read_json() function
    """
    
    conn = create_engine('sqlite:///'+db_path)

    create_database(
        database=conn, 
        json_fp=input_path,
        comments_per_chunk=comments_per_chunk, 
        chunk_size=chunk_size,
        columns_to_drop=drop_cols
    )

In [3]:
DATA_ROOT = "/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/*/*.json"
DATA_DEST = "data/RC_2020_database.db"

In [4]:
data_files = sorted(glob.glob(DATA_ROOT))
print(data_files[43])
# n_comments = count_objects_in_file(filepath=data_files[0])
# print(n_comments)

/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-15.json


Specify some irrelevant columns to drop.

In [5]:
drop_cols = ["all_awardings", 
             "associated_award", 
             "author_flair_css_class", 
             "author_flair_richtext",
             "author_flair_background_color",
             "author_flair_text_color",
             "author_flair_type",
             "author_patreon_flair",
             "author_flair_template_id",
             "author_premium",
             "can_gild", 
             "collapsed",
             "collapsed_because_crowd_control",
             "collapsed_reason",
             "gildings",
             "permalink", 
             "subreddit_name_prefixed",
             "treatment_tags"]

Sanity check: Extract some comments from a file

In [6]:
comments = extract_subset(filepath=data_files[43], start=0, end=100)
df = pd.DataFrame(comments)
df = df.drop(columns=drop_cols)
df = drop_additional_columns(df=df)
df = df.astype(dtype_dict)
print(df.columns.values.tolist())

['author', 'author_created_utc', 'author_flair_text', 'author_fullname', 'awarders', 'body', 'can_mod_post', 'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded', 'id', 'is_submitter', 'link_id', 'locked', 'no_follow', 'parent_id', 'quarantined', 'removal_reason', 'retrieved_on', 'score', 'send_replies', 'stickied', 'subreddit', 'subreddit_id', 'subreddit_type', 'top_awarded_type', 'total_awards_received']


In [7]:
df.head()

Unnamed: 0,author,author_created_utc,author_flair_text,author_fullname,awarders,body,can_mod_post,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,locked,no_follow,parent_id,quarantined,removal_reason,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,subreddit_type,top_awarded_type,total_awards_received
0,sadosmurf,1394162000.0,United States,t2_flarf,[],/r/signupsforpay has multiple offers to get pa...,False,0,1584230400,,False,0,fkiwnmw,False,t3_fimbpe,False,True,t3_fimbpe,False,,1591810444,1,True,False,SwagBucks,t5_2qw4c,public,,0
1,ChemicalAssistance,,,t2_4ho1ljfk,[],Is Gregory awake yet?,False,0,1584230400,,False,0,fkiwnmx,False,t3_fins0l,False,True,t3_fins0l,False,,1591810444,2,True,False,ufc,t5_2qsev,public,,0
2,88Trumans,,,t2_zlan5xu,[],"Actually, I have to be REMINDED to eat. Does ...",False,0,1584230400,,False,0,fkiwnmy,False,t3_filbbo,False,True,t3_filbbo,False,,1591810444,2,True,False,My600lbLife,t5_38ycw,public,,0
3,[deleted],,,,[],oh my god that looks amazing!,False,0,1584230400,,False,0,fkiwnmz,False,t3_fira32,False,False,t3_fira32,False,,1591810444,8,True,False,twentyonepilots,t5_2u0fp,public,,0
4,nice-scores,,,t2_5rj1cdoq,[],𝓷𝓲𝓬𝓮 ☜(ﾟヮﾟ☜)\n#Nice Leaderboard\n**1.** `u/Gil...,False,0,1584230400,,False,0,fkiwnn0,False,t3_fimjgn,False,True,t1_fkiw7y6,False,,1591810444,1,True,False,Metroid,t5_2rrd6,public,,0


In [8]:
data_files[43:]

['/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-15.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-16.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-17.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-18.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-19.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-20.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-21.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-22.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-23.json',
 '/media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-24.json',
 '/media/c

last read: RC_2020-03-14.json

In [9]:
for idx, data_file in enumerate(data_files[43:]):
    t_start = time()
    run_database_builder(
        input_path=data_file, 
        db_path=DATA_DEST, 
        drop_cols=drop_cols,
    )
    t_end = time()
    t_iter = (t_end-t_start)/60
    print("\nFinished. Time: {:.1f} min".format(t_iter))


######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-15.json
Contains 4963077 comments - dividing into 10 chunks
   File chunk 10: Extracting and writing comments 4500000 - 5000000
Finished. Time: 15.8 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-16.json
Contains 5393741 comments - dividing into 11 chunks
   File chunk 11: Extracting and writing comments 5000000 - 5500000
Finished. Time: 22.5 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-17.json
Contains 5333263 comments - dividing into 11 chunks
   File chunk 11: Extracting and writing comments 5000000 - 5500000
Finished. Time: 19.9 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/mar_2020/RC_2020-03-18.json
Contains 5518891 comments - dividing into 12 chunks
   File chunk 12: Extracting and writing comments 5500000 - 6000000
F

Contains 5864991 comments - dividing into 12 chunks
   File chunk 12: Extracting and writing comments 5500000 - 6000000
Finished. Time: 23.5 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/may_2020/RC_2020-05-17.json
Contains 5775570 comments - dividing into 12 chunks
   File chunk 12: Extracting and writing comments 5500000 - 6000000
Finished. Time: 21.0 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/may_2020/RC_2020-05-18.json
Contains 6293784 comments - dividing into 13 chunks
   File chunk 13: Extracting and writing comments 6000000 - 6500000
Finished. Time: 23.2 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/may_2020/RC_2020-05-19.json
Contains 6372347 comments - dividing into 13 chunks
   File chunk 13: Extracting and writing comments 6000000 - 6500000
Finished. Time: 22.5 min

######## File: /media/cameron/Seagate Basic/datasets/reddit/archives_decompressed/may_

In [10]:
# t_start = time()
# run_database_builder(
#     input_path=data_files[101], 
#     db_path=DATA_DEST, 
#     drop_cols=drop_cols,
# )
# t_end = time()
# t_iter = (t_end-t_start)/60
# print("Finished. Time: {:.1f} min".format(t_iter))

In [11]:
DB_NAME = DATA_DEST
conn = sqlite3.connect(DB_NAME)

In [12]:
df = pd.read_sql_query("""SELECT * FROM reddit_comments LIMIT 100""", conn)
df = df.drop(columns=["index"])
df.head()

Unnamed: 0,author,author_created_utc,author_flair_text,author_fullname,awarders,body,can_mod_post,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,locked,no_follow,parent_id,quarantined,removal_reason,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,subreddit_type,top_awarded_type,total_awards_received,author_cakeday
0,[deleted],,,,[],[removed],0,0,1585699200,,0,0,fm2kad9,0,t3_fsjjok,0,1,t1_fm2jwbi,0,,1592786643,2,1,0,FragileWhiteRedditor,t5_mcrlm,public,,0,
1,aegisbur,1441387000.0,,t2_q5akk,[],like Yu-Gi-Oh or magic,0,0,1585699200,,0,0,fm2kada,0,t3_fsnxj5,0,1,t3_fsnxj5,0,,1592786643,1,1,0,lgbt,t5_2qhh7,public,,0,
2,[deleted],,,,[],[removed],0,0,1585699200,,0,0,fm2kadb,0,t3_fso8p5,0,1,t3_fso8p5,0,,1592786643,1,1,0,wallstreetbets,t5_2th52,public,,0,
3,DanielAdams6969,1411679000.0,,t2_ikmp4,[],wait what they kicked her out?!,0,0,1585699200,,0,0,fm2kadc,0,t3_fsbcrt,0,0,t1_fm2dmta,0,,1592786643,4,1,0,youtubehaiku,t5_2tqlz,public,,0,
4,Hokirob,1541870000.0,,t2_2kp55zr0,[],I’m a 1.865S EB guy myself. I know some guys ...,0,0,1585699200,,0,0,fm2kadd,0,t3_fsnxga,0,1,t3_fsnxga,0,,1592786643,1,1,0,EggsIncCoOp,t5_hc3ix,public,,0,
