This notebook extracts Reddit comment and post data from `.zst` files retrieved from the Pushshift database.

In [1]:
import sys
import os
import re
import time
import zstandard as zstd
import json

import pandas as pd
import numpy as np


#### Example: reading a `.zst` dump in chunks and parsing one JSON object per line
```
with open("filename.zst", 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(fh) as reader:
        previous_line = ""
        while True:
            chunk = reader.read(65536)
            if not chunk:
                break

            string_data = chunk.decode('utf-8')
            lines = string_data.split("\n")
            for i, line in enumerate(lines[:-1]):
                if i == 0:
                    line = previous_line + line
                object = json.loads(line)
                # do something with the object here
            previous_line = lines[-1]
```

In [6]:
# Extract every r/wallstreetbets submission from the September 2019 dump.
filename = "../../data/tabular/reddit_post_dumps/RS_2019-09.zst"
wsb_posts = []
with open(filename, 'rb') as fh:
    # max_window_size raises the decoder's window limit so frames written with
    # a long window can be read; frames with a smaller window are unaffected.
    # NOTE(review): some Pushshift dumps reportedly need this -- confirm.
    dctx = zstd.ZstdDecompressor(max_window_size=2**31)

    with dctx.stream_reader(fh) as reader:
        # Buffer the unfinished line as *bytes*. Decoding each chunk on its own
        # fails whenever a multi-byte UTF-8 character straddles a chunk
        # boundary; splitting on b"\n" first is safe because the newline byte
        # never occurs inside a multi-byte UTF-8 sequence.
        pending = b""
        while True:
            chunk = reader.read(2**24)
            if chunk:
                lines = (pending + chunk).split(b"\n")
            else:
                # End of stream: flush whatever is still buffered so a final
                # record without a trailing newline is not lost.
                lines = [pending, b""]
            # The last element is the (possibly empty) start of the next line.
            pending = lines.pop()

            for raw_line in lines:
                if not raw_line.strip():
                    continue  # tolerate blank lines
                post = json.loads(raw_line.decode('utf-8'))
                # .get(): skip records that carry no (or a null) 'subreddit'
                # field instead of raising KeyError / AttributeError.
                if (post.get('subreddit') or '').lower() == 'wallstreetbets':
                    wsb_posts.append(post)

            if not chunk:
                break

print('{} posts extracted.'.format(len(wsb_posts)))

5364 posts extracted.


In [22]:
prefix = "../../data/tabular/reddit_post_dumps"
filenames = ["RS_2019-10", "RS_2019-11", "RS_2019-12"]


def _extract_subreddit_posts(zst_path, subreddit='wallstreetbets', chunk_size=2**24):
    """Return the records of a zstd-compressed NDJSON dump that belong to `subreddit`.

    Parameters
    ----------
    zst_path : str
        Path to a ``.zst`` file holding one JSON object per line.
    subreddit : str
        Subreddit name to keep; compared case-insensitively against each
        record's ``'subreddit'`` field.
    chunk_size : int
        Number of decompressed bytes requested per read.

    Returns
    -------
    list of dict
        The parsed JSON objects whose subreddit matches, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    UnicodeDecodeError
        If a line is not valid UTF-8.
    """
    target = subreddit.lower()
    matches = []
    with open(zst_path, 'rb') as fh:
        # max_window_size raises the decoder's window limit so frames written
        # with a long window can be read; smaller-window frames are unaffected.
        # NOTE(review): some Pushshift dumps reportedly need this -- confirm.
        dctx = zstd.ZstdDecompressor(max_window_size=2**31)
        with dctx.stream_reader(fh) as reader:
            # Keep the unfinished line as bytes: decoding whole chunks breaks
            # when a multi-byte UTF-8 character straddles a chunk boundary,
            # whereas b"\n" never occurs inside a multi-byte sequence.
            pending = b""
            while True:
                chunk = reader.read(chunk_size)
                if chunk:
                    lines = (pending + chunk).split(b"\n")
                else:
                    # End of stream: flush the buffer so a last record with no
                    # trailing newline is still processed.
                    lines = [pending, b""]
                # Last element is the (possibly empty) start of the next line.
                pending = lines.pop()

                for raw_line in lines:
                    if not raw_line.strip():
                        continue  # tolerate blank lines
                    post = json.loads(raw_line.decode('utf-8'))
                    # .get(): records without a usable 'subreddit' are skipped
                    # rather than raising KeyError / AttributeError.
                    if (post.get('subreddit') or '').lower() == target:
                        matches.append(post)

                if not chunk:
                    break
    return matches


for filename in filenames:
    wsb_posts = _extract_subreddit_posts('{}/{}.zst'.format(prefix, filename))

    # Persist this month's matches next to the source dump.
    with open('{}/{}.json'.format(prefix, filename), 'w') as json_file:
        json.dump(wsb_posts, json_file)

    print('Extracted {} from {}.zst'.format(len(wsb_posts), filename))

    # Sanity check: matching is case-insensitive, so confirm every kept record
    # uses the exact lowercase spelling.
    if all(p['subreddit'] == 'wallstreetbets' for p in wsb_posts):
        print('No subreddit errors detected.')
    else:
        print('Subreddit contamination detected.')

Extracted 5052 from RS_2019-10.zst
No subreddit errors detected.
Extracted 7604 from RS_2019-11.zst
No subreddit errors detected.
Extracted 6110 from RS_2019-12.zst
No subreddit errors detected.


In [15]:
# Sanity check: True when every record in `test_load` has exactly the
# subreddit name 'wallstreetbets'.
# NOTE(review): `test_load` is not defined in any cell shown in this notebook;
# it presumably came from loading one of the saved .json files -- confirm.
off_target = [p for p in test_load if p['subreddit'] != 'wallstreetbets']
not off_target

True

In [11]:


# Save the extracted posts for September 2019.
# NOTE(review): this writes whatever `wsb_posts` currently holds. The cell that
# loops over RS_2019-10..12 rebinds `wsb_posts`, so this cell must run directly
# after the RS_2019-09 extraction or the file will contain the wrong month.
with open('../../data/tabular/reddit_post_dumps/RS_2019-09.json', 'w') as json_file:
    # json.dump writes to the file and returns None, so its result is not bound
    # to a name (the previous `wsb_2019_09 = ...` only ever held None).
    json.dump(wsb_posts, json_file)

