In [1]:
import numpy as np
import praw
from psaw import PushshiftAPI
from asterixdb.asterixdb import AsterixConnection
from credentials import CLIENT_ID, CLIENT_SECRET, PASSWORD, USERNAME
from datetime import datetime
import json

In [2]:
con = AsterixConnection(server='http://localhost', port=19002)

In [3]:
corrections = con.query('''
    USE FactMap;

    SELECT m.*
    FROM matchCorrections m;
    ''').results

In [4]:
news = con.query('''
    USE FactMap;
    
    SELECT m.*
    FROM matchNews m;
''').results

In [5]:
reddit = praw.Reddit(
    user_agent="Comment Extraction",
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    username=USERNAME,
    password=PASSWORD
)
api = PushshiftAPI(reddit)

----

Get comments for each matching news or correction post:

In [6]:
post_ids = {n['p']['id']: n['p']['created_utc'] for n in news+corrections}

----

In [13]:
comments = list(api.search_comments(link_id=list(post_ids.keys())[0]))

In [11]:
filepath = '/Users/ageil/Github/FactMap/Data/comments.json'
pids = []

with open(filepath) as fp:
    line = fp.readline()
    cnt = 1
    while line:
        print("Line {}: {}".format(cnt, line.strip()))
        line = fp.readline().split('\n')[0]
        data = json.loads(line)
        pids.append(data)
        cnt += 1

Line 1: {"pid": "741sdn", "api_num_comments": 132, "comments": {"cid": "dnusak0", "author": null, "created_utc": 1507050399.0, "ups": -11, "downs": 0, "body_len": 64, "parent_id": "t3_741sdn", "delta_seconds": 72051}}
Line 2: {"pid": "744o7a", "api_num_comments": 0}
Line 3: {"pid": "76yrqg", "api_num_comments": 1, "comments": {"cid": "dohmte9", "author": "zroxx2", "created_utc": 1508249356.0, "ups": 2, "downs": 0, "body_len": 245, "parent_id": "t3_76yrqg", "delta_seconds": 72022}}
Line 4: {"pid": "76z1xu", "api_num_comments": 0}
Line 5: {"pid": "7710xd", "api_num_comments": 0}
Line 6: {"pid": "77129v", "api_num_comments": 0}
Line 7: {"pid": "7717ow", "api_num_comments": 39, "comments": {"cid": "doi7vre", "author": "RedditYearTwo", "created_utc": 1508271120.0, "ups": 1, "downs": 0, "body_len": 52, "parent_id": "t3_7717ow", "delta_seconds": 72524}}
Line 8: {"pid": "772mbg", "api_num_comments": 0}
Line 9: {"pid": "772mtt", "api_num_comments": 0}
Line 10: {"pid": "7747ok", "api_num_comment

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [14]:
loaded_pids = set(p['pid'] for p in pids)

In [15]:
missing_pids = set(post_ids.keys()) - loaded_pids

In [16]:
missing = {m['p']['id']: m['p']['created_utc'] for m in news+corrections if m['p']['id'] in missing_pids}

In [19]:
post_ids = {n['p']['id']: n['p']['created_utc'] for n in news+corrections}  # post_id: created_utc
failures = dict()

for idx, (pid, created_utc) in enumerate(missing.items()):
    if idx % 100 == 0:
        print('{}/{}'.format(idx, len(missing.keys())))
    try:
        post_datetime = datetime.strptime(created_utc, '%Y-%m-%dT%H:%M:%S.000Z')
        comments = list(api.search_comments(link_id=pid))
        num_comments = len(comments)
        comment_data = {'pid': pid, 'api_num_comments': num_comments}
        for c in comments:
            delta_sec = datetime.fromtimestamp(c.created_utc) - post_datetime
            attributes = dict()
            attributes['cid'] = c.id
            attributes['author'] = c.author.name if c.author else None  # None if author name deleted(?)
            attributes['created_utc'] = c.created_utc
            attributes['ups'] = c.ups
            attributes['downs'] = c.downs
            attributes['body_len'] = len(c.body)
            attributes['parent_id'] = c.parent_id
            attributes['delta_seconds'] = delta_sec.seconds
            comment_data['comments'] = attributes
        with open('/Users/ageil/Github/FactMap/Data/comments_missing.json', 'a') as f:
            json.dump(comment_data, f)
            f.write('\n')
    except:
        failures[pid] = created_utc

0/155




100/155


---

In [None]:
post_ids = {n['p']['id']: n['p']['created_utc'] for n in news+corrections}  # post_id: created_utc
failures = dict()

for idx, (pid, created_utc) in enumerate(post_ids.items()):
    if idx % 100 == 0:
        print('{}/{}'.format(idx, len(post_ids.keys())))
    try:
        post_datetime = datetime.strptime(created_utc, '%Y-%m-%dT%H:%M:%S.000Z')
        comments = list(api.search_comments(link_id=pid))
        num_comments = len(comments)
        comment_data = {'pid': pid, 'api_num_comments': num_comments}
        for c in comments:
            delta_sec = datetime.fromtimestamp(c.created_utc) - post_datetime
            attributes = dict()
            attributes['cid'] = c.id
            attributes['author'] = c.author.name if c.author else None  # None if author name deleted(?)
            attributes['created_utc'] = c.created_utc
            attributes['ups'] = c.ups
            attributes['downs'] = c.downs
            attributes['body_len'] = len(c.body)
            attributes['parent_id'] = c.parent_id
            attributes['delta_seconds'] = delta_sec.seconds
            comment_data['comments'] = attributes
        with open('/Users/ageil/Github/FactMap/Data/comments.json', 'a') as f:
            json.dump(comment_data, f)
            f.write('\n')
    except:
        failures[pid] = created_utc

0/11647


In [None]:
len(failures.keys())