# Steps:
* 1) Collect all missing comment IDs in files for 2015 compiled by @dgaff
* 2) Collect all parent IDs that were referenced by comments in 2015, from pushshift.io data dumps on BigQuery
* 3) Intersect the missing comment IDs with the parent IDs referenced on BQ; this gives the list of missing parents ("dangling references")
* 4) Patch step: Using the Reddit API, query for missing parents and measure the amount we can retrieve.

Note: This analysis works ONLY WITH COMMENTS from 2015. The numbers would change if missing referenced submissions were also counted; for the current analysis, only missing referenced comments are considered.

In [7]:
'''
author: ceshwar
date: 16-03-2018
'''

# NOTE: `sys` and `time` are not used by this cell; the imports are kept in
# case other cells rely on them being in the notebook namespace.
import sys
import time
from itertools import chain

import pandas as pd
from numpy import base_repr

filepath = "../../missing-ids/monthly_missing_data_start_to_2017_06/2015/"


def base_10_to_36(num):
    """Return the lowercase base-36 representation of the integer *num*."""
    return base_repr(num, 36).lower()


# One DataFrame per month (January first): the rows of that month's BigQuery
# parent-ID dump whose parent_id is one of the missing comment IDs.
monthly_dangling_references = []

# Yearly totals: rows in the parent-ID dumps, and rows that hit a missing ID.
overall_parent_ids_2015 = 0
overall_missing_parents_2015 = 0

for month in range(1, 13):
    month_str = "{:02d}".format(month)

    # Each row is read as (start_missing_ID, next_known_ID, gap_size); below it
    # is treated as a run of `gap_size` consecutive IDs from start_missing_ID.
    df = pd.read_csv(
        filepath + 'RC_2015-' + month_str + '.csv',
        names=['start_missing_ID', 'next_known_ID', 'gap_size'],
    )

    # Work column-wise instead of with iterrows(): iterrows() builds a Series
    # per row and upcasts mixed rows to a common dtype. The cast is explicitly
    # 64-bit so large IDs are not truncated where the default int is 32-bit.
    gap_starts = df['start_missing_ID'].astype('int64')
    list_missing_IDs = list(chain.from_iterable(
        range(start, start + size)
        for start, size in zip(gap_starts, df['gap_size'])
    ))

    temp = pd.DataFrame()
    temp['name'] = [base_10_to_36(missing_id) for missing_id in list_missing_IDs]

    ###get IDs in reddit parent_ID format; example: <t1_XXXX>
    temp['comment_id'] = ["t1_" + name for name in temp['name']]

    ###load all parent_IDs from BQ
    bigquery_parent_IDs = pd.read_csv('../../missing-ids/pid-2015-' + month_str + '.dms')

    ###intersection between missing IDs and parent IDs (comments only)
    # Every matching row is kept, so a parent referenced by several rows of
    # the dump is counted once per reference.
    hit_list = bigquery_parent_IDs[bigquery_parent_IDs.parent_id.isin(temp['comment_id'])]
    monthly_dangling_references.append(hit_list)
    print("No. of missing parent IDs for month 2015-", month_str ," = ", len(hit_list))

    overall_missing_parents_2015 += len(hit_list)
    overall_parent_ids_2015 += len(bigquery_parent_IDs)

print("Total parents in 2015: ", overall_parent_ids_2015)
print("Total missing parents in 2015: ", overall_missing_parents_2015)

No. of missing parent IDs for month 2015- 01  =  21
No. of missing parent IDs for month 2015- 02  =  132
No. of missing parent IDs for month 2015- 03  =  208
No. of missing parent IDs for month 2015- 04  =  138
No. of missing parent IDs for month 2015- 05  =  150
No. of missing parent IDs for month 2015- 06  =  898
No. of missing parent IDs for month 2015- 07  =  4504
No. of missing parent IDs for month 2015- 08  =  81
No. of missing parent IDs for month 2015- 09  =  24
No. of missing parent IDs for month 2015- 10  =  378
No. of missing parent IDs for month 2015- 11  =  224
No. of missing parent IDs for month 2015- 12  =  335
Total parents in 2015:  331831351
Total missing parents in 2015:  7093


In [8]:
# Re-display the two yearly totals accumulated by the monthly loop.
for label, total in (
    ("Total parents in 2015: ", overall_parent_ids_2015),
    ("Total missing parents in 2015: ", overall_missing_parents_2015),
):
    print(label, total)

Total parents in 2015:  331831351
Total missing parents in 2015:  7093


In [142]:
# Share of parent-ID rows that point at a missing comment ID, as a percentage.
missing_reference_pct = 100*(overall_missing_parents_2015/overall_parent_ids_2015)
print("Percentage of references to missing comment IDs = ", missing_reference_pct, " %")

Percentage of references to missing comment IDs =  0.002137531604118985  %


# Of 331M referenced parent comment IDs in 2015, 7093 are missing parents! 
* This is 0.0021 % of all parent comments in 2015. The percentage would be smaller if we compared it to the actual number of comments in 2015, since that count includes both parents and non-parents (i.e., children or leaf nodes).

In [133]:
import praw
from itertools import islice
                
# Build the PRAW client that the patch step queries. The credential
# variables are expected to be defined already; they are not set in this cell.
credentials = {
    "client_id": client_id,
    "client_secret": client_secret,
    "password": password,
    "user_agent": user_agent,
    "username": username,
}
reddit = praw.Reddit(**credentials)

# Query the Reddit API for these missing parent IDs and calculate how many comments can be retrieved (aka, "the patch step")

In [166]:
import json

# Running totals over all months; read by the summary prints further down.
missing_parents = 0
parents_retrieved = 0

# monthly_dangling_references holds one DataFrame per month, January first,
# so enumerate(..., start=1) yields the calendar month alongside each frame.
for month, hit_list in enumerate(monthly_dangling_references, start=1):
    print("Month = ", month, "/2015; #missing parents = ", len(hit_list))

    # NOTE(review): parent_id values are not de-duplicated, so a parent that is
    # referenced by several rows is counted once per reference in the
    # denominator below — confirm this is the intended measure.
    missing_ids = hit_list['parent_id']
    # fetch missing things by id
    missing_things = list(reddit.info(list(missing_ids)))

    missing_parents += len(missing_ids)
    parents_retrieved += len(missing_things)

    print('retrieved', len(missing_things), '/', len(missing_ids), 'things')

    # Derive the file suffix from the month being processed. The previous code
    # incremented `month` first, so every patch file was labelled one month
    # late (January -> "-02", December -> "-13").
    month_str = "{:02d}".format(month)

    ###write patches to file
    # Skip empty months so that, as before, no file is created for them.
    if missing_things:
        # Open once per month rather than once per record.
        with open("patched-parentIDs-2015-" + month_str + ".txt", 'a+') as outfile:
            for thing in missing_things:
                # Public attributes only, stringified so they serialize.
                record = {
                    key: str(value)
                    for key, value in thing.__dict__.items()
                    if not key.startswith('_')
                }
                # One JSON object per line. The previous code passed an
                # already-encoded JSON string to json.dump, which wrote
                # double-encoded records with no separator between them.
                outfile.write(json.dumps(record) + "\n")

Month =  1 /2015; #missing parents =  21
retrieved 13 / 21 things
Month =  2 /2015; #missing parents =  132
retrieved 117 / 132 things
Month =  3 /2015; #missing parents =  208
retrieved 93 / 208 things
Month =  4 /2015; #missing parents =  138
retrieved 88 / 138 things
Month =  5 /2015; #missing parents =  150
retrieved 85 / 150 things
Month =  6 /2015; #missing parents =  898
retrieved 653 / 898 things
Month =  7 /2015; #missing parents =  4504
retrieved 653 / 4504 things
Month =  8 /2015; #missing parents =  81
retrieved 55 / 81 things
Month =  9 /2015; #missing parents =  24
retrieved 20 / 24 things
Month =  10 /2015; #missing parents =  378
retrieved 359 / 378 things
Month =  11 /2015; #missing parents =  224
retrieved 213 / 224 things
Month =  12 /2015; #missing parents =  335
retrieved 334 / 335 things


In [147]:
# Summarize the patch step: totals and the share of missing parents retrieved.
retrieved_pct = 100*(parents_retrieved/missing_parents)
for label, value in (
    ("Overall missing parents in 2015: ", missing_parents),
    ("Number of parents retrieved by patch step: ", parents_retrieved),
):
    print(label, value)
print("Percentage of missing parents that can be retrieved by the patch step = ", retrieved_pct, " %")

Overall missing parents in 2015:  7093
Number of parents retrieved by patch step:  2683
Percentage of missing parents that can be retrieved by the patch step =  37.82602565910052  %


# Summary: 
* Out of 331M referenced parent comment IDs in 2015, 7093 are missing parents. 
* This is 0.0021 % out of all parent comments in 2015 (would be smaller when compared to total # of all comments in 2015).
* Out of these missing (parent) comments, 2683 could be retrieved by querying the Reddit API. 
* Inclusion of a patch step would help retrieve 37.83% of the above mentioned missing parent comments. 