In [2]:
import pandas as pd
from io import StringIO

In [3]:
import datetime

In [4]:
import bz2

In [6]:
# Field order of one record in the raw input file. The free-text comment comes
# last; everything after the fixed fields is treated as comment text.
columns = [
    'video_id',
    'comment_id',
    'likes',
    'code',
    'source_user_id',
    'target_user_id',
    'comment',
]



In [7]:
# Field order of one record in the annotated output file: the raw fields plus
# 'parent_comment_id' and 'depth', with the free-text comment still last.
new_columns = [
    'video_id',
    'comment_id',
    'likes',
    'code',
    'source_user_id',
    'target_user_id',
    'parent_comment_id',
    'depth',
    'comment',
]



In [10]:
def dict_to_string(dictionary, columns):
    """Serialize one record as a single comma-terminated text line.

    Args:
        dictionary: Mapping that holds a value for every name in ``columns``.
        columns: Iterable of keys, in the order their values are written.

    Returns:
        A string made of ``str(value)`` followed by a comma for every column,
        ended by a newline. Note that the last value is also followed by a
        comma, so each line ends with a comma and then the newline.
        Values are neither quoted nor escaped: a value that itself contains a
        comma or a newline will break the row apart when it is read back.

    Raises:
        KeyError: If a name in ``columns`` is missing from ``dictionary``.
    """
    # str.join assembles the line in one pass instead of re-allocating the
    # string on every "+=".
    return ''.join(str(dictionary[name]) + ',' for name in columns) + '\n'

In [11]:
def combine_comment_segments(comment_segments):
    """Concatenate the pieces of a comment into one string.

    Args:
        comment_segments: Iterable of strings, in their original order.

    Returns:
        All segments joined with no separator between them; an empty string
        when there are no segments.

    Raises:
        TypeError: If any segment is not a string.
    """
    return ''.join(comment_segments)

In [19]:
# Annotate every raw comment record with the id of its parent comment and its
# depth in the reply thread, and append the result to './user_convos_2.bz2'.
#
# NOTE(review): the output is opened in append mode ('at'), so running this
# cell again adds a second copy of every record to an existing output file.

# Bind everything the error handler prints *before* entering the try block, so
# that a failure ahead of the loop (e.g. a missing input file) is reported as
# itself instead of being masked by a NameError, and so that stale values left
# over from an earlier run of this cell are never shown.
i = 0
data_dict = {}
# For each user, their latest comment id
latest_comment = {}
# For each user, their latest comment's depth
latest_depth = {}

try:
    print('time start: {0}'.format(datetime.datetime.now()))
    # The input is decoded as UTF-8 below, so write UTF-8 explicitly rather
    # than relying on the platform's default text encoding.
    with bz2.open('./user_convos_2.bz2', 'at', encoding='utf-8') as fout:
        with bz2.BZ2File('./user_comments.csv.bz2', 'r') as fin:
            # i is the 1-based number of the input line being processed; it is
            # what the error handler reports as 'iteration'.
            for i, line in enumerate(fin, start=1):

                # Debugging: uncomment to stop after the first 100 lines
                # if i == 100:
                #     break

                line = line.decode('utf-8')
                line = line.rstrip()
                ls = line.split(",")

                # The comment text is the last field and may itself contain
                # commas, so split(",") breaks it into several pieces. Glue the
                # pieces back together (the commas themselves are dropped).
                n_fixed_fields = len(columns) - 1
                comment = combine_comment_segments(ls[n_fixed_fields:])
                data = ls[:n_fixed_fields]
                data.append(comment)

                data_dict = dict(zip(columns, data))

                code = int(data_dict['code'])
                comment_id = data_dict['comment_id']
                source_user_id = data_dict['source_user_id']

                # Default: the parent cannot be determined. Such records are
                # still written, flagged with parent_comment_id '-1' and
                # depth -1. This covers:
                #   (1) code 1/3 whose target user has no registered comment in
                #       the current thread (e.g. target_id == REMOVED_USER)
                #   (2) code == 2 (target user not existent so far in the
                #       thread), or any code other than 0, 1 and 3
                #   (3) a reply to a user whose only comments so far in the
                #       thread fell under (1) or (2)
                # These comments are not registered in latest_comment /
                # latest_depth, so they never become the parent of a later reply.
                parent_comment_id = '-1'
                depth = -1

                if code == 0:
                    # Root comment: it starts a new thread, so forget every
                    # user seen in the previous thread.
                    parent_comment_id = 'root'
                    depth = 0
                    latest_comment = {source_user_id: comment_id}
                    latest_depth = {source_user_id: depth}

                # User source_id replies to user target_id.
                # We assume that source_id is replying to target_id's *latest*
                # comment in the current thread.
                # code == 3 is a special case where target comment == root comment
                #    (thus, target_id == root comment user)
                elif code == 1 or code == 3:
                    target_user_id = data_dict['target_user_id']
                    if target_user_id in latest_comment:
                        parent_comment_id = latest_comment[target_user_id]
                        depth = latest_depth[target_user_id] + 1
                        latest_comment[source_user_id] = comment_id
                        latest_depth[source_user_id] = depth

                data_dict['parent_comment_id'] = parent_comment_id
                data_dict['depth'] = depth

                fout.write(dict_to_string(data_dict, new_columns))

except BaseException:
    # Dump the loop state to help locate the offending record, then re-raise.
    # BaseException (not just Exception) is caught on purpose, so interrupting
    # this long-running cell also reports how far it got.
    print('iteration: {0}'.format(i))
    print('data_dict: {0}'.format(data_dict))
    print('latest_comment: {0}'.format(latest_comment))
    print('latest_depth: {0}'.format(latest_depth))
    raise

finally:
    print('time end: {0}'.format(datetime.datetime.now()))


time start: 2022-07-05 17:57:31.907371
time end: 2022-07-05 19:02:05.251984


In [20]:
# Runtime of the conversion cell above: ~1 hr (see its printed start/end timestamps)

In [24]:
# Find a suitable example tree: the first thread in the annotated file whose
# deepest comment has a depth strictly greater than `desired_depth`.
# A thread is a root record (code == 0) plus every record up to the next root.
tree = []
desired_depth = 10

with bz2.BZ2File('./user_convos_2.bz2', 'r') as fin:
    # Largest depth seen so far in the thread currently collected in `tree`
    max_depth = 0
    for line in fin:
        line = line.decode('utf-8')
        line = line.rstrip()
        ls = line.split(",")

        # zip() stops at the shorter input, so any extra trailing field in
        # `ls` (each record ends with a comma) is ignored here.
        data = dict(zip(new_columns, ls))

        code = int(data['code'])
        depth = int(data['depth'])

        if depth > max_depth:
            max_depth = depth

        if code == 0:
            # A root record closes the previous thread. Keep that thread if it
            # is deep enough, otherwise start collecting the new one.
            if max_depth > desired_depth:
                break
            tree = []
            max_depth = 0

        tree.append(data)
    else:
        # The file ended without a break, so the thread still held in `tree`
        # was never checked. Fail loudly instead of silently handing a thread
        # that is too shallow (or an empty list) to the cells below.
        if max_depth <= desired_depth:
            raise ValueError(
                'no thread deeper than {0} found in ./user_convos_2.bz2'.format(
                    desired_depth
                )
            )

In [25]:
# Inspect the thread picked by the search above; it was selected because its
# maximum depth is greater than desired_depth.
tree

[{'video_id': 'uu6rQCDn_DE',
  'comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg',
  'likes': '197',
  'code': '0',
  'source_user_id': 'UCfEbMejPZjYoJuHRUCnejAg',
  'target_user_id': 'L',
  'parent_comment_id': 'root',
  'depth': '0',
  'comment': 'Trump Barr and his various enablers are discusting.'},
 {'video_id': 'uu6rQCDn_DE',
  'comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg.9ArTs3ZsaXO9ArY2lYA5US',
  'likes': '6',
  'code': '3',
  'source_user_id': 'UC8B_-1i9J2ujmf6SngxPHOQ',
  'target_user_id': 'UCfEbMejPZjYoJuHRUCnejAg',
  'parent_comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg',
  'depth': '1',
  'comment': 'So is your English. Are you new to this country? Welcome to America!'},
 {'video_id': 'uu6rQCDn_DE',
  'comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg.9ArTs3ZsaXO9ArYa6AyI2S',
  'likes': '29',
  'code': '1',
  'source_user_id': 'UCOrva2qlW7Cd-hnLXe1zM8Q',
  'target_user_id': 'UC8B_-1i9J2ujmf6SngxPHOQ',
  'parent_comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg.9ArTs3ZsaXO9ArY2lYA5US',
  'depth': '2',
  'co

In [26]:
# Number of comment records in the selected thread
len(tree)

36

In [27]:
# source: https://www.geeksforgeeks.org/how-to-get-value-from-address-in-python/
import ctypes
def get_address(obj):
    """Return ``id(obj)``, the integer identity of ``obj``.

    In CPython this integer is the object's memory address, which is what
    lets ``get_obj`` turn it back into the object; other Python
    implementations give no such guarantee.
    """
    return id(obj)
# e.g. get_address(some_list) might return 140650723644544
def get_obj(address):
    """Return the Python object found at the memory address ``address``.

    ``address`` is reinterpreted as a ``ctypes.py_object`` and its value is
    read back; it is expected to be a value previously obtained from ``id()``.

    NOTE(review): nothing here checks that a live object still exists at
    ``address``. Only pass the id of an object that is still referenced
    elsewhere; a stale or arbitrary address can crash the interpreter.
    """
    return ctypes.cast(address, ctypes.py_object).value

In [28]:
import copy
# Work on a deep copy: the cell that nests the thread adds 'children' keys and
# shortens comments in place, and `tree` should keep the original flat records.
tree_copy = copy.deepcopy(tree)

In [29]:
# make the json tree
# Turn the flat, ordered list of thread records into a nested structure: every
# kept node gets a 'children' list and is appended to its parent's list.

# Comments of non-root nodes longer than this are shortened and suffixed '...'
max_comment_chars = 10

tree_copy[0]['children'] = []
tree_json = tree_copy[0]

# comment_id --> the 'children' list of that comment.
# The list object itself is stored (not its id()/memory address), so a child
# can be attached with a plain dictionary lookup, with no ctypes pointer cast.
children_ref = {tree_json['comment_id']: tree_json['children']}

for node in tree_copy[1:]:
    parent_children = children_ref.get(node['parent_comment_id'])
    # Skip nodes whose parent is not part of the tree, e.g. records flagged
    # with parent_comment_id '-1' because their parent could not be determined.
    # Skipped nodes are left untouched and are never registered as parents.
    if parent_children is None:
        continue

    if len(node['comment']) > max_comment_chars:
        node['comment'] = node['comment'][:max_comment_chars] + '...'
    node['children'] = []

    parent_children.append(node)

    # Register this node so that later records can attach to it
    children_ref[node['comment_id']] = node['children']

In [30]:
# Inspect the nested thread: the root record, with replies under 'children'
tree_json

{'video_id': 'uu6rQCDn_DE',
 'comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg',
 'likes': '197',
 'code': '0',
 'source_user_id': 'UCfEbMejPZjYoJuHRUCnejAg',
 'target_user_id': 'L',
 'parent_comment_id': 'root',
 'depth': '0',
 'comment': 'Trump Barr and his various enablers are discusting.',
 'children': [{'video_id': 'uu6rQCDn_DE',
   'comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg.9ArTs3ZsaXO9ArY2lYA5US',
   'likes': '6',
   'code': '3',
   'source_user_id': 'UC8B_-1i9J2ujmf6SngxPHOQ',
   'target_user_id': 'UCfEbMejPZjYoJuHRUCnejAg',
   'parent_comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg',
   'depth': '1',
   'comment': 'So is your...',
   'children': [{'video_id': 'uu6rQCDn_DE',
     'comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg.9ArTs3ZsaXO9ArYa6AyI2S',
     'likes': '29',
     'code': '1',
     'source_user_id': 'UCOrva2qlW7Cd-hnLXe1zM8Q',
     'target_user_id': 'UC8B_-1i9J2ujmf6SngxPHOQ',
     'parent_comment_id': 'UgzxbaDCpzIVRYvjIiB4AaABAg.9ArTs3ZsaXO9ArY2lYA5US',
     'depth': '2',
     'comment'

In [31]:
import json
# Save the nested example thread to disk as JSON.
with open("example_tree_2.json", "w") as json_file:
    json.dump(tree_json, json_file)