In [79]:
import csv
import itertools as itt
import json
import os
from collections import defaultdict
from datetime import date, datetime, timezone

import pandas as pd
import praw


#input: CSV file listing the submission ids to process
sub_ids_path = '../../data/all_ids.csv'
#output: path prefix of the per-user interaction snapshots ("-<index>.csv" is appended)
user_int_path = '../../data/user_interactions'

In [92]:
#Open question: should only original content be counted?
#TODO (later): consider upvote ratio and time-based statistics

class UserInteractionCollector:
    '''Accumulates per-user activity and interaction statistics from submissions.

    ``users`` maps an author id to a record with the keys:
      no_posts / no_comments     - number of submissions / comments seen
      post_karma / comment_karma - summed ``score`` of those items
      directs                    - per-user counts of reply relationships
      indirects                  - per-user counts of threads shared with that user
      first_date / last_date     - earliest / latest activity date seen (UTC)
      user_id                    - the author id itself
    '''

    def __init__(self, client, include_comments=False):
        '''Creates an empty collector.

        client: object exposing ``submission(id=...)`` (a praw.Reddit instance in this notebook)
        include_comments: when True the comment tree of every submission is parsed as
            well; the default (False) only records the submission author.
        '''
        self.client = client
        self.include_comments = include_comments
        self.users = defaultdict(self._new_user_record)

    @staticmethod
    def _new_user_record():
        '''Returns an empty statistics record for a user that has not been seen yet.'''
        return {
            "no_posts": 0,
            "no_comments": 0,
            "post_karma": 0,
            "comment_karma": 0,
            "directs": defaultdict(int),
            "indirects": defaultdict(int),
            #sentinels: the first observed activity date replaces both of them,
            #whatever that date is (fixed calendar dates would clamp the range)
            "first_date": date.max,
            "last_date": date.min,
            "user_id": ""
        }

    @staticmethod
    def _utc_date(timestamp):
        '''Converts a ``created_utc`` epoch timestamp into a calendar date in UTC.'''
        #date.fromtimestamp() would use the machine's local timezone instead
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).date()

    @staticmethod
    def _has_author_id(item):
        '''Tells whether the item has an author that exposes an ``id``.'''
        #author is missing or has no id when the user is deleted/suspended
        return (item.author is not None) and hasattr(item.author, 'id')

    @staticmethod
    def _touch_dates(user, seen_on):
        '''Widens the user's first/last date range so that it includes ``seen_on``.'''
        if seen_on < user['first_date']:
            user['first_date'] = seen_on
        if seen_on > user['last_date']:
            user['last_date'] = seen_on

    def process_submission(self, id):
        '''Looks up the submission with the given id and records its statistics.

        An AttributeError raised while reading the submission is reported on stdout
        and the submission is skipped; any other exception propagates to the caller.
        '''
        try:
            post = self.client.submission(id=id)
            self.save_submission_details(post)
        except AttributeError:
            print("Submission id {} failed".format(id))

    def save_submission_details(self, post):
        '''Saves given submission user interaction details into the users dictionary'''
        thread_participants = set() #ids of every author seen in this thread
        parents = {} #item id -> author id, used to resolve who a comment replies to

        self._record_submission_author(post, thread_participants, parents)
        if self.include_comments:
            self._record_comments(post, thread_participants, parents)

        #every pair of distinct participants shares one indirect interaction per thread
        for person in thread_participants:
            for other_person in thread_participants - {person}:
                self.users[person]['indirects'][other_person] += 1

    def _record_submission_author(self, post, thread_participants, parents):
        '''Adds the submission itself to its author's statistics.'''
        if not self._has_author_id(post):
            return

        #read everything first so that a failing attribute leaves no half-filled record
        author_id = post.author.id
        score = post.score
        posted_on = self._utc_date(post.created_utc)

        op = self.users[author_id]
        op['user_id'] = author_id
        op['no_posts'] += 1
        op['post_karma'] += score
        self._touch_dates(op, posted_on)

        thread_participants.add(author_id)
        parents[post.id] = author_id

    def _record_comments(self, post, thread_participants, parents):
        '''Adds every comment of the submission to its author's statistics.'''
        post.comments.replace_more(limit=None)
        for comment in post.comments.list():
            if not self._has_author_id(comment):
                continue

            author_id = comment.author.id
            score = comment.score
            commented_on = self._utc_date(comment.created_utc)

            user = self.users[author_id]
            user['user_id'] = author_id
            user['no_comments'] += 1
            user['comment_karma'] += score
            self._touch_dates(user, commented_on)

            thread_participants.add(author_id)
            parents[comment.id] = author_id

            #the first three characters of parent_id are a type prefix, not part of the id
            parent_user_id = parents.get(comment.parent_id[3:])

            #save evidence of direct interaction (parent unknown -> nothing to record)
            #NOTE(review): a reply to one's own item increments the user's own entry twice
            if parent_user_id is not None:
                self.users[author_id]['directs'][parent_user_id] += 1
                self.users[parent_user_id]['directs'][author_id] += 1

In [93]:
MAX_SUBMISSIONS = 1000   #index of the last submission to process
PROGRESS_EVERY = 50      #print a progress line every N submissions
CHECKPOINT_EVERY = 500   #write a CSV snapshot every N submissions


def save_checkpoint(collector, index):
    '''Writes the users collected so far to "<user_int_path>-<index>.csv".'''
    records = list(collector.users.values())
    if not records:
        #an empty frame has no 'directs'/'indirects' columns to serialise
        return
    df = pd.DataFrame(records)
    #the interaction counters are dicts; store them as JSON strings in the CSV
    df['directs'] = df['directs'].apply(json.dumps)
    df['indirects'] = df['indirects'].apply(json.dumps)
    df.to_csv("{}-{}.csv".format(user_int_path, index), index=False)


#credentials come from the environment so they are never saved with the notebook;
#the empty-string fallback matches the placeholders that used to be hard-coded here
client = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    user_agent=os.environ.get("REDDIT_USER_AGENT", ""),
)
collector = UserInteractionCollector(client=client)

last_index = None  #index of the last submission that was processed
last_saved = None  #index at which the last snapshot was written

with open(sub_ids_path, newline='') as csvfile:
    idreader = csv.reader(csvfile, delimiter=',')
    next(idreader, None) #skip the header (tolerates an empty file)

    for i, row in enumerate(idreader):
        #presumably the second column holds the submission id - verify against the CSV
        collector.process_submission(row[1]) #process the submission
        last_index = i
        if i % PROGRESS_EVERY == 0:
            print("Processing submission #{}".format(i))
        if (i > 0) and (i % CHECKPOINT_EVERY == 0):
            save_checkpoint(collector, i)
            last_saved = i
        if i >= MAX_SUBMISSIONS:
            break

#keep the tail of the run when it did not end exactly on a checkpoint
if (last_index is not None) and (last_index != last_saved):
    save_checkpoint(collector, last_index)

Processing submission #0
Processing submission #50
Processing submission #100
Processing submission #150
Processing submission #200
Processing submission #250
Processing submission #300
Processing submission #350
Processing submission #400
Processing submission #450
Processing submission #500
Processing submission #550
Processing submission #600
Processing submission #650
Processing submission #700
Processing submission #750
Processing submission #800
Processing submission #850
Processing submission #900
Processing submission #950
Processing submission #1000


In [89]:
#total number of submissions attributed to the collected users
sum(record['no_posts'] for record in collector.users.values())

15