## This notebook identifies and removes all users who contributed only once across all datasets
(only if their single comment is a leaf in the conversation tree and they did not author a submission)

In [10]:
import pickle
import os
import sys

import numpy as np
import pandas as pd

In [None]:
def _load_pickle(file_name):
    """Return the object stored in the pickle file `file_name`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, instead of lingering until garbage collection.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(file_name, 'rb') as handle:
        return pickle.load(handle)


# Per-user lookup tables produced by an earlier step of the pipeline.
reddittors_data = _load_pickle('reddittors_data.pickle')
# NOTE(review): this file name is spelled with a single 't' ("redditors"),
# unlike its siblings -- confirm it matches the file actually on disk.
reddittors_data_ids = _load_pickle('redditors_data_ids.pickle')
submitters_data_ids = _load_pickle('submitters_data_ids.pickle')
reddittors_ids_to_names = _load_pickle('reddittors_ids_to_names.pickle')

In [None]:
path = ""

# Read each inclusion list inside a `with` block so the file handle is closed
# right after loading. Only unpickle trusted files: pickle can run arbitrary code.
with open(os.path.join(path, "to_include", "final_subs_to_include.pkl"), "rb") as handle:
    subs_to_include = pickle.load(handle)

with open(os.path.join(path, "to_include", "users_to_include.pkl"), "rb") as handle:
    redditors_to_include = pickle.load(handle)

with open(os.path.join(path, "to_include", "bots.pkl"), "rb") as handle:
    bots = pickle.load(handle)

### Remove users

In [14]:
# Users to drop: every bot, plus every user that is not in the include list.
# Start from a *copy* of `bots`: binding `to_remove = bots` would alias the
# same set, so the additions below would silently grow `bots`, which later
# cells reuse. Copying also keeps this cell idempotent when it is re-run.
to_remove = set(bots)

to_remove.update(
    user for user in reddittors_data if user not in redditors_to_include
)

In [15]:
# Purge every flagged user from each of the per-user lookup tables.
# `pop(key, None)` is a no-op when the user is absent from a given table.
user_tables = (
    reddittors_data,
    reddittors_data_ids,
    submitters_data_ids,
    reddittors_ids_to_names,
)

for table in user_tables:
    for user in to_remove:
        table.pop(user, None)

In [16]:
# Sizes of the four per-user tables after the purge.
tuple(
    len(table)
    for table in (reddittors_data, reddittors_data_ids, submitters_data_ids, reddittors_ids_to_names)
)

(94536, 94536, 3788, 94537)

### Find one-time posters

In [17]:
# one_timers: users whose action counts, summed over all subreddits, equal 1.
# one_timers_sub: user -> first id stored for the first subreddit in their record.
one_timers = set()
one_timers_sub = {}

for user, actions_by_subreddit in reddittors_data.items():
    total_actions = sum(sum(actions) for actions in actions_by_subreddit.values())

    if total_actions != 1:
        continue

    one_timers.add(user)

    # NOTE(review): this takes the first subreddit key and the first id stored
    # under it. That assumes the single action is recorded there -- confirm
    # against how reddittors_data / reddittors_data_ids are built.
    first_subreddit = next(iter(actions_by_subreddit))
    one_timers_sub[user] = reddittors_data_ids[user][first_subreddit][0]

In [18]:
# Both containers are filled together, so the two sizes should match.
tuple(map(len, (one_timers, one_timers_sub)))

(53772, 53772)

In [19]:
# Distinct ids recorded for the one-timers; only these threads need to be opened.
submissions_to_check = {*one_timers_sub.values()}
len(submissions_to_check)

4950

### Check which one-timers posted a leaf comment

In [None]:
path = ""
thread_dir = os.path.join(path, "threads")

# os.listdir returns entries in arbitrary order and also lists stray files and
# hidden entries (e.g. .DS_Store, .ipynb_checkpoints). Later cells treat every
# entry as a subreddit directory, so keep only visible directories and sort
# them to make the processing order reproducible.
subreddit_dirs = sorted(
    entry
    for entry in os.listdir(thread_dir)
    if not entry.startswith(".") and os.path.isdir(os.path.join(thread_dir, entry))
)

In [21]:
# here we check whether a one-timer has posted a leaf comment
# in that case, we can safely ignore their comment
# (one-timers whose comment received a reply are removed from `one_timers`)

# columns that the leaf check below reads from each thread file
required_columns = {'author_id', 'comm_id', 'parent_id'}

for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()

    print("Working on", subreddit_name)

    subreddit_path = os.path.join(thread_dir, subreddit)
    thread_files = os.listdir(subreddit_path)

    for thread_file in thread_files:
        # if the submission is not in the list of submissions to check, we can skip it
        if thread_file not in submissions_to_check:
            continue

        thread_file_path = os.path.join(subreddit_path, thread_file, f"{thread_file}.csv")
        try:
            thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        # skip malformed files instead of failing later with a KeyError
        if not required_columns.issubset(thread_df.columns):
            print(f"Problems with {thread_file_path}")
            continue

        # ids that appear as a parent of some comment in this thread.
        # Built once per thread (it does not depend on the user); the first
        # three characters of each parent_id (its type prefix) are stripped so
        # the values are comparable with comm_id. Missing parent ids are skipped.
        parent_ids = {p_id[3:] for p_id in thread_df['parent_id'].dropna()}

        all_authors = set(thread_df['author_id'].values)
        one_timers_in_thread = all_authors.intersection(one_timers)

        for user in one_timers_in_thread:
            # the user posted a leaf comment iff no other comment
            # has the user's comment id as its parent_id
            comm_id = thread_df.loc[thread_df['author_id'] == user, 'comm_id'].values[0]

            if comm_id in parent_ids:
                # the user has not posted a leaf comment
                # so we should remove it from the list of one-timers
                one_timers.remove(user)
        

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [23]:
# One-timers left after dropping those whose comment is the parent of another comment.
len(one_timers)

40020

#### Check which one-timers are submitters

In [None]:
path = ""

# Every submissions CSV sits at
# <path>/submissions/<Subreddit>_<date range>/<Subreddit>_subs.csv
submissions_root = os.path.join(path, "submissions")
date_range = "2020-07-01_2022-12-31"

bayarea_sub_file = os.path.join(submissions_root, f"bayarea_{date_range}", "bayarea_subs.csv")
california_sub_file = os.path.join(submissions_root, f"California_{date_range}", "California_subs.csv")
collapse_sub_file = os.path.join(submissions_root, f"collapse_{date_range}", "collapse_subs.csv")
news_sub_file = os.path.join(submissions_root, f"news_{date_range}", "news_subs.csv")
politics_sub_file = os.path.join(submissions_root, f"politics_{date_range}", "politics_subs.csv")
askreddit_sub_file = os.path.join(submissions_root, f"AskReddit_{date_range}", "AskReddit_subs.csv")

In [25]:
# Load the submissions metadata of every subreddit, one DataFrame per CSV.
# Later cells look these frames up by the name "<subreddit>_sub_df".
(
    bayarea_sub_df,
    california_sub_df,
    collapse_sub_df,
    news_sub_df,
    politics_sub_df,
    askreddit_sub_df,
) = map(
    pd.read_csv,
    (
        bayarea_sub_file,
        california_sub_file,
        collapse_sub_file,
        news_sub_file,
        politics_sub_file,
        askreddit_sub_file,
    ),
)

In [None]:
# we should not remove one-timers that have posted a submission

# author_id -> {sub_ids} (each sub_id carries the 't3_' prefix)
submitters = {}

for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    # Fetch the "<subreddit>_sub_df" frame by name from the notebook namespace.
    # A plain dictionary lookup replaces eval(), which would execute whatever
    # text a directory name happens to contain.
    _df = globals()[subreddit_name + "_sub_df"]

    # Rows without an author_id cannot be attributed to anyone, so skip them.
    attributed = _df.dropna(subset=["author_id"])

    # Iterate the two needed columns directly; this avoids the per-row Series
    # construction that DataFrame.iterrows() performs.
    for author_id, raw_sub_id in zip(attributed["author_id"], attributed["sub_id"]):
        submitters.setdefault(author_id, set()).add('t3_' + raw_sub_id)

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [27]:
# Number of distinct non-null author ids found across the submissions tables.
len(submitters)

3877

In [28]:
# One-timers who also authored a submission must be kept, so take them out of
# `one_timers` (in place). `cnt` records how many were taken out.
one_timers_cp = one_timers.copy()

submitting_one_timers = one_timers_cp.intersection(submitters)
cnt = len(submitting_one_timers)

one_timers.difference_update(submitting_one_timers)

In [29]:
# One-timers remaining once those who also authored a submission were excluded.
len(one_timers)

38923

### Save the results

In [None]:
# store one_timers with pickle
path = ""

# The `with` block guarantees the file is flushed and closed once the dump
# finishes; a bare open() would leave that to garbage collection.
with open(os.path.join(path, "one_timers.pkl"), "wb") as handle:
    pickle.dump(one_timers, handle)

### Find the submissions in which none of the retained users (non-bots, non-one-timers) commented

In [None]:
path = ""


def _subs_csv(subreddit):
    """Return the path of the submissions CSV for `subreddit` (case-sensitive folder name)."""
    folder = subreddit + "_2020-07-01_2022-12-31"
    return os.path.join(path, "submissions", folder, subreddit + "_subs.csv")


bayarea_sub_file = _subs_csv("bayarea")
california_sub_file = _subs_csv("California")
collapse_sub_file = _subs_csv("collapse")
news_sub_file = _subs_csv("news")
politics_sub_file = _subs_csv("politics")
askreddit_sub_file = _subs_csv("AskReddit")

In [31]:
# Read the six submissions tables; the order of the file list matches the
# order of the target names on the left-hand side.
_sub_files = [
    bayarea_sub_file,
    california_sub_file,
    collapse_sub_file,
    news_sub_file,
    politics_sub_file,
    askreddit_sub_file,
]

[
    bayarea_sub_df,
    california_sub_df,
    collapse_sub_df,
    news_sub_df,
    politics_sub_df,
    askreddit_sub_df,
] = [pd.read_csv(sub_file) for sub_file in _sub_files]

In [32]:
# Everyone to exclude from now on: bots plus the remaining one-timers.
to_remove = {*bots, *one_timers}

# Keep only the users that are not excluded (the result is a set).
redditors_to_include = set(redditors_to_include).difference(to_remove)

In [33]:
# Users kept after removing bots and the remaining one-timers.
len(redditors_to_include)

55613

In [34]:
# Drop the first three characters (the type prefix) of every included
# submission id so the ids can be compared with the thread directory names.
# Stored as a set because it is only used for membership tests inside the
# per-thread loop; a list would be scanned linearly on every lookup.
_subs_to_include = {sub_id[3:] for sub_id in subs_to_include}

In [35]:
# Collect the included submissions whose thread contains no comment from a
# retained user, together with the submitters of those submissions.
subs_to_ignore = set()            # 't3_'-prefixed ids of the submissions to drop
subs_to_ignore_authors = set()    # author_id of each dropped submission (may be NaN)
subs_to_ignore_cnt = 0

# set copy for O(1) membership tests, whatever container _subs_to_include is
included_thread_ids = set(_subs_to_include)

for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    subreddit_path = os.path.join(thread_dir, subreddit)
    thread_files = os.listdir(subreddit_path)

    # first, we load all submissions' metadata within the subreddit.
    # The frame is fetched by name from the notebook namespace; a dictionary
    # lookup replaces eval(), which would execute text built from a directory name.
    _df = globals()[subreddit_name + "_sub_df"]

    # sub_id -> author_id, built once per subreddit instead of filtering the
    # whole frame for every thread. drop_duplicates keeps the first row of
    # each sub_id, which is the row `.iloc[0]` would have selected.
    first_rows = _df.drop_duplicates(subset="sub_id")
    submitter_by_sub = dict(zip(first_rows["sub_id"], first_rows["author_id"]))

    for thread_file in thread_files:
        if thread_file not in included_thread_ids:
            continue

        # a thread without metadata cannot be attributed to a submitter;
        # report it rather than failing with an IndexError
        if thread_file not in submitter_by_sub:
            print(f"Problems with {thread_file}: no submission metadata found")
            continue

        # we get the submitter_id
        submitter_id = submitter_by_sub[thread_file]

        thread_file_path = os.path.join(subreddit_path, thread_file, f"{thread_file}.csv")
        try:
            thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        if 'author_id' not in thread_df:
            print(f"Problems with {thread_file_path}")
            continue

        # filter out all users not in redditors_to_include
        thread_df = thread_df[thread_df['author_id'].isin(redditors_to_include)]

        # nothing left: no retained user commented in this thread
        if thread_df.empty:
            subs_to_ignore.add('t3_' + thread_file)
            subs_to_ignore_authors.add(submitter_id)
            subs_to_ignore_cnt += 1
        

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [27]:
# Number of ignored submissions and number of distinct submitters behind them.
tuple(map(len, (subs_to_ignore, subs_to_ignore_authors)))

(26, 19)

In [50]:
# subs_to_ignore now holds every included submission in which no retained
# user commented; drop those submissions from the list of submissions to include
cleaned_subs_to_include = list(
    filter(lambda sub_id: sub_id not in subs_to_ignore, subs_to_include)
)
len(cleaned_subs_to_include)

6253

In [None]:
# NOTE(review): persisting subs_to_ignore is currently disabled; uncomment the
# two lines below if a downstream step needs one_timer_subs.pkl.
# path = ""
# pickle.dump(subs_to_ignore, open(os.path.join(path, "one_timer_subs.pkl"), "wb"))

In [76]:
# Submitters of ignored submissions whose whole recorded activity is either a
# single action, or two actions with at least two of their submissions ignored.
one_timer_subs_authors = set()

for author in subs_to_ignore_authors:
    # Authors without an activity record (e.g. a NaN author id) are reported
    # and skipped. This check replaces a broad try/except KeyError that also
    # swallowed unrelated lookup failures.
    if author not in reddittors_data:
        print("KeyError:", author)
        continue

    # total number of actions of this author over all subreddits
    n_cont = sum(
        n_actions
        for actions in reddittors_data[author].values()
        for n_actions in actions
    )

    if n_cont == 1:
        one_timer_subs_authors.add(author)

    elif n_cont == 2:
        # Count how many of the author's submissions are ignored rather than
        # indexing the first two elements of a set: indexing raised IndexError
        # for authors with a single submission and picked two arbitrary
        # elements when there were more than two.
        ignored_subs = submitters.get(author, set()) & subs_to_ignore
        if len(ignored_subs) >= 2:
            print("Both in subs_to_ignore:", author)
            one_timer_subs_authors.add(author)

KeyError: nan


In [78]:
# Number of submitters of ignored submissions that were flagged above.
len(one_timer_subs_authors)

9

In [None]:
# pickle one_timer_subs_authors
path = ""

# Write inside a `with` block so the file is flushed and closed when the dump
# completes, instead of relying on garbage collection of the open() handle.
with open(os.path.join(path, "one_timer_subs_authors.pkl"), "wb") as handle:
    pickle.dump(one_timer_subs_authors, handle)