## This notebook prunes the filtered dataset considering only the largest connected component

We build the (undirected) reply-based graph and compute its largest connected component (the giant component):

    - we link two users u and v if v has replied to u;
    - we keep only the submissions that users in the giant component have contributed to (as submitter or as commenter).

In [1]:
import os
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# root folder of the dataset; with the empty string every path below is
# relative to the current working directory
path = ""


def _sub_csv(folder, filename):
    """Return <path>/submissions/<folder>/<filename> for one subreddit dump."""
    return os.path.join(path, "submissions", folder, filename)


# one CSV of submissions per subreddit
bayarea_sub_file = _sub_csv("bayarea_2020-07-01_2022-12-31", "bayarea_subs.csv")
california_sub_file = _sub_csv("California_2020-07-01_2022-12-31", "California_subs.csv")
collapse_sub_file = _sub_csv("collapse_2020-07-01_2022-12-31", "collapse_subs.csv")
news_sub_file = _sub_csv("news_2020-07-01_2022-12-31", "news_subs.csv")
politics_sub_file = _sub_csv("politics_2020-07-01_2022-12-31", "politics_subs.csv")
askreddit_sub_file = _sub_csv("AskReddit_2020-07-01_2022-12-31", "AskReddit_subs.csv")

In [3]:
# load the submission table of every subreddit, in the same order as the
# *_sub_file paths were declared
(
    bayarea_sub_df,
    california_sub_df,
    collapse_sub_df,
    news_sub_df,
    politics_sub_df,
    askreddit_sub_df,
) = [
    pd.read_csv(csv_file)
    for csv_file in (
        bayarea_sub_file,
        california_sub_file,
        collapse_sub_file,
        news_sub_file,
        politics_sub_file,
        askreddit_sub_file,
    )
]

### Build user and subs data dict

In [4]:
# folder holding one sub-directory per subreddit with its conversational threads
thread_dir = os.path.join(path, "threads")
# NOTE(review): os.listdir returns entries in arbitrary order and would also list
# stray files; the cells below assume every entry is named "<subreddit>_<...>"
subreddit_dirs = os.listdir(thread_dir)

In [6]:
# author_id -> {"submissions": set of submission ids the user contributed to}
reddittors_data = {}
# sub_id -> {"reddittors": set of author ids that contributed to the submission}
subs_data = {}

In [None]:
# Populate reddittors_data and subs_data from the submission tables.
#   reddittors_data: author_id -> {"submissions": set of 't3_'-prefixed submission ids}
#   subs_data:       sub_id    -> {"reddittors": set of author ids}

# Explicit lookup table instead of eval(): a directory name returned by
# os.listdir is never executed as code, and an unexpected directory fails
# with a clear KeyError on its name.
sub_dfs_by_subreddit = {
    "bayarea": bayarea_sub_df,
    "california": california_sub_df,
    "collapse": collapse_sub_df,
    "news": news_sub_df,
    "politics": politics_sub_df,
    "askreddit": askreddit_sub_df,
}

for subreddit in subreddit_dirs:
    # directory names look like "<subreddit>_<start>_<end>"
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    _df = sub_dfs_by_subreddit[subreddit_name]

    for index, row in _df.iterrows():
        author_id = row["author_id"]
        # rows without an author_id cannot be attributed to any user
        if pd.isnull(author_id):
            continue

        # submission ids are stored with the 't3_' prefix
        sub_id = 't3_' + row["sub_id"]

        # create the entry on first sight, then record the contribution
        reddittors_data.setdefault(author_id, {"submissions": set()})["submissions"].add(sub_id)
        subs_data.setdefault(sub_id, {"reddittors": set()})["reddittors"].add(author_id)

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [9]:
# number of distinct authors collected from the submission tables
len(reddittors_data)

3877

In [10]:
# Extend both dictionaries with the users found in the conversational threads.
# The "sub_id" column of a thread row is used as-is (no prefix is added here).
for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    subreddit_path = os.path.join(thread_dir, subreddit)

    for thread_file in os.listdir(subreddit_path):
        # every thread lives in <subreddit>/<thread>/<thread>.csv
        thread_file_path = os.path.join(subreddit_path, thread_file, f"{thread_file}.csv")
        try:
            thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        # a thread table without the author column cannot be used
        if 'author_id' not in thread_df:
            print(f"Problems with {thread_file_path}")
            continue

        for row_idx, comment in thread_df.iterrows():
            commenter, thread_sub = comment["author_id"], comment["sub_id"]

            # user -> submissions they took part in
            reddittors_data.setdefault(commenter, {"submissions": set()})["submissions"].add(thread_sub)
            # submission -> users that took part in it
            subs_data.setdefault(thread_sub, {"reddittors": set()})["reddittors"].add(commenter)

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [11]:
# number of distinct users after adding the authors found in the threads
len(reddittors_data)

102115

### Retrieve the submissions whose contributors have all contributed only to that submission

In [24]:
def get_n_contributions(reddittors_data, user_id):
    """Return how many distinct submissions `user_id` has contributed to.

    `reddittors_data` maps a user id to a dict whose "submissions" entry is
    the collection of submission ids for that user. Raises KeyError if the
    user is unknown.
    """
    user_entry = reddittors_data[user_id]
    return len(user_entry["submissions"])

In [None]:
# Collect the threads in which every participant has contributed to that
# single submission only.
# (original note) we have 7 conversations for which the submitter has also
# participated in other submissions
subs_to_ignore_one_time = set()
n_to_remove = 0

for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    subreddit_path = os.path.join(thread_dir, subreddit)

    for thread_file in os.listdir(subreddit_path):
        thread_file_path = os.path.join(subreddit_path, thread_file, f"{thread_file}.csv")
        try:
            thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        if 'author_id' not in thread_df:
            print(f"Problems with {thread_file_path}")
            continue

        # union of all submissions touched by any participant of this thread
        contributed_subs = set()
        for row_idx, comment in thread_df.iterrows():
            sub_id = comment["sub_id"]
            contributed_subs |= reddittors_data[comment["author_id"]]["submissions"]

        # keep going unless the union is exactly this thread's submission
        # (sub_id holds the value of the last row read)
        if len(contributed_subs) != 1 or sub_id not in contributed_subs:
            continue

        n_to_remove += 1
        subs_to_ignore_one_time.add('t3_' + thread_file)

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [95]:
# number of submissions whose participants contributed to nothing else
len(subs_to_ignore_one_time)

93

### Build the interaction network

In [12]:
# undirected graph: nodes are users, edges are reply relations
G = nx.Graph()

In [13]:
# sub_id -> author_id
subs2author = {}

# author_id -> set of sub_ids they submitted
submitters = {}
# author_id -> set of subreddit names they submitted to
submitters_subreddits = {}

In [14]:
# Add every submitter as a node of G, using the submission tables.
# The reddittors_data dict cannot be reused for this because we also need to
# record who submitted which submission (subs2author, submitters) and in which
# subreddits (submitters_subreddits).
# No submission is filtered out here; rows with a missing author_id are only
# counted in deleted_users_cnt.

# Explicit lookup table instead of eval(): a directory name returned by
# os.listdir is never executed as code, and an unexpected directory fails
# with a clear KeyError on its name.
sub_dfs_by_subreddit = {
    "bayarea": bayarea_sub_df,
    "california": california_sub_df,
    "collapse": collapse_sub_df,
    "news": news_sub_df,
    "politics": politics_sub_df,
    "askreddit": askreddit_sub_df,
}

deleted_users_cnt = 0

for subreddit in subreddit_dirs:
    # directory names look like "<subreddit>_<start>_<end>"
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    _df = sub_dfs_by_subreddit[subreddit_name]

    for index, row in _df.iterrows():
        author_id = row["author_id"]
        if pd.isnull(author_id):
            deleted_users_cnt += 1
            continue

        # submission ids are stored with the 't3_' prefix
        sub_id = 't3_' + row["sub_id"]

        # adding an existing node is a no-op
        G.add_node(author_id)
        subs2author[sub_id] = author_id

        submitters.setdefault(author_id, set()).add(sub_id)
        submitters_subreddits.setdefault(author_id, set()).add(subreddit_name)

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [15]:
# number of submission rows skipped because their author_id is missing
deleted_users_cnt

984

In [16]:
# number of distinct users with at least one submission
len(submitters)

3877

In [17]:
# at this stage the graph only contains submitters: every author added to
# `submitters` was also added as a node, so the number of nodes equals
# len(submitters)
G.number_of_nodes()

3877

In [18]:
# quick look at a single entry (the edge-building loop carries a commented-out
# [4:5] slice for restricting the run to this one)
subreddit_dirs[4]

'bayarea_2020-07-01_2022-12-31'

In [21]:
# Add the users found in the conversational threads as nodes and link two
# users u and v (undirected edge) whenever v has replied to u.
for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()
    print("Working on", subreddit_name)

    subreddit_path = os.path.join(thread_dir, subreddit)
    thread_files = os.listdir(subreddit_path)

    for thread_file in thread_files:
        thread_file_path = os.path.join(subreddit_path, thread_file, f"{thread_file}.csv")
        try:
            thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        if 'author_id' not in thread_df:
            print(f"Problems with {thread_file_path}")
            continue

        # comm_id -> author_id for this thread, built once so that resolving a
        # parent comment is a dict lookup instead of a scan of the whole table
        # for every row. setdefault keeps the first occurrence of a comm_id,
        # i.e. the row a "first match" selection would pick.
        comm2author = {}
        for thread_comm_id, comm_author_id in zip(thread_df["comm_id"], thread_df["author_id"]):
            comm2author.setdefault(thread_comm_id, comm_author_id)

        for index, row in thread_df.iterrows():
            author_id = row["author_id"]

            # add the author as a node; if the node already exists, nothing happens
            G.add_node(author_id)

            parent_id = row["parent_id"]

            if parent_id.startswith('t3_'):
                # the parent is a submission: link to its submitter.
                # A submission missing from subs2author has no known author
                # (deleted account), so no edge is added.
                if parent_id not in subs2author:
                    continue
                author_parent_id = subs2author[parent_id]
            else:
                # the parent is a comment: drop the 3-character type prefix
                # to obtain the comment id stored in the "comm_id" column
                comm_id = parent_id[3:]
                # a parent comment missing from this thread table has been
                # deleted, so no edge is added
                if comm_id not in comm2author:
                    continue
                author_parent_id = comm2author[comm_id]

            # if the edge already exists, nothing happens
            G.add_edge(author_id, author_parent_id)

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [22]:
# (number of users, number of reply links) in the full graph
G.number_of_nodes(), G.number_of_edges()

(102115, 214780)

### Now we analyze the CC

In [23]:
# list of connected components, each one a set of node (user) ids
cc = list(nx.connected_components(G))

In [24]:
# component sizes, largest first
cc_sizes = sorted(map(len, cc), reverse=True)

In [25]:
# sort the components in place by size, largest first, so that cc[0] is the
# giant component
cc.sort(key=len, reverse=True)

In [26]:
# number of users in the largest connected component
len(cc[0])

94537

#### Eval subs to include (considering the giant component)

In [36]:
# users of the giant component (cc is sorted by size, largest first)
gc_users = cc[0]

# every submission that at least one giant-component user contributed to
subs_to_include = {
    sub_id
    for redditor in gc_users
    for sub_id in reddittors_data[redditor]['submissions']
}

len(subs_to_include)

148

In [None]:
# output folder; the empty string writes next to the current working directory
data_dir = ''

# persist the submissions and the users retained for the analysis
for pkl_name, payload in (
    ('subs_to_include.pkl', subs_to_include),
    ('users_to_include.pkl', gc_users),
):
    with open(os.path.join(data_dir, pkl_name), 'wb') as f:
        pickle.dump(payload, f)

#### Eval subs to ignore (if we don't consider only the giant component)

In [107]:
# every user outside the giant component (cc[0]) is discarded; in principle
# the submissions they have contributed to could be discarded as well
redditors_to_ignore = {
    redditor
    for component in cc[1:]
    for redditor in component
}

In [108]:
# (all users, users outside the giant component, users inside it)
G.number_of_nodes(), len(redditors_to_ignore), G.number_of_nodes() - len(redditors_to_ignore)

(102116, 7578, 94538)

In [111]:
# candidate submissions to drop: everything an ignored user has contributed to
possible_subs_to_ignore = {
    sub_id
    for redditor in redditors_to_ignore
    for sub_id in reddittors_data[redditor]['submissions']
}


In [112]:
# (candidate submissions to ignore, submission rows with a missing author_id,
#  submissions with a known author, total of the last two)
len(possible_subs_to_ignore), deleted_users_cnt, len(subs2author), len(subs2author) + deleted_users_cnt

(1606, 984, 5409, 6393)

In [113]:
# total submissions = those with a known author + rows with a missing author_id
tot_subs = len(subs2author) + deleted_users_cnt
# submissions left if every candidate were dropped
tot_subs - len(possible_subs_to_ignore)

4787

In [114]:
# submissions touched by at least one user that is NOT ignored,
# plus a count of the users that were skipped
subs_to_consider = set()
to_ignore = 0

for redditor, redditor_info in reddittors_data.items():
    if redditor in redditors_to_ignore:
        to_ignore += 1
    else:
        subs_to_consider.update(redditor_info['submissions'])


In [115]:
# candidate submissions that non-ignored users have also contributed to
z = subs_to_consider.intersection(possible_subs_to_ignore)
len(z)

1492

In [116]:
# it may happen that we add a user to the graph
# but we do not add any edge because they have only replied to deleted users
# submissions to drop: candidates that no non-ignored user has contributed to
subs_to_ignore = possible_subs_to_ignore.difference(subs_to_consider)
len(subs_to_ignore)

114

In [121]:
# total submissions minus the ones in subs_to_ignore
len(subs2author) + deleted_users_cnt - len(subs_to_ignore)

6279

In [119]:
# overlap between the one-time-contributor submissions and subs_to_ignore
z = subs_to_ignore_one_time.intersection(subs_to_ignore)
len(z)

86

In [None]:
# inspect the overlap: submission id, its submitter, and every submission the
# submitter has contributed to
for sub in z:
    submitter = subs2author.get(sub)
    if submitter is None:
        # submissions whose author_id was missing were never added to
        # subs2author; report them instead of failing with a KeyError
        print(sub, None, None)
        continue
    print(sub, submitter, reddittors_data[submitter]['submissions'])

In [120]:
# persist subs_to_ignore and redditors_to_ignore so they need not be recomputed
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory
data_dir = '/home/aleant/coccons/Groot-py/data/california_wildfire_use_case/filtered_subs_and_threads/to_ignore'

for pkl_name, payload in (
    ("subs_to_ignore.pkl", subs_to_ignore),
    ("redditors_to_ignore.pkl", redditors_to_ignore),
):
    with open(os.path.join(data_dir, pkl_name), "wb") as f:
        pickle.dump(payload, f)

In [128]:
# how many ignored users have contributed to more than one submission
cnt = sum(
    1
    for redditor in redditors_to_ignore
    if len(reddittors_data[redditor]['submissions']) > 1
)
cnt

125