In [1]:
import os
import pickle

import pandas as pd

### Input data

In [None]:
input_basepath = ''

subs_input_path = os.path.join(input_basepath, 'submissions')
threads_input_path = os.path.join(input_basepath, 'threads')

subs_to_include_path = os.path.join(input_basepath, 'to_include', 'final_subs_to_include.pkl')
users_to_include_path = os.path.join(input_basepath, 'to_include', 'final_users_to_include.pkl')

In [19]:
subs_to_include = pickle.load(open(subs_to_include_path, 'rb'))
users_to_include = pickle.load(open(users_to_include_path, 'rb'))

In [20]:
subs_to_include_no_prefix = [sub[3:] for sub in subs_to_include]
subs_to_include_no_prefix;

### Output data

In [None]:
output_basepath = ''
subs_output_path = os.path.join(output_basepath, 'submissions')
threads_output_path = os.path.join(output_basepath, 'threads')

### Filter submissions

In [23]:
subreddit_dirs = os.listdir(subs_input_path)
subreddit_dirs

['AskReddit_2020-07-01_2022-12-31',
 'California_2020-07-01_2022-12-31',
 'news_2020-07-01_2022-12-31',
 'collapse_2020-07-01_2022-12-31',
 'bayarea_2020-07-01_2022-12-31',
 'politics_2020-07-01_2022-12-31']

In [24]:
len(subs_to_include_no_prefix)

6251

In [25]:
for subreddit_dir in subreddit_dirs:
    subreddit_name = subreddit_dir.split("_")[0]#.lower()
    print("Working on", subreddit_name)

    subreddit_input_path = os.path.join(subs_input_path, subreddit_dir)
    subreddit_output_path = os.path.join(subs_output_path, subreddit_dir)

    if not os.path.exists(subreddit_output_path):
        os.makedirs(subreddit_output_path)

    # load the df containing the submissions' metadata
    _df = pd.read_csv(os.path.join(subreddit_input_path, f'{subreddit_name}_subs.csv'))
    print(len(_df))

    # filter the df to only include the submissions that are in the list of submissions to include
    df = _df[_df['sub_id'].isin(subs_to_include_no_prefix)]
    print(len(df))

    # store the filtered df
    df.to_csv(os.path.join(subreddit_output_path, f'{subreddit_name}_subs.csv'), index=False)

Working on AskReddit
2738
2618
Working on California
149
148
Working on news
391
385
Working on collapse
2430
2429
Working on bayarea
184
181
Working on politics
501
490


### Filter conversational threads

In [None]:
for subreddit_dir in subreddit_dirs:
    subreddit_name = subreddit_dir.split("_")[0]
    print("Working on", subreddit_name)

    subreddit_input_path = os.path.join(threads_input_path, subreddit_dir)
    thread_files = os.listdir(subreddit_input_path)

    subreddit_output_path = os.path.join(threads_output_path, subreddit_dir)

    if not os.path.exists(subreddit_output_path):
        os.makedirs(subreddit_output_path)

    print(len(thread_files))
    cnt = 0

    for thread_file in thread_files:
        if thread_file not in subs_to_include_no_prefix:
            continue

        thread_file_path = os.path.join(subreddit_input_path, thread_file, f"{thread_file}.csv")

        try:
            _thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        if 'author_id' not in _thread_df:
            print(f"Problems with {thread_file_path}")
            continue

        # filter out all authors that are not in the list of users to include
        thread_df = _thread_df[_thread_df['author_id'].isin(users_to_include)]

        if len(thread_df) == 0:
            print(f"Empty thread: {thread_file_path}")
            continue

        # store the filtered df
        thread_df.to_csv(os.path.join(subreddit_output_path, f"{thread_file}.csv"), index=False)
        cnt += 1
    
    print(cnt)

Working on AskReddit
2738
2618
Working on California
149
148
Working on news
391
385
Working on collapse
2430
2429
Working on bayarea
184
181
Working on politics
501
490
