In [1]:
import os
import pickle
import tiktoken

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

## Loading the data

In [None]:
# Root folder of the dataset; with the empty string every path below is
# relative to the current working directory.
path = ""


def _subs_csv(subreddit):
    """Return the path of `<subreddit>_subs.csv` inside its dated submissions folder."""
    return os.path.join(
        path,
        "submissions",
        f"{subreddit}_2020-07-01_2022-12-31",
        f"{subreddit}_subs.csv",
    )


# One submissions CSV per subreddit (folder names keep the subreddit's own casing).
bayarea_sub_file = _subs_csv("bayarea")
california_sub_file = _subs_csv("California")
collapse_sub_file = _subs_csv("collapse")
news_sub_file = _subs_csv("news")
politics_sub_file = _subs_csv("politics")
askreddit_sub_file = _subs_csv("AskReddit")

In [3]:
# Load the submissions table of every subreddit (same order as the paths above).
(
    bayarea_sub_df,
    california_sub_df,
    collapse_sub_df,
    news_sub_df,
    politics_sub_df,
    askreddit_sub_df,
) = [
    pd.read_csv(csv_file)
    for csv_file in (
        bayarea_sub_file,
        california_sub_file,
        collapse_sub_file,
        news_sub_file,
        politics_sub_file,
        askreddit_sub_file,
    )
]

In [None]:
# Load the ids of the submissions and redditors that the analysis keeps.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left open for the lifetime of the kernel.
# NOTE(security): pickle.load can execute arbitrary code — only load these
# files if they come from a trusted source.
with open(os.path.join(path, "to_include", "final_subs_to_include.pkl"), "rb") as pkl_file:
    subs_to_include = pickle.load(pkl_file)

with open(os.path.join(path, "to_include", "final_users_to_include.pkl"), "rb") as pkl_file:
    redditors_to_include = pickle.load(pkl_file)

In [5]:
# Drop the first three characters of every id so they can be compared with the
# `sub_id` column and the thread folder names
# (presumably this removes the Reddit "t3_" prefix — TODO confirm).
subs_to_include_no_prefix = [
    prefixed_id[3:]
    for prefixed_id in subs_to_include
]

In [6]:
def _keep_included(sub_df):
    """Return the rows of `sub_df` whose `sub_id` is listed in `subs_to_include_no_prefix`."""
    is_included = sub_df["sub_id"].isin(subs_to_include_no_prefix)
    return sub_df[is_included]


# Keep only the submissions to include (redditors are filtered later, while
# counting tokens).
bayarea_sub_df = _keep_included(bayarea_sub_df)
california_sub_df = _keep_included(california_sub_df)
collapse_sub_df = _keep_included(collapse_sub_df)
news_sub_df = _keep_included(news_sub_df)
politics_sub_df = _keep_included(politics_sub_df)
askreddit_sub_df = _keep_included(askreddit_sub_df)

In [7]:
# Number of submissions left per subreddit after filtering
tuple(
    len(sub_df)
    for sub_df in (
        bayarea_sub_df,
        california_sub_df,
        collapse_sub_df,
        news_sub_df,
        politics_sub_df,
        askreddit_sub_df,
    )
)

(181, 148, 2429, 385, 490, 2618)

## Evaluating the tokens needed

In [8]:
# Root folder holding one sub-folder of comment threads per subreddit.
thread_dir = os.path.join(path, "threads")

# os.listdir returns entries in arbitrary order and also lists plain files
# (e.g. OS metadata files). Keep only directories and sort them so the
# processing order is reproducible and stray files cannot break later cells.
subreddit_dirs = sorted(
    entry
    for entry in os.listdir(thread_dir)
    if os.path.isdir(os.path.join(thread_dir, entry))
)

In [9]:
# Tokenizer for the 'gpt-3.5-turbo' model, used below to count how many tokens each text takes
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
# usage: num_tokens = len(encoding.encode(string))

In [10]:
# Running token counters: the grand total and one bucket per subreddit
# (keys are the lower-cased subreddit names).
subs_tokens = 0
subs_tokens_dict = dict.fromkeys(
    ('askreddit', 'bayarea', 'california', 'collapse', 'news', 'politics'),
    0,
)

### Submission tokens

In [11]:
# Count the tokens of every included submission (title + description text).
# Only submissions whose author is in `redditors_to_include` are counted; the
# result is added to `subs_tokens` and to the subreddit's entry in
# `subs_tokens_dict`.
# NOTE: this cell adds to the running counters, so re-running it without
# re-running the counter-initialisation cell double-counts.

# Explicit lookup table (instead of eval() on names built from folder names):
# lower-cased subreddit name -> (submissions CSV path, filtered DataFrame).
submission_sources = {
    "bayarea": (bayarea_sub_file, bayarea_sub_df),
    "california": (california_sub_file, california_sub_df),
    "collapse": (collapse_sub_file, collapse_sub_df),
    "news": (news_sub_file, news_sub_df),
    "politics": (politics_sub_file, politics_sub_df),
    "askreddit": (askreddit_sub_file, askreddit_sub_df),
}

for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()

    if subreddit_name not in submission_sources:
        # e.g. a stray file in the threads folder: there is nothing to count
        print("Skipping unknown entry", subreddit)
        continue

    print("Working on", subreddit_name)

    _sub_file, _df = submission_sources[subreddit_name]
    # the description texts live in a "text" folder next to the submissions CSV
    _path = os.path.join(os.path.dirname(_sub_file), 'text')

    for index, row in _df.iterrows():
        if pd.isnull(row["author_id"]):
            continue

        if row["author_id"] not in redditors_to_include:
            continue

        sub_id = row["sub_id"]

        # pandas parses titles such as "NA" or "null" as NaN; treat a missing
        # title as empty instead of crashing on `.strip()`.
        raw_title = row["sub_title"]
        sub_title = "" if pd.isnull(raw_title) else str(raw_title).strip()

        desc_path = os.path.join(_path, f"{sub_id}.txt")
        # Explicit encoding so the count does not depend on the platform default
        # (assumes the text dumps are UTF-8 — TODO confirm).
        with open(desc_path, "r", encoding="utf-8") as f:
            # lines are joined with a space, exactly as before, so token counts
            # stay comparable with earlier runs
            sub_desc = " ".join(f.readlines())

        # When the description merely repeats the title, count the text once.
        tokens = len(encoding.encode(sub_title))
        if sub_title != sub_desc.strip():
            tokens += len(encoding.encode(sub_desc))

        subs_tokens += tokens
        subs_tokens_dict[subreddit_name] += tokens

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [12]:
# Running totals after the submissions pass: (grand total, per-subreddit breakdown)
subs_tokens, subs_tokens_dict

(1106001,
 {'askreddit': 45365,
  'bayarea': 47367,
  'california': 3490,
  'collapse': 994735,
  'news': 4628,
  'politics': 10416})

In [13]:
# Estimated cost of the tokens counted so far at 0.002 per 1,000 tokens
# (presumably the USD price of gpt-3.5-turbo — TODO confirm against current pricing)
(subs_tokens / 1000) * 0.002

2.212002

### Comment tokens

In [14]:
# Count the tokens of every included comment and add them to the running totals.
# For each subreddit folder under `thread_dir`, every thread whose id is in
# `subs_to_include_no_prefix` is scanned; only comments whose author is in
# `redditors_to_include` are counted.
# NOTE: this cell adds to `subs_tokens` / `subs_tokens_dict`, so re-running it
# without resetting the counters double-counts.
# NOTE(review): if `redditors_to_include` is a list, converting it to a set once
# would speed up the per-comment membership check — confirm its type first.

# Set for O(1) membership tests; the list would be scanned once per thread folder.
included_thread_ids = set(subs_to_include_no_prefix)

for subreddit in subreddit_dirs:
    subreddit_name = subreddit.split("_")[0].lower()
    subreddit_path = os.path.join(thread_dir, subreddit)

    # Skip stray files and folders without a counter instead of failing with
    # NotADirectoryError / KeyError part-way through the run.
    if not os.path.isdir(subreddit_path) or subreddit_name not in subs_tokens_dict:
        print("Skipping unknown entry", subreddit)
        continue

    print("Working on", subreddit_name)

    for thread_file in os.listdir(subreddit_path):
        if thread_file not in included_thread_ids:
            continue

        # each thread folder holds `<thread id>.csv` plus a "text" folder with
        # one `<comment id>.txt` file per comment
        thread_file_path = os.path.join(subreddit_path, thread_file, f"{thread_file}.csv")
        comments_base_path = os.path.join(subreddit_path, thread_file, "text")

        try:
            thread_df = pd.read_csv(thread_file_path)
        except pd.errors.EmptyDataError:
            print("Empty file:", thread_file_path)
            continue

        if 'author_id' not in thread_df:
            print(f"Problems with {thread_file_path}")
            continue

        for index, row in thread_df.iterrows():
            if row["author_id"] not in redditors_to_include:
                continue

            comment_path = os.path.join(comments_base_path, row["comm_id"] + ".txt")

            # Explicit encoding so the count does not depend on the platform
            # default (assumes the text dumps are UTF-8 — TODO confirm).
            with open(comment_path, "r", encoding="utf-8") as f:
                # lines are joined with a space, exactly as before, so token
                # counts stay comparable with earlier runs
                comment = " ".join(f.readlines())

            tokens = len(encoding.encode(comment))

            subs_tokens += tokens
            subs_tokens_dict[subreddit_name] += tokens

Working on askreddit
Working on california
Working on news
Working on collapse
Working on bayarea
Working on politics


In [16]:
# Running totals after adding the comment tokens: (grand total, per-subreddit breakdown)
subs_tokens, subs_tokens_dict

(18097610,
 {'askreddit': 1368107,
  'bayarea': 416564,
  'california': 212651,
  'collapse': 12908209,
  'news': 1783612,
  'politics': 1408467})

In [17]:
# Estimated cost of all counted tokens at 0.002 per 1,000 tokens
# (presumably the USD price of gpt-3.5-turbo — TODO confirm against current pricing)
(subs_tokens / 1000) * 0.002

36.19522

In [18]:
# Estimated cost of all counted tokens at 0.04 per 1,000 tokens
# (presumably the price of a more expensive model — TODO confirm which one).
# NOTE(review): the stored output 726.76612 does not match the total displayed
# above (18097610 / 1000 * 0.04 = 723.9044), so it was produced from a different
# kernel state — restart the kernel and re-run all cells before trusting it.
(subs_tokens / 1000) * 0.04

726.76612