In [4]:
import json
import os
import gzip
from collections import defaultdict
from tqdm import tqdm
import glob

input_dir = "/shared/4/datasets/long-reddit/"
output_dir = "/shared/3/projects/hiatus/idiolect/data/pilot/long-reddit"
output_path = os.path.join(output_dir, "corpus.jsonl")

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

author_counts = defaultdict(list)

# Define the latest year to include in the dataset
latest_year = 2020

# for file in glob.glob(os.path.join(input_dir, "*.gz")):
#     print(file[-10:-6])
# Find all .gz files in the directory and filter by year
gz_files = [file for file in glob.glob(os.path.join(input_dir, "*.gz")) if int(file[-10:-6]) <= latest_year]

# print(gz_files)
# Process each file
for file_path in gz_files:
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in tqdm(file, desc=f"Processing {file_path}"):
            try:
                doc = json.loads(line)
                if "author" in doc: 
                    author_counts[doc["author"]].append(doc)
            except json.JSONDecodeError:
                pass  # Ignore the JSON decode error and move on


Processing /shared/4/datasets/long-reddit/RC_2007-12.gz: 3845it [00:00, 18546.91it/s]
Processing /shared/4/datasets/long-reddit/RC_2007-07.gz: 2329it [00:00, 19164.74it/s]
Processing /shared/4/datasets/long-reddit/RC_2008-06.gz: 5546it [00:00, 24567.75it/s]
Processing /shared/4/datasets/long-reddit/RC_2009-07.gz: 18361it [00:00, 21533.22it/s]
Processing /shared/4/datasets/long-reddit/RC_2016-06.gz: 619269it [00:43, 14175.30it/s]
Processing /shared/4/datasets/long-reddit/RC_2013-06.gz: 276482it [00:18, 14665.74it/s]
Processing /shared/4/datasets/long-reddit/RC_2017-06.gz: 684042it [00:47, 14351.02it/s]
Processing /shared/4/datasets/long-reddit/RC_2006-04.gz: 331it [00:00, 15521.43it/s]
Processing /shared/4/datasets/long-reddit/RC_2012-02.gz: 150459it [00:10, 15005.59it/s]
Processing /shared/4/datasets/long-reddit/RC_2009-06.gz: 14566it [00:01, 13830.66it/s]
Processing /shared/4/datasets/long-reddit/RC_2013-10.gz: 303669it [00:21, 13821.36it/s]
Processing /shared/4/datasets/long-reddit/R

In [5]:
# import json
# import os
# from collections import defaultdict 
# from tqdm import tqdm

# file_path= "/shared/3/projects/hiatus/aggregated_trainset_v2/raw_datasets/bookcorpus/corpus.jsonl"
# output_path = "/shared/3/projects/hiatus/idiolect/data/pilot/bookcorpus/corpus.jsonl"

# os.makedirs(os.path.dirname(output_path), exist_ok = True)

# author_counts = defaultdict(list)
# with open(file_path, 'r') as file:
#     for line in tqdm(file, desc = f"processing {file_path}"):
#         doc = json.loads(line)
#         if len(doc['authorIDs']) == 1 and doc['lengthWords'] >= 150:  
#             author_id = doc['authorIDs'][0]
#             author_counts[author_id].append(doc)
# # print(author_counts)

In [6]:
def print_stats(post_counts):
    total_posts = 0
    authors_over_5 = 0
    authors_over_10 = 0

    for author, posts in post_counts.items():
        total_posts += len(posts)
        
        if len(posts) >= 5:
            authors_over_5 += 1
        if len(posts) >= 10:
            authors_over_10 += 1

    total_authors = len(post_counts)
    avg_posts = float(total_posts) / total_authors if total_authors else 0.0

    print(f"Total number of authors: {total_authors}")
    print(f"Total number of posts: {total_posts}")
    print(f"Average number of posts per author: {avg_posts:.2f}")
    print(f"Total number of authors with at least 5 posts : {authors_over_5}")
    print(f"Total number of authors with at least 10 posts: {authors_over_10}")

print_stats(author_counts)    

Total number of authors: 7208158
Total number of posts: 70893122
Average number of posts per author: 9.84
Total number of authors with at least 5 posts : 1770314
Total number of authors with at least 10 posts: 984685


In [None]:
import time 
bots = grab_bot_accounts()
non_bots = list(set(author_counts.keys()) - set(bots))
sorted_author_count = sorted(non_bots, key=lambda author: len(author_counts[author]), reverse=True)
total = 0
for author in sorted_author_count[100:1100]:
    time.sleep(0.01)
    total += len(author_counts[author])
#     print(author_counts[author][0])
#     print(len(author_counts[author]))
print(total)



In [19]:
# print(len(author_counts[sorted_author_count[100]]))
# docs = set()
# count = 0
# for post in author_counts[sorted_author_count[100]]:
#     if post['body'] not in docs:
#         count += 1
#         docs.add(post['body'])
# print(count)
print(total)

2452319


In [None]:
import json

# Path to your output .jsonl file

# Open the file for writing
with open(output_path, 'w') as file:
    # Iterate over all sorted authors
    for author in tqdm(sorted_author_count[100:1100], desc = f"writing author documents to {output_path}"):
        # Iterate over each document in the list associated with the author
        for document in author_counts[author]:
            # Assuming each document is already a dictionary that can be serialized to JSON
            json_string = json.dumps(document)
            file.write(json_string + '\n')


In [8]:
def grab_bot_accounts():
    fname = '/shared/0/projects/prosocial/known-bots.tsv'
    bots = []

    with open(fname, 'rt') as f:
        lines = f.readlines()

        for line in lines:
            bots.append(line.split('\t')[1])

    print("Known bots: %d" % len(bots))
    bots.append('[deleted]')
    bots.append('deleted')
    bots.append('AutoModerator')
    return bots

Known bots: 393
