In [None]:
import json
import os
import gzip
from collections import defaultdict
from tqdm import tqdm
import glob

# Directory holding the gzip-compressed JSON-lines dumps, and where the merged corpus is written.
input_dir = "/shared/4/datasets/long-reddit/"
output_dir = "/shared/3/projects/hiatus/idiolect/data/full_pilot/long-reddit"
output_path = os.path.join(output_dir, "corpus.jsonl")

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# author -> list of that author's documents. Despite the name this holds the
# documents themselves, not counts; the name is kept because later cells use it.
author_counts = defaultdict(list)
# author -> set of bodies already kept, used to drop exact per-author duplicates.
author_docs = defaultdict(set)
# Define the latest year to include in the dataset
latest_year = 2020

# Select the .gz files whose name ends in "YYYY-MM.gz" with YYYY <= latest_year.
# glob returns paths in arbitrary order, so sort them: the first occurrence of a
# duplicate body is the one kept, and the order of documents per author follows
# the processing order, so both need a reproducible file order.
gz_files = []
for candidate in sorted(glob.glob(os.path.join(input_dir, "*.gz"))):
    year_text = os.path.basename(candidate)[-10:-6]
    if len(year_text) != 4 or not year_text.isdecimal():
        # Previously int() raised ValueError here and aborted the whole cell.
        print(f"Skipping {candidate}: name does not end in YYYY-MM.gz")
        continue
    if int(year_text) <= latest_year:
        gz_files.append(candidate)

# Lines that could not be used are counted and reported instead of vanishing silently.
malformed_lines = 0   # not valid JSON, or valid JSON that is not an object
incomplete_docs = 0   # JSON objects lacking an "author" or "body" key

# Process each file
for file_path in gz_files:
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in tqdm(file, desc=f"Processing {file_path}"):
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                malformed_lines += 1
                continue
            if not isinstance(doc, dict):
                malformed_lines += 1
                continue
            if "author" not in doc or "body" not in doc:
                # A missing "body" used to raise an uncaught KeyError.
                incomplete_docs += 1
                continue

            author = doc["author"]
            body = doc["body"]
            # Keep only the first document seen with a given body for this author.
            if body not in author_docs[author]:
                author_counts[author].append(doc)
                author_docs[author].add(body)

print(f"Skipped {malformed_lines} malformed lines and {incomplete_docs} documents without author/body")


Processing /shared/4/datasets/long-reddit/RC_2007-12.gz: 3845it [00:00, 22622.05it/s]
Processing /shared/4/datasets/long-reddit/RC_2007-07.gz: 2329it [00:00, 28934.37it/s]
Processing /shared/4/datasets/long-reddit/RC_2008-06.gz: 5546it [00:00, 28574.35it/s]
Processing /shared/4/datasets/long-reddit/RC_2009-07.gz: 18361it [00:00, 24994.87it/s]
Processing /shared/4/datasets/long-reddit/RC_2016-06.gz: 619269it [00:29, 21189.22it/s]
Processing /shared/4/datasets/long-reddit/RC_2013-06.gz: 276482it [00:13, 21119.14it/s]
Processing /shared/4/datasets/long-reddit/RC_2017-06.gz: 684042it [00:34, 19562.28it/s]
Processing /shared/4/datasets/long-reddit/RC_2006-04.gz: 331it [00:00, 15478.00it/s]
Processing /shared/4/datasets/long-reddit/RC_2012-02.gz: 116503it [00:05, 21489.63it/s]

In [None]:
# import json
# import os
# from collections import defaultdict 
# from tqdm import tqdm
# import glob

# input_dir = "/shared/3/projects/hiatus/aggregated_trainset_v2/raw_datasets/ao3"
# output_dir = "/shared/3/projects/hiatus/idiolect/data/full_pilot/ao3"
# output_path = os.path.join(output_dir, "corpus.jsonl")

# # Make sure the output directory exists
# os.makedirs(output_dir, exist_ok=True)

# author_counts = defaultdict(list)

# # Find all .jsonl files in the directory
# jsonl_files = glob.glob(os.path.join(input_dir, "*.jsonl"))

# # Process each file
# for file_path in jsonl_files:
#     with open(file_path, 'r') as file:
#         for line in tqdm(file, desc=f"Processing {file_path}"):
#             try:
#                 doc = json.loads(line)

#                 # Keep documents with exactly one author ID, at least 150 words, and no comma in sourceSpecific['author']
#                 if (len(doc['authorIDs']) == 1 and 
#                      doc['lengthWords'] >= 150 and  
#                      ',' not in doc['sourceSpecific']['author']): 
#                     author_id = doc['authorIDs'][0]
#                     author_counts[author_id].append(doc)
#             except json.JSONDecodeError:
#                 pass  # Ignore the JSON decode error and move on


In [None]:
# import json
# import os
# from collections import defaultdict 
# from tqdm import tqdm

# file_path= "/shared/3/projects/hiatus/aggregated_trainset_v2/raw_datasets/gmane/corpus.jsonl"
# output_path = "/shared/3/projects/hiatus/idiolect/data/full_pilot/gmane/corpus.jsonl"

# os.makedirs(os.path.dirname(output_path), exist_ok = True)

# author_counts = defaultdict(list)
# author_docs = defaultdict(set)
# with open(file_path, 'r') as file:
#     for line in tqdm(file, desc = f"processing {file_path}"):
#         doc = json.loads(line)
#         author_id = doc['authorIDs'][0]
#         if len(doc['authorIDs']) == 1 and doc['lengthWords'] >= 150 and doc["fullText"] not in author_docs[author_id]:  
#             author_docs[author_id].add(doc['fullText'])
#             author_counts[author_id].append(doc)
# # print(author_counts)

In [2]:
def print_stats(post_counts):
    """Print summary statistics for a mapping of author -> collection of posts.

    Reports the number of authors, the total number of posts, the mean number
    of posts per author (0.00 when the mapping is empty), and how many authors
    have at least 5 and at least 10 posts.

    Args:
        post_counts: mapping whose values are sized collections of posts.
    """
    sizes = [len(posts) for posts in post_counts.values()]

    total_authors = len(post_counts)
    total_posts = sum(sizes)
    authors_over_5 = sum(1 for size in sizes if size >= 5)
    authors_over_10 = sum(1 for size in sizes if size >= 10)

    # Guard the division so an empty mapping reports an average of zero.
    if total_authors:
        avg_posts = float(total_posts) / total_authors
    else:
        avg_posts = 0.0

    report = (
        f"Total number of authors: {total_authors}",
        f"Total number of posts: {total_posts}",
        f"Average number of posts per author: {avg_posts:.2f}",
        f"Total number of authors with at least 5 posts : {authors_over_5}",
        f"Total number of authors with at least 10 posts: {authors_over_10}",
    )
    for report_line in report:
        print(report_line)

# Report statistics for author_counts, the author -> documents mapping.
print_stats(author_counts)    

Total number of authors: 1585076
Total number of posts: 6407975
Average number of posts per author: 4.04
Total number of authors with at least 5 posts : 253398
Total number of authors with at least 10 posts: 111452


In [None]:
def grab_bot_accounts(fname='/shared/0/projects/prosocial/known-bots.tsv'):
    """Load the list of account names that should be excluded as bots.

    Reads a tab-separated file and takes the second column of every row as an
    account name. Rows with fewer than two columns (blank or malformed lines)
    or an empty name are skipped instead of raising IndexError. Surrounding
    whitespace, including the line's trailing newline when the name is the
    last column, is stripped so names compare equal to author names.

    Args:
        fname: path to the TSV file of known bots.

    Returns:
        A list with the names read from the file followed by the fixed
        entries '[deleted]', 'deleted' and 'AutoModerator'.
    """
    bots = []

    with open(fname, 'rt') as f:
        # Iterate lazily; there is no need to hold every line in memory.
        for line in f:
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                continue
            name = fields[1].strip()
            if name:
                bots.append(name)

    # Report only the names read from the file, before the fixed extras are added.
    print("Known bots: %d" % len(bots))
    bots.extend(['[deleted]', 'deleted', 'AutoModerator'])
    return bots

In [None]:
# Accounts that are never selected: the names returned by grab_bot_accounts().
bots = grab_bot_accounts()
bot_names = set(bots)

# Filter in author_counts' insertion order rather than through a set difference.
# Iteration order of a set of strings changes between interpreter runs (string
# hash randomisation), which made the order of equally-sized authors, and hence
# who falls on either side of the rank cut-offs below, differ from run to run.
non_bots = [author for author in author_counts if author not in bot_names]

# Most prolific authors first. sorted() is stable (also with reverse=True), so
# authors with the same number of documents keep their insertion order.
sorted_author_count = sorted(non_bots, key=lambda author: len(author_counts[author]), reverse=True)

# Selection parameters.
top_authors_kept = 100   # the most prolific authors are kept unconditionally
resume_rank = 1100       # ranks top_authors_kept .. resume_rank - 1 are left out
min_posts = 10           # minimum documents for authors from resume_rank onward
# NOTE(review): ranks 100-1099 are excluded entirely; confirm this gap is intentional.

pruned_author_count = sorted_author_count[:top_authors_kept] + [
    author
    for author in sorted_author_count[resume_rank:]
    if len(author_counts[author]) >= min_posts
]

# Total number of documents contributed by the selected authors.
total = sum(len(author_counts[author]) for author in pruned_author_count)
print(total)



In [None]:
import json

# The destination, output_path, is defined in the first cell

# Open the file for writing
with open(output_path, 'w') as corpus_file:
    # Walk the selected authors with a progress bar.
    progress = tqdm(pruned_author_count, desc = f"writing author documents to {output_path}")
    for selected_author in progress:
        # Emit one JSON object per line for each of this author's documents;
        # every document is expected to be a JSON-serialisable dictionary.
        corpus_file.writelines(
            json.dumps(record) + '\n' for record in author_counts[selected_author]
        )
