In [None]:
import json
import os
import gzip
from collections import defaultdict
from tqdm import tqdm
import glob

# ---- Configuration ---------------------------------------------------------
input_dir = "/shared/4/datasets/long-reddit/"
output_dir = "/shared/3/projects/hiatus/idiolect/data/pilot/long-reddit"
output_path = os.path.join(output_dir, "corpus.jsonl")

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# author -> list of kept comment dicts (despite the name, these are the
# documents themselves, not counts).
author_counts = defaultdict(list)
# author -> set of comment bodies already kept; used to drop exact duplicates.
author_docs = defaultdict(set)
# Define the latest year to include in the dataset
latest_year = 2020


def _dump_year(path):
    """Return the year encoded in a dump file name, or None if unreadable.

    The path is expected to end in ``YYYY-MM.gz`` (e.g. ``RC_2007-12.gz``),
    so the year is the four characters at ``path[-10:-6]``.
    """
    try:
        return int(path[-10:-6])
    except ValueError:
        return None


# Find all .gz files in the directory and filter by year.
# glob returns paths in arbitrary order; sorting makes the processing order
# (and therefore which duplicate is kept and the order of the collected
# documents) reproducible between runs.
gz_files = []
for candidate in sorted(glob.glob(os.path.join(input_dir, "*.gz"))):
    candidate_year = _dump_year(candidate)
    if candidate_year is None:
        print(f"Skipping {candidate}: cannot read a year from the file name")
    elif candidate_year <= latest_year:
        gz_files.append(candidate)

# Process each file
malformed_lines = 0   # lines that are not a JSON object
incomplete_docs = 0   # JSON objects without an "author" or a "body" key
for file_path in gz_files:
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in tqdm(file, desc=f"Processing {file_path}"):
            try:
                doc = json.loads(line)
            except json.JSONDecodeError:
                # Keep going on bad input, but remember that it happened.
                malformed_lines += 1
                continue
            if not isinstance(doc, dict):
                malformed_lines += 1
                continue
            if "author" not in doc or "body" not in doc:
                # A record without a body used to raise KeyError here.
                incomplete_docs += 1
                continue
            author, body = doc["author"], doc["body"]
            # Keep only the first occurrence of each body per author.
            if body not in author_docs[author]:
                author_counts[author].append(doc)
                author_docs[author].add(body)

if malformed_lines or incomplete_docs:
    print(f"Skipped {malformed_lines} malformed lines and "
          f"{incomplete_docs} records without author/body")


Processing /shared/4/datasets/long-reddit/RC_2007-12.gz: 3845it [00:00, 17346.95it/s]
Processing /shared/4/datasets/long-reddit/RC_2007-07.gz: 2329it [00:00, 17744.36it/s]
Processing /shared/4/datasets/long-reddit/RC_2008-06.gz: 5546it [00:00, 23702.50it/s]
Processing /shared/4/datasets/long-reddit/RC_2009-07.gz: 18361it [00:00, 22064.25it/s]
Processing /shared/4/datasets/long-reddit/RC_2016-06.gz: 31445it [00:02, 14315.33it/s]

In [5]:
# import json
# import os
# from collections import defaultdict 
# from tqdm import tqdm

# file_path= "/shared/3/projects/hiatus/aggregated_trainset_v2/raw_datasets/bookcorpus/corpus.jsonl"
# output_path = "/shared/3/projects/hiatus/idiolect/data/pilot/bookcorpus/corpus.jsonl"

# os.makedirs(os.path.dirname(output_path), exist_ok = True)

# author_counts = defaultdict(list)
# with open(file_path, 'r') as file:
#     for line in tqdm(file, desc = f"processing {file_path}"):
#         doc = json.loads(line)
#         if len(doc['authorIDs']) == 1 and doc['lengthWords'] >= 150:  
#             author_id = doc['authorIDs'][0]
#             author_counts[author_id].append(doc)
# # print(author_counts)

In [17]:
def print_stats(post_counts):
    """Print summary statistics for a mapping of author -> collection of posts.

    Args:
        post_counts: mapping whose values support ``len()``; each value is
            the collection of posts belonging to the key (the author).

    Prints the number of authors, the total number of posts, the mean number
    of posts per author (0.00 for an empty mapping) and how many authors have
    at least 5 and at least 10 posts. Returns None.
    """
    sizes = [len(author_posts) for author_posts in post_counts.values()]

    total_authors = len(post_counts)
    total_posts = sum(sizes)
    at_least_5 = sum(1 for size in sizes if size >= 5)
    at_least_10 = sum(1 for size in sizes if size >= 10)

    # Guard against division by zero when there are no authors at all.
    if total_authors:
        avg_posts = float(total_posts) / total_authors
    else:
        avg_posts = 0.0

    report = (
        f"Total number of authors: {total_authors}",
        f"Total number of posts: {total_posts}",
        f"Average number of posts per author: {avg_posts:.2f}",
        f"Total number of authors with at least 5 posts : {at_least_5}",
        f"Total number of authors with at least 10 posts: {at_least_10}",
    )
    for report_line in report:
        print(report_line)

# Summarise the per-author document lists collected in `author_counts`.
print_stats(author_counts)    

Total number of authors: 7208158
Total number of posts: 66493806
Average number of posts per author: 9.22
Total number of authors with at least 5 posts : 1754174
Total number of authors with at least 10 posts: 974587


In [15]:
import time 
# NOTE: grab_bot_accounts() is defined in a cell further down this notebook;
# that cell has to be executed before this one.
bots = grab_bot_accounts()
non_bots = list(set(author_counts.keys()) - set(bots))

# Most prolific authors first. `non_bots` comes out of a set, whose iteration
# order is not stable between kernels, so the author name is used as a
# secondary key: without it, authors with equal post counts could land on
# either side of the slice boundaries below from one run to the next.
sorted_author_count = sorted(
    non_bots,
    key=lambda author: (-len(author_counts[author]), str(author)),
)

# Skip the 100 highest-ranked authors and look at the next 1000.
selected_post_counts = [
    len(author_counts[author]) for author in sorted_author_count[100:1100]
]
total = sum(selected_post_counts)

# One count per line, emitted in a single write instead of a thousand
# throttled print() calls, followed by the overall total.
if selected_post_counts:
    print("\n".join(map(str, selected_post_counts)))
print(total)



Known bots: 393
5002
4999
4987
4880
4877
4813
4790
4732
4677
4670
4628
4615
4614
4611
4606
4584
4560
4524
4472
4438
4402
4390
4352
4322
4304
4264
4260
4260
4236
4235
4230
4229
4185
4182
4174
4150
4148
4146
4106
4091
4072
4070
4066
4063
4062
4058
4046
4033
4010
3997
3994
3983
3966
3960
3957
3952
3945
3945
3926
3910
3899
3895
3895
3836
3835
3823
3811
3797
3796
3792
3785
3763
3759
3746
3734
3704
3698
3680
3674
3663
3655
3645
3644
3629
3626
3615
3608
3602
3601
3601
3599
3582
3581
3544
3505
3499
3497
3473
3459
3451
3435
3432
3426
3419
3417
3408
3399
3399
3396
3395
3395
3355
3351
3345
3344
3331
3325
3322
3315
3313
3307
3274
3272
3271
3268
3265
3256
3252
3245
3237
3228
3221
3215
3210
3202
3201
3193
3165
3160
3152
3148
3145
3141
3139
3116
3115
3113
3103
3100
3094
3083
3082
3077
3071
3069
3063
3062
3056
3044
3016
2997
2994
2991
2988
2987
2977
2966
2964
2958
2951
2935
2932
2925
2899
2886
2886
2882
2874
2873
2869
2867
2865
2855
2853
2852
2840
2838
2833
2830
2827
2823
2819
2812
2806
2805
2804
2803

In [16]:
# Sanity check on the author at rank 100: print the number of stored posts
# and the number of distinct post bodies; equal values mean no duplicates.
probe_posts = author_counts[sorted_author_count[100]]
print(len(probe_posts))
docs = {probe_post['body'] for probe_post in probe_posts}
count = len(docs)
print(count)
# print(total)

5002
5002


In [13]:
import json

# Write the documents of the selected authors to `output_path` in JSON Lines
# format: one serialized document per line, authors in ranked order.
with open(output_path, 'w') as out_handle:
    ranked_authors = tqdm(sorted_author_count[100:1100], desc = f"writing author documents to {output_path}")
    for author in ranked_authors:
        # Each stored document is a dict, serialized as one JSON object per line.
        out_handle.writelines(
            json.dumps(document) + '\n' for document in author_counts[author]
        )


writing author documents to /shared/3/projects/hiatus/idiolect/data/pilot/long-reddit/corpus.jsonl: 1


In [9]:
def grab_bot_accounts(fname='/shared/0/projects/prosocial/known-bots.tsv'):
    """Return the list of account names that should be treated as bots.

    Args:
        fname: path to a tab-separated file; the account name is read from
            the second column of every line. Defaults to the shared
            known-bots list.

    Returns:
        A list with the names read from the file, followed by the
        placeholder accounts '[deleted]', 'deleted' and 'AutoModerator'.

    Prints the number of names read from the file (placeholders excluded).
    Lines with fewer than two columns or an empty second column are skipped.
    """
    bots = []

    with open(fname, 'rt', encoding='utf-8') as f:
        for line in f:
            # Drop the line terminator first: when the name is the last
            # column it would otherwise end in '\n' and never match an author.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2 or not fields[1]:
                continue  # blank or malformed line
            bots.append(fields[1])

    print("Known bots: %d" % len(bots))
    bots.extend(['[deleted]', 'deleted', 'AutoModerator'])
    return bots

In [None]:
# Collect the distinct authors present in the written corpus.
# The path must name the JSONL file itself (opening the directory fails), and
# every line has to be parsed before its "author" field can be read.
fname = "/shared/3/projects/hiatus/idiolect/data/pilot/long-reddit/corpus.jsonl"
author_set = set()
with open(fname, 'r') as corpus:
    for line in corpus:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline
        author_set.add(json.loads(line)['author'])