In [2]:
import cProfile
import pstats
import reddit_json
import zst_handling
import os

def folder_get_file_triplets(root_folder, *,
                            suffix_submissions = '_submissions.zst',
                            suffix_comments = '_comments.zst'):
    ''' Generator that yields triplets of file paths for Reddit's .ZST submissions,
    comments and output files.

    root_folder: folder that is listed (non-recursively) for submissions files.
    suffix_submissions: file-name suffix that identifies a submissions file.
    suffix_comments: suffix that replaces suffix_submissions to name the
    matching comments file.

    Yields (submissions_path, comments_path, output_path), where output_path is
    '<root_folder>/out/<stem>_threads.jsonl'. Neither the comments file nor the
    'out' folder is checked for existence here.
    '''
    for file in os.listdir(root_folder):
        # Filter on the configurable suffix, not on a hard-coded literal, so
        # that non-default suffixes are honoured.
        if not file.endswith(suffix_submissions):
            continue
        # Strip the suffix from the end of the file name only; a plain
        # str.replace() on the full path could also rewrite a matching
        # directory component or an earlier occurrence in the name.
        stem = file[:len(file) - len(suffix_submissions)]
        sub_file_path = os.path.join(root_folder, file)
        com_file_path = os.path.join(root_folder, stem + suffix_comments)
        out_file_path = os.path.join(root_folder, 'out', stem + '_threads.jsonl')
        yield sub_file_path, com_file_path, out_file_path

def main(root = '/home/zel/ml-projects/HUMOR/Reddit-data/'):
    ''' Builds a threads JSONL file for every submissions/comments .ZST pair
    found in a folder.

    root: folder that holds the .ZST files. Defaults to the previously
    hard-coded location, so calling main() without arguments is unchanged.
    '''
    for sub_file, com_file, out_file in folder_get_file_triplets(root):
        print(f'{sub_file}\n{com_file}\n{out_file}\n')
        # The output path points into a sub-folder of root; create it up front
        # so the write below does not depend on it already existing.
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        # Create threads from the submissions and comments.
        threads = reddit_json.create_threads(
            zst_handling.read_zst_file(sub_file),
            zst_handling.read_zst_file(com_file))
        # Write JSON lines with the threads.
        reddit_json.save_jsonl(out_file, reddit_json.generate_json(threads))

# Run main() under the cProfile profiler and dump the raw stats to a file.
cProfile.run('main()', 'profiling_results.out')

# Load the dumped profiling results into a pstats.Stats object for analysis.
p = pstats.Stats('profiling_results.out')

# Sort by 'tottime': the time spent inside each function itself, excluding
# calls to sub-functions (i.e. internal time, not cumulative time).
p.sort_stats('tottime')

# Print only the first 10 entries of the sorted statistics.
p.print_stats(10)

/home/zel/ml-projects/HUMOR/Reddit-data/lol_submissions.zst
/home/zel/ml-projects/HUMOR/Reddit-data/lol_comments.zst
/home/zel/ml-projects/HUMOR/Reddit-data/out/lol_threads.jsonl



100%|██████████| 8.99M/8.99M [00:00<00:00, 30.9MB/s]
100%|██████████| 2.30M/2.30M [00:00<00:00, 29.3MB/s]


/home/zel/ml-projects/HUMOR/Reddit-data/FollowThePunchline_submissions.zst
/home/zel/ml-projects/HUMOR/Reddit-data/FollowThePunchline_comments.zst
/home/zel/ml-projects/HUMOR/Reddit-data/out/FollowThePunchline_threads.jsonl



100%|██████████| 252k/252k [00:00<00:00, 16.1MB/s]
100%|██████████| 1.89M/1.89M [00:00<00:00, 15.0MB/s]


/home/zel/ml-projects/HUMOR/Reddit-data/oneliners_submissions.zst
/home/zel/ml-projects/HUMOR/Reddit-data/oneliners_comments.zst
/home/zel/ml-projects/HUMOR/Reddit-data/out/oneliners_threads.jsonl



100%|██████████| 4.81M/4.81M [00:00<00:00, 18.2MB/s]
100%|██████████| 6.68M/6.68M [00:00<00:00, 22.0MB/s]


Fri May  3 18:15:55 2024    profiling_results.out

         977013 function calls (968060 primitive calls) in 1.135 seconds

   Ordered by: internal time
   List reduced from 176 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   196897    0.409    0.000    0.410    0.000 {orjson.loads}
     6379    0.209    0.000    0.209    0.000 {method 'decode' of 'bytes' objects}
        6    0.123    0.020    0.123    0.020 {method 'split' of 'str' objects}
       12    0.095    0.008    0.095    0.008 {method 'read' of 'zstd.ZstdDecompressionReader' objects}
   196903    0.078    0.000    0.990    0.000 /home/zel/ml-projects/HUMOR/Reddit-filter/zst_handling.py:52(read_zst_file)
        3    0.055    0.018    0.514    0.171 /home/zel/ml-projects/HUMOR/Reddit-filter/reddit_json.py:34(load_comments)
        3    0.043    0.014    0.577    0.192 /home/zel/ml-projects/HUMOR/Reddit-filter/reddit_json.py:13(load_submissions)
   196897    0.040    0

<pstats.Stats at 0x7f1973c4efb0>