# In [1]:
#pull in the libraries we need
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string

#extract_word_frequencies reads the token counts from a bz2-compressed json, normalises each token (lower-case, digits and punctuation removed) and sums the counts per word
#built once at import time so the per-token loop does no repeated setup work
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _normalize_token(token):
    """Return *token* lower-cased, without digits or ASCII punctuation, and stripped.

    The result can be an empty string (for example for "1850" or "--").
    """
    token = _DIGITS_RE.sub('', token.lower())
    return token.translate(_PUNCTUATION_TABLE).strip()


def _count_tokens(data):
    """Sum the token counts found in one parsed JSON document.

    Reads data['features']['pages'][i]['body']['tokenPosCount'], where each
    token maps to a dict of {tag: count}. Integer counts are summed over all
    tags and all pages, keyed by the normalised token.

    Returns:
        A defaultdict(int) of word -> total count (empty if nothing matched).

    Raises:
        ValueError: if 'features' is not a dict, or 'pages' is missing or
            not a list.
    """
    features = data.get('features', {})
    if not isinstance(features, dict):
        raise ValueError("Invalid format for 'features' field.")

    pages = features.get('pages')
    if not isinstance(pages, list):
        raise ValueError("Invalid format for 'pages' field, expected a list.")

    word_frequencies = defaultdict(int)
    for page in pages:
        #a malformed page entry is skipped instead of discarding the whole file
        if not isinstance(page, dict):
            continue
        page_body = page.get('body')
        if not isinstance(page_body, dict):
            continue
        token_pos_count = page_body.get('tokenPosCount', {})
        if not isinstance(token_pos_count, dict):
            continue

        for raw_token, tags_data in token_pos_count.items():
            if not isinstance(tags_data, dict):
                continue
            word = _normalize_token(raw_token)
            #nothing left after normalisation (token was only digits/punctuation)
            if not word:
                continue
            #the tag itself is ignored: counts are aggregated per word type
            for frequency in tags_data.values():
                if isinstance(frequency, int):
                    word_frequencies[word] += frequency

    return word_frequencies


def extract_word_frequencies(file_name):
    """Return the normalised word frequencies of one bz2-compressed JSON file.

    Args:
        file_name: path of a .bz2 file containing a single JSON document.

    Returns:
        A defaultdict(int) mapping each normalised word to its total count
        (possibly empty), or None when the file could not be read, parsed,
        or did not have the expected structure. On failure the error is
        printed instead of raised.
    """
    try:
        #close the file as soon as the bytes are in memory
        with bz2.BZ2File(file_name, 'rb') as input_file:
            raw_json = input_file.read()

        #orjson is the quickest library for this purpose and parses UTF-8
        #bytes directly, so no intermediate decoded str is needed
        return _count_tokens(orjson.loads(raw_json))

    except Exception as e:
        #deliberately broad: any failure (I/O, decompression, JSON, structure)
        #is reported and turned into None for the caller to handle
        print(f"Error processing {file_name}: {e}")
        return None

#write_frequencies_to_csv_and_txt appends these frequencies to the year's csv and writes the most frequent words to a txt
def write_frequencies_to_csv_and_txt(word_frequencies, pub_year, top_n=10000):
    """Append word counts to the year's CSV and write the top words to a .txt.

    Args:
        word_frequencies: mapping of word -> count.
        pub_year: used as the file-name stem; output goes to "<pub_year>.csv"
            and "<pub_year>_top_<top_n>.txt" (relative paths).
        top_n: number of most frequent words listed in the .txt file.

    The CSV is opened in append mode, so rows from successive calls with the
    same pub_year accumulate and a word can appear on several rows.

    NOTE(review): the .txt file is rewritten on every call from this call's
    word_frequencies only, so it reflects the latest call rather than the
    accumulated CSV - confirm that this is intended.
    """
    #define the file paths
    csv_file = f"{pub_year}.csv"
    txt_file = f"{pub_year}_top_{top_n}.txt"

    #the header is needed when the CSV is missing OR exists but is still empty
    needs_header = not os.path.isfile(csv_file) or os.path.getsize(csv_file) == 0

    #newline='' lets the csv module control line endings itself
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["word_type", "count"])
        writer.writerows(word_frequencies.items())

    #sort words by frequency (highest first) and keep the top_n
    top_words = sorted(
        word_frequencies.items(), key=lambda item: item[1], reverse=True
    )[:top_n]

    with open(txt_file, mode='w', encoding='utf-8') as file:
        file.writelines(f"{word}: {count}\n" for word, count in top_words)

def update_processed_files(file_name, processed_file_list):
    """Append *file_name* as one new line to the file *processed_file_list*."""
    #append mode keeps earlier entries and creates the file if it is missing
    with open(processed_file_list, 'a') as log:
        log.write("".join((file_name, "\n")))

def load_processed_files(processed_file_list):
    """Return the set of lines stored in *processed_file_list*.

    An empty set is returned when the path does not exist.
    """
    #nothing recorded yet
    if not os.path.exists(processed_file_list):
        return set()

    with open(processed_file_list, 'r') as log:
        contents = log.read()
    #duplicates collapse because the entries are collected into a set
    return set(contents.splitlines())

def process_json_files_from_csv(csv_file, processed_file_list):
    """Process every not-yet-processed file listed in *csv_file*.

    Args:
        csv_file: CSV with a 'file_path' column (passed to
            extract_word_frequencies) and a 'pub_date' column (passed to
            write_frequencies_to_csv_and_txt as the publication year).
        processed_file_list: text file holding one already-processed path per
            line; each successfully handled path is appended to it, and paths
            found in it are skipped.
    """
    df = pd.read_csv(csv_file)
    processed_files = load_processed_files(processed_file_list)

    #iterate the two needed columns directly instead of building a Series per row
    for file_path, pub_year in zip(df['file_path'], df['pub_date']):
        if file_path in processed_files:
            continue

        word_frequencies = extract_word_frequencies(file_path)

        #None signals a failure: leave the file unmarked so a later run retries it
        if word_frequencies is None:
            continue

        #an empty result is still a success: there is nothing to write, but the
        #file is recorded so it is not re-read on every later run
        if word_frequencies:
            write_frequencies_to_csv_and_txt(word_frequencies, pub_year)
        update_processed_files(file_path, processed_file_list)

        #keep the in-memory set current so a path listed twice in csv_file
        #is not processed (and appended to the CSV) twice in the same run
        processed_files.add(file_path)

#table listing the files to process, and the log of files already handled
csv_file = 'fiction_jsons.csv'
processed_file_list = 'processed_files.txt'

#run the processing over every file listed in the table
process_json_files_from_csv(
    csv_file=csv_file,
    processed_file_list=processed_file_list,
)


# Notebook output: KeyboardInterrupt (execution was interrupted before completion)