#In [1]:
#pull in the libraries we need
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string

#token clean-up helpers, built once here rather than once per token inside the loop
_DIGIT_RUN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


#lowercase a raw token, drop digits and ASCII punctuation, then trim surrounding whitespace
def _normalize_token(token):
    token = _DIGIT_RUN.sub('', token.lower())
    return token.translate(_PUNCTUATION_TABLE).strip()


#add every integer count found in one page's tokenPosCount mapping to word_frequencies
def _add_page_counts(token_pos_count, word_frequencies):
    for raw_token, tags_data in token_pos_count.items():
        #tags_data is expected to map tag -> count; anything else carries no counts
        if not isinstance(tags_data, dict):
            continue

        word = _normalize_token(raw_token)
        #an empty result means the token was purely numeric / punctuation
        if not word:
            continue

        for frequency in tags_data.values():
            #non-integer frequencies are unexpected and skipped
            if isinstance(frequency, int):
                word_frequencies[word] += frequency


#extract word frequencies gets the word_frequencies out of the json
def extract_word_frequencies(file_name):
    """Return summed word counts from one bz2-compressed JSON file.

    The JSON is expected to hold ``features -> pages`` (a list); each page may
    have a ``body`` dict whose ``tokenPosCount`` maps token -> {tag: count}.
    Tokens are normalised (lowercased, digits and ASCII punctuation removed,
    whitespace trimmed) and the counts of all tags are added together.

    Pages, bodies or tag mappings that do not have the expected type are
    skipped instead of aborting the whole file.

    Args:
        file_name: path of the .bz2 file to read.

    Returns:
        A ``defaultdict(int)`` mapping normalised word -> total count. It may be
        empty when the file holds no countable tokens. ``None`` is returned
        when the file cannot be read or parsed, or when ``features`` /
        ``pages`` have the wrong type; the error is printed, not raised.
    """
    try:
        #read the compressed file and release the handle before parsing
        with bz2.BZ2File(file_name, 'rb') as input_file:
            raw_json = input_file.read()

        #orjson parses bytes directly, so no separate decode step is needed
        data = orjson.loads(raw_json)

        #make sure features is a dictionary - this seems to vary
        features = data.get('features', {})
        if not isinstance(features, dict):
            raise ValueError("Invalid format for 'features' field.")

        #make sure pages is a list - this too seems to vary
        pages = features.get('pages')
        if not isinstance(pages, list):
            raise ValueError("Invalid format for 'pages' field, expected a list.")

        #set up a dictionary that's going to hold the frequencies
        word_frequencies = defaultdict(int)

        for page in pages:
            #a malformed page should not cost us the rest of the volume
            if not isinstance(page, dict):
                continue

            page_body = page.get('body')
            if not isinstance(page_body, dict):
                continue

            token_pos_count = page_body.get('tokenPosCount', {})
            if isinstance(token_pos_count, dict):
                _add_page_counts(token_pos_count, word_frequencies)

        return word_frequencies

    except Exception as e:
        #best-effort boundary: report the problem and let the caller move on to the next file
        print(f"Error processing {file_name}: {e}")
        return None


#append one set of word counts to the CSV file named after the publication year
def write_frequencies_to_csv(word_frequencies, pub_year):
    """Append ``word_frequencies`` to ``<pub_year>.csv`` in the working directory.

    A ``word_type,count`` header row is written first when the file did not
    exist before this call. Existing rows are never merged or rewritten, so the
    same word can appear on several rows.

    Args:
        word_frequencies: mapping of word -> count; one row is written per item.
        pub_year: value used to build the file name ``f"{pub_year}.csv"``.
    """
    target_path = f"{pub_year}.csv"

    #decide on the header before opening: append mode creates the file if missing
    needs_header = not os.path.isfile(target_path)

    with open(target_path, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)

        if needs_header:
            out.writerow(["word_type", "count"])

        #each (word, count) pair becomes one csv row
        out.writerows(word_frequencies.items())

#record a finished file by appending its name to the .txt log of processed files
def update_processed_files(file_name, processed_file_list):
    """Append ``file_name`` as its own line to the ``processed_file_list`` file.

    The log file is created if it does not exist yet.

    Args:
        file_name: string written to the log, followed by a newline.
        processed_file_list: path of the text file holding one name per line.
    """
    with open(processed_file_list, mode='a') as log:
        log.writelines([file_name, "\n"])

#read back the names of already processed files from the .txt log
def load_processed_files(processed_file_list):
    """Return the set of lines stored in ``processed_file_list``.

    Args:
        processed_file_list: path of the text file holding one name per line.

    Returns:
        A set with one entry per distinct line, or an empty set when the path
        does not exist.
    """
    #nothing has been logged yet
    if not os.path.exists(processed_file_list):
        return set()

    with open(processed_file_list, 'r') as log:
        contents = log.read()
    return set(contents.splitlines())

#process the csv and extract data from the json files it lists
def process_json_files_from_csv(csv_file, processed_file_list):
    """Extract word counts for every not-yet-processed file listed in a CSV.

    For each row, the file at ``file_path`` is read with
    ``extract_word_frequencies``, its counts are appended to the CSV for the
    row's ``pub_date`` via ``write_frequencies_to_csv``, and the path is logged
    with ``update_processed_files`` so later runs skip it.

    Files whose extraction fails (``None`` result) are not logged, so they are
    retried on the next run. Files that parse but yield no words are logged, so
    they are not re-read on every run.

    Args:
        csv_file: path of a CSV with ``file_path`` and ``pub_date`` columns.
        processed_file_list: path of the text log of already processed paths.
    """
    #load CSV
    df = pd.read_csv(csv_file)

    #pull in list of already processed files
    processed_files = load_processed_files(processed_file_list)

    #walk the two columns we need: the file path and the publication year
    for file_path, pub_year in zip(df['file_path'], df['pub_date']):
        #check if file has already been processed
        if file_path in processed_files:
            continue

        #pull in word frequencies from json
        word_frequencies = extract_word_frequencies(file_path)

        #None signals a failed extraction; an empty result is still a success
        if word_frequencies is None:
            continue

        #write them to year-specific csv
        write_frequencies_to_csv(word_frequencies, pub_year)

        #update list of processed files, on disk and in memory, so a path
        #that appears twice in the csv is not counted twice in this run
        update_processed_files(file_path, processed_file_list)
        processed_files.add(file_path)

#defining document names (both are relative paths, so they resolve against the working directory)
#csv_file: the input table; process_json_files_from_csv reads its 'file_path' and 'pub_date' columns
csv_file = 'fiction_jsons.csv' 
#processed_file_list: text log of file paths already handled, one per line, used to skip them on later runs
processed_file_list = 'processed_files.txt'

#off we go - this call runs as soon as the cell / module is executed
process_json_files_from_csv(csv_file, processed_file_list)
