# In [1]:
#this script takes in fiction_jsons and writes word frequencies to year-specific csvs

#load the libraries we need
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string

#yearly_frequencies accumulates word frequencies for each publication year:
#publication year -> word type -> running count. Both levels are defaultdicts,
#so an unseen year or word type is created on first access with a count of 0.
yearly_frequencies = defaultdict(lambda: defaultdict(int))

#define the function extract word frequencies
# Loop-invariant helpers, built once at import time instead of once per token.
_DIGIT_RUN = re.compile(r'\d+')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _normalise_token(token):
    """Lower-case token, then drop digit runs, ASCII punctuation and outer whitespace."""
    token = _DIGIT_RUN.sub('', token.lower())
    return token.translate(_STRIP_PUNCTUATION).strip()


def extract_word_frequencies(file_name):
    """Count normalised word types in one bz2-compressed JSON file.

    The JSON document is expected to hold ``features -> pages``, a list whose
    entries may carry ``body -> tokenPosCount``: a mapping of token to
    ``{tag: count}``. Every token is normalised (lower-cased, digits and
    punctuation removed, stripped) and its integer counts are summed across
    tags and pages. Tokens that normalise to an empty string are dropped.
    Page bodies, ``tokenPosCount`` values and per-token tag mappings that are
    not dictionaries are skipped silently, as are non-integer counts.

    Args:
        file_name: Path of the bz2-compressed JSON file to read.

    Returns:
        A ``defaultdict(int)`` mapping word type to its total count, or
        ``None`` when the file cannot be read or parsed or does not have the
        expected layout (the error is printed, not raised).
    """
    # Deliberately broad: one unreadable or malformed file must not abort a
    # long batch run, so any failure is reported and signalled with None.
    try:
        with bz2.BZ2File(file_name, 'rb') as input_file:
            # orjson parses UTF-8 bytes directly, so the decompressed payload
            # does not need to be copied into an intermediate str first.
            data = orjson.loads(input_file.read())

        features = data.get('features', {})
        if not isinstance(features, dict):
            raise ValueError("Invalid format for 'features' field.")

        pages = features.get('pages')
        if not isinstance(pages, list):
            raise ValueError("Invalid format for 'pages' field, expected a list.")

        word_frequencies = defaultdict(int)

        for page in pages:
            #pages without a body contribute nothing
            if 'body' not in page:
                continue

            page_body = page['body']
            if not isinstance(page_body, dict):
                continue

            #token -> {part-of-speech tag: count}
            token_pos_count = page_body.get('tokenPosCount', {})
            if not isinstance(token_pos_count, dict):
                continue

            for raw_token, tags_data in token_pos_count.items():
                word = _normalise_token(raw_token)

                #nothing left after cleaning, or no usable tag counts
                if not word or not isinstance(tags_data, dict):
                    continue

                #sum over all tags so each word type gets a single count
                for frequency in tags_data.values():
                    if isinstance(frequency, int):
                        word_frequencies[word] += frequency

        return word_frequencies

    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        return None

#accumulate word_frequencies function - this groups word types by year of publication
def accumulate_word_frequencies(word_frequencies, pub_year):
    """Add one file's word counts to the running totals for pub_year.

    Args:
        word_frequencies: Mapping of word type to count for a single file.
        pub_year: Key under which the counts are accumulated in the
            module-level ``yearly_frequencies``.
    """
    for token, occurrences in word_frequencies.items():
        #the nested defaultdicts create the year and the word on first use
        yearly_frequencies[pub_year][token] += occurrences

#write_yearly_frequencies to csv creates a csv file named for the relevant publication year
def write_yearly_frequencies_to_csv(pub_year):
    """Write the accumulated counts for pub_year to ``<pub_year>.csv``.

    The file is opened in write mode, so an existing file of the same name is
    replaced. It gets a ``word_type,count`` header followed by one row per
    word type. Afterwards the in-memory counts for that year are emptied.

    Args:
        pub_year: Year whose totals are written; also used as the file stem.
    """
    header = ["word_type", "count"]

    with open(f"{pub_year}.csv", mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        #each (word_type, count) pair becomes one csv row
        out.writerows(yearly_frequencies[pub_year].items())

    #drop the written counts so they are not carried over or written twice
    yearly_frequencies[pub_year].clear()

#update_processed_files function - writes file_name and introduces a new line
def update_processed_files(file_name, processed_file_list):
    """Append file_name as a new line to the processed-files list.

    Args:
        file_name: Path of the file that has just been processed.
        processed_file_list: Path of the text file that records processed
            files, one per line; it is created if it does not exist yet.
    """
    entry = file_name + "\n"
    with open(processed_file_list, mode='a') as log:
        log.write(entry)

#load_processed_files function: have we already loaded this file
def load_processed_files(processed_file_list):
    """Return the set of file paths recorded in the processed-files list.

    Args:
        processed_file_list: Path of the text file holding one processed file
            path per line.

    Returns:
        A set with one entry per distinct line, or an empty set when the
        list file does not exist.
    """
    #no list yet means nothing has been processed
    if not os.path.exists(processed_file_list):
        return set()

    with open(processed_file_list, 'r') as log:
        recorded = log.read()

    return set(recorded.splitlines())

#process_json_files_from_csv: process each json file which appears in the csv we're iterating through
#accumulate the frequencies
#write results for each year sequentially
def _flush_year(pub_year, pending_files, processed_file_list):
    """Write pub_year's totals to disk, then record its files as processed."""
    #nothing was extracted for this year during this run; writing anyway
    #would replace a csv produced by an earlier run with a header-only file
    if not pending_files:
        return

    write_yearly_frequencies_to_csv(pub_year)

    #checkpoint only after the counts are on disk, so an interruption can
    #never leave a file marked as done while its counts were lost
    for done_path in pending_files:
        update_processed_files(done_path, processed_file_list)
    pending_files.clear()


def process_json_files_from_csv(csv_file, processed_file_list):
    """Aggregate word counts per publication year for the files listed in csv_file.

    Each row of csv_file supplies a ``file_path`` and a ``pub_date``. Files
    already recorded in processed_file_list are skipped. Counts are
    accumulated in memory while consecutive rows share the same ``pub_date``
    and are written to ``<pub_date>.csv`` when the year changes and once more
    after the last row.

    Rows are expected to be grouped by ``pub_date``: the yearly csv is
    rewritten from scratch on every flush, so a year that appears in two
    separate runs of rows ends up holding only the later group's counts.

    A file is appended to processed_file_list only after the csv containing
    its counts has been written. Files whose extraction fails are reported by
    ``extract_word_frequencies`` and stay unrecorded, so they are retried on
    the next run.

    Args:
        csv_file: Path of the csv with ``file_path`` and ``pub_date`` columns.
        processed_file_list: Path of the text file used as a checkpoint of
            processed file paths, one per line.
    """
    #read the csv file, assign it to df
    df = pd.read_csv(csv_file)

    #paths completed in earlier runs
    processed_files = load_processed_files(processed_file_list)

    #year currently being accumulated
    current_year = None

    #files counted for current_year whose totals are not yet on disk
    pending_files = []

    #for every row in df identify file path and publication year
    for _, row in df.iterrows():
        file_path = row['file_path']
        pub_year = row['pub_date']

        #already handled (earlier run, or a duplicate row in this run): skip
        if file_path in processed_files:
            continue

        #moving on to another year: persist the one we were accumulating
        if current_year is not None and pub_year != current_year:
            _flush_year(current_year, pending_files, processed_file_list)

        #set the current year to the publication year of the current file
        current_year = pub_year

        word_frequencies = extract_word_frequencies(file_path)

        #None signals a failed extraction; an empty mapping is a valid file
        #without usable tokens and must still be checkpointed
        if word_frequencies is not None:
            accumulate_word_frequencies(word_frequencies, pub_year)
            pending_files.append(file_path)
            #guards against counting a path listed twice in the csv
            processed_files.add(file_path)

    #write the last year's aggregated frequencies
    if current_year is not None:
        _flush_year(current_year, pending_files, processed_file_list)

#specify the csv file listing the volumes to process; it is read through its
#'file_path' and 'pub_date' columns
csv_file = 'fiction_jsons.csv' 
#checkpoint file: one already-processed file path per line
processed_file_list = 'processed_files.txt'

#run the primary processing function (executes as soon as this script/cell is run)
process_json_files_from_csv(csv_file, processed_file_list)