In [1]:
#pull in the libraries we need
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string

#cleaning tools for tokens, built once here instead of once per token inside the loop
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS = re.compile(r'\d+')


#helper: lower-case a token, then remove digits, ASCII punctuation and surrounding whitespace
def _normalize_token(token):
    cleaned = _DIGITS.sub('', token.lower())
    return cleaned.translate(_PUNCTUATION_TABLE).strip()


#helper: add the 'tokenPosCount' frequencies of one page body to the running totals
def _add_page_counts(page_body, word_frequencies):
    #malformed pieces are skipped silently rather than failing the whole file
    if not isinstance(page_body, dict):
        return
    token_pos_count = page_body.get('tokenPosCount', {})
    if not isinstance(token_pos_count, dict):
        return

    for token, tags_data in token_pos_count.items():
        word = _normalize_token(token)

        #tokens made only of digits/punctuation end up empty and are dropped
        if not word or not isinstance(tags_data, dict):
            continue

        #the keys of tags_data are ignored, only the counts are summed
        for frequency in tags_data.values():
            #bool is a subclass of int, so a stray JSON true/false must be excluded explicitly
            if isinstance(frequency, int) and not isinstance(frequency, bool):
                word_frequencies[word] += frequency


#function to extract word frequencies from a JSON file
def extract_word_frequencies(file_name):
    """Return the summed word frequencies of one bz2-compressed JSON file.

    The JSON is expected to hold a 'features' dictionary with a 'pages' list.
    For every page that is a dictionary with a 'body' entry, the counts found
    under body['tokenPosCount'] (token -> {tag: count}) are added up per
    token after the token has been lower-cased and stripped of digits, ASCII
    punctuation and surrounding whitespace. Different raw tokens that clean
    to the same word are merged.

    Args:
        file_name: path of the .bz2 file to read.

    Returns:
        A defaultdict(int) mapping cleaned word -> total count, or None when
        the file cannot be read or parsed or 'features'/'pages' has the wrong
        type. In that case the error is printed, not raised.
    """
    try:
        #take the zipped json as an input
        with bz2.BZ2File(file_name, 'rb') as input_file:
            #orjson parses UTF-8 bytes directly, so no separate decode step is needed
            data = orjson.loads(input_file.read())

        #ensure the 'features' field is a dictionary
        features = data.get('features', {})
        if not isinstance(features, dict):
            raise ValueError("Invalid format for 'features' field.")

        #ensure 'pages' is present and is a list
        pages = features.get('pages')
        if not isinstance(pages, list):
            raise ValueError("Invalid format for 'pages' field, expected a list.")

        word_frequencies = defaultdict(int)

        #each entry of 'pages' represents a page; entries that are not dictionaries are skipped
        for page in pages:
            if isinstance(page, dict) and 'body' in page:
                _add_page_counts(page['body'], word_frequencies)

        return word_frequencies

    except Exception as e:
        #best-effort boundary: one bad file is reported and must not stop the whole run
        print(f"Error processing {file_name}: {e}")
        return None

#this function writes word frequencies to year-specific csv files
def write_frequencies_to_csv(word_frequencies, pub_year):
    """Append word counts to '<pub_year>.csv' in the current working directory.

    The file is opened in append mode, so rows from earlier calls are kept
    and a word may appear on several rows if the same year is written more
    than once. The header row 'word_type,count' is written whenever the file
    is new or still empty.

    Args:
        word_frequencies: mapping of word -> count; one CSV row per item.
        pub_year: value used to build the file name.

    Returns:
        None. Errors are printed, not raised.
    """
    csv_file = f"{pub_year}.csv"

    #a header is needed when the file is missing OR exists but has no content yet
    try:
        needs_header = os.path.getsize(csv_file) == 0
    except OSError:
        needs_header = True

    try:
        with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            if needs_header:
                writer.writerow(["word_type", "count"])

            #add word frequencies to the csv file, one (word, count) row per item
            writer.writerows(word_frequencies.items())

        print(f"Wrote to CSV for year {pub_year}")

    except Exception as e:
        print(f"Error writing to CSV {csv_file}: {e}")

#record one finished file in the checkpoint list so a later run can skip it
def update_processed_files(file_name, processed_file_list):
    """Append file_name as a new line at the end of processed_file_list.

    Any error while opening or writing is printed and otherwise ignored;
    the function always returns None.
    """
    try:
        with open(processed_file_list, mode='a') as checkpoint:
            checkpoint.writelines([file_name + "\n"])
    except Exception as error:
        print(f"Error updating processed files: {error}")

#read back the checkpoint list written by earlier runs
def load_processed_files(processed_file_list):
    """Return the lines of processed_file_list as a set.

    An empty set is returned when the path does not exist.
    """
    if not os.path.exists(processed_file_list):
        return set()

    with open(processed_file_list, 'r') as checkpoint:
        contents = checkpoint.read()
    return set(contents.splitlines())

#this function processes JSON files from a CSV and writes them year by year
def process_json_files_from_csv(csv_file, processed_file_list):
    """Aggregate word frequencies per publication year and write one CSV per year.

    Args:
        csv_file: path of a CSV with at least the columns 'file_path' and
            'pub_date'; rows are grouped by 'pub_date'.
        processed_file_list: path of the text file listing one already
            processed file path per line. Paths found there are skipped,
            which is what makes an interrupted run resumable.

    For each year the frequencies of all not-yet-processed files are summed
    and passed to write_frequencies_to_csv. Only after that call are the
    year's files appended to processed_file_list, so an interruption in the
    middle of a year causes that year to be redone instead of silently
    losing the counts of files that were marked but never written.
    Files whose extraction fails (None) are not marked and will be retried
    on the next run.
    """
    #load CSV with file paths and publication years
    df = pd.read_csv(csv_file)

    #pull in the list of already processed files
    processed_files = load_processed_files(processed_file_list)

    #iterate through each year and its corresponding files
    for pub_year, group in df.groupby('pub_date'):
        print(f"Processing files for year: {pub_year}")

        #aggregated frequencies for this year, and the files that contributed to them in this run
        year_word_frequencies = defaultdict(int)
        newly_processed = []

        for file_path in group['file_path']:
            #check if the file has already been processed
            if file_path in processed_files:
                continue

            word_frequencies = extract_word_frequencies(file_path)

            #None signals a failed extraction; an empty result is still a successfully processed file
            if word_frequencies is None:
                continue

            for word_type, count in word_frequencies.items():
                year_word_frequencies[word_type] += count

            #remember it in memory too, so a path listed twice in the CSV is not counted twice
            processed_files.add(file_path)
            newly_processed.append(file_path)

        #write the aggregated frequencies for the current year to the CSV
        write_frequencies_to_csv(year_word_frequencies, pub_year)

        #checkpoint only now, after the year's totals have been handed to the writer
        for file_path in newly_processed:
            update_processed_files(file_path, processed_file_list)

#input catalogue (needs the columns file_path and pub_date) and the checkpoint file used for resuming
csv_file = 'post_45_fiction.csv'
processed_file_list = 'processed_files.txt'

#kick off the year-by-year run
process_json_files_from_csv(csv_file=csv_file, processed_file_list=processed_file_list)


Processing files for year: 1946
Wrote to CSV for year 1946
Processing files for year: 1947
Wrote to CSV for year 1947
Processing files for year: 1948
Wrote to CSV for year 1948
Processing files for year: 1949
Wrote to CSV for year 1949
Processing files for year: 1950
Wrote to CSV for year 1950
Processing files for year: 1951
Wrote to CSV for year 1951
Processing files for year: 1952
Wrote to CSV for year 1952
Processing files for year: 1953
Wrote to CSV for year 1953
Processing files for year: 1954
Wrote to CSV for year 1954
Processing files for year: 1955
Wrote to CSV for year 1955
Processing files for year: 1956
Wrote to CSV for year 1956
Processing files for year: 1957
Wrote to CSV for year 1957
Processing files for year: 1958
Wrote to CSV for year 1958
Processing files for year: 1959
Wrote to CSV for year 1959
Processing files for year: 1960
Wrote to CSV for year 1960
Processing files for year: 1961
Wrote to CSV for year 1961
Processing files for year: 1962
Wrote to CSV for year 19