#In [1]:
#this script reads json files and aggregates relative word frequencies by author

#libraries we need
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string
import numpy as np

#author_frequencies maps author -> word -> list of relative frequencies;
#both levels are created on first access
def _new_word_table():
    #inner mapping for a single author: word -> list of relative frequencies
    return defaultdict(list)

author_frequencies = defaultdict(_new_word_table)

#load the target word list from a .txt file (one word per line)
def load_word_list(word_list_file):
    """Read the target words from a text file with one word per line.

    Args:
        word_list_file: path to a UTF-8 encoded text file.

    Returns:
        A set holding every non-empty line with surrounding whitespace removed.
    """
    #explicit encoding so non-ascii words load the same way on every platform
    with open(word_list_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        #blank lines and padded entries would otherwise become set members
        #that can never equal a real word
        return {word for word in stripped_lines if word}

#extract_word_frequencies gets us the relative word frequencies from a bz2-compressed json file

#normalisation helpers, built once instead of once per token
_DIGIT_RUN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def extract_word_frequencies(file_name):
    """Compute per-word relative frequencies for one bz2-compressed JSON file.

    The JSON document is expected to hold a list under features -> pages;
    each page may carry a 'body' dict whose 'tokenPosCount' maps a token to
    a dict of integer counts. Tokens are lower-cased and stripped of digits,
    ASCII punctuation and surrounding whitespace before their counts are
    summed, so distinct raw tokens may collapse into one word.

    Args:
        file_name: path to the .bz2 file.

    Returns:
        A dict mapping each normalised word to its share of all counted
        tokens, expressed as a percentage. An empty dict when nothing was
        counted. None when the file could not be read or parsed; the error
        is printed rather than raised.
    """
    try:
        with bz2.BZ2File(file_name, 'rb') as input_file:
            #orjson parses utf-8 bytes directly, so the raw bytes are handed
            #over without building an intermediate str copy of the document
            data = orjson.loads(input_file.read())

        features = data.get('features', {})
        if 'pages' not in features or not isinstance(features['pages'], list):
            raise ValueError("Invalid format for 'pages' field, expected a list.")

        word_frequencies = defaultdict(int)
        for page in features['pages']:
            if 'body' not in page:
                continue
            page_body = page['body']
            if not isinstance(page_body, dict):
                continue

            for raw_word, tags_data in page_body.get('tokenPosCount', {}).items():
                #lower-case, then drop digits, punctuation and outer whitespace
                word = _DIGIT_RUN.sub('', raw_word.lower())
                word = word.translate(_PUNCTUATION_TABLE).strip()
                #tokens that normalise to nothing (pure punctuation/numbers) are skipped
                if not word or not isinstance(tags_data, dict):
                    continue
                for frequency in tags_data.values():
                    if isinstance(frequency, int):
                        word_frequencies[word] += frequency

        #the denominator only includes tokens that survived normalisation
        total_words = sum(word_frequencies.values())
        if not total_words:
            #nothing to divide by: report "no words" instead of relying on
            #a ZeroDivisionError being swallowed below
            return {}

        return {word: (count / total_words) * 100 for word, count in word_frequencies.items()}

    except Exception as e:
        #best effort: one unreadable file must not abort the whole run
        print(f"Error processing {file_name}: {e}")
        return None

#collect one file's target-word frequencies under its author
def accumulate_frequencies_by_author(relative_frequencies, author_name, target_words):
    """Append one file's relative frequencies for target words to the author's lists.

    Args:
        relative_frequencies: mapping of word -> relative frequency for one file.
        author_name: key under which the values are stored in author_frequencies.
        target_words: container of words to keep; all other words are ignored.

    Returns:
        None. The module-level author_frequencies mapping is updated in place.
    """
    #lazily filter so the author's entry is only created when a word matches
    wanted = (
        (token, share)
        for token, share in relative_frequencies.items()
        if token in target_words
    )
    for token, share in wanted:
        author_frequencies[author_name][token].append(share)

#write_author_frequencies_to_csv saves one author's medians to "<author_name>.csv"
def write_author_frequencies_to_csv(author_name):
    """Write the median relative frequency of each word for one author.

    The file "<author_name>.csv" is created (or overwritten) in the current
    working directory with a header row followed by one row per word. The
    median is taken over the values stored for that word in
    author_frequencies. Afterwards the author's stored values are cleared.

    Args:
        author_name: key into author_frequencies; also used as the file stem.
    """
    header = ["word_type", "median_relative_frequency"]

    with open(f"{author_name}.csv", mode='w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        #one row per word: the word followed by the median of its stored values
        sheet.writerows(
            [word_type, np.median(freq_list)]
            for word_type, freq_list in author_frequencies[author_name].items()
        )

    #drop this author's values now that they are on disk
    author_frequencies[author_name].clear()

#process_json_files_from_csv loads the jsons, accumulates the frequencies and writes one csv per author
def process_json_files_from_csv(csv_file, word_list_file):
    """Process every file listed in csv_file and write one result CSV per author.

    Args:
        csv_file: path to a CSV with at least the columns 'file_path' and
            'author', one row per file to process.
        word_list_file: path to the target word list passed to load_word_list.

    Rows are grouped by author before processing, so an author's files do not
    have to sit on adjacent rows: each author's CSV is written exactly once,
    after all of that author's files have been read. Files for which
    extract_word_frequencies returns nothing are skipped.
    """
    #df holds the (author, file_path) listing
    df = pd.read_csv(csv_file)

    #target words pulls in the word list
    target_words = load_word_list(word_list_file)

    #sort=False keeps authors in order of first appearance;
    #dropna=False keeps rows whose author is missing instead of silently dropping them
    for author_name, author_rows in df.groupby('author', sort=False, dropna=False):
        for file_path in author_rows['file_path']:
            relative_frequencies = extract_word_frequencies(file_path)
            #None (read error) and {} (no words) are both skipped
            if relative_frequencies:
                accumulate_frequencies_by_author(relative_frequencies, author_name, target_words)

        #all of this author's files are done, so write and release their data
        write_author_frequencies_to_csv(author_name)

#input locations: the csv listing the files to process, and the target word list
csv_file, word_list_file = 'author_jsons.csv', 'mfws.txt'

#kick off the whole pipeline
process_json_files_from_csv(csv_file=csv_file, word_list_file=word_list_file)
