#In [3]:
#pull in the libraries we want
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string
import numpy as np

#module-level accumulator of word frequencies for each author, nested as
#author name -> word -> list of relative frequencies; the inner defaultdicts
#create an empty list the first time a word is seen for an author
author_frequencies = defaultdict(lambda: defaultdict(list))

#load the target word list from a .txt file
def load_word_list(word_list_file):
    """Return the set of target words listed one per line in a text file.

    The file is read as UTF-8, matching how the replacement CSV is read.
    Whitespace around each entry is stripped and blank lines are skipped,
    so the result never contains an empty string or a padded word.
    """
    with open(word_list_file, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    #strip first, then drop entries that were blank or whitespace-only
    return {word for word in map(str.strip, lines) if word}

#load word replacement mappings from a CSV file
def load_replacement_map(replacement_csv):
    """Return a dict mapping each word (column 1) to its replacement (column 2).

    The first row is skipped when its first two cells are 'id1' and 'id2'
    (compared case-insensitively). Rows with fewer than two cells are
    ignored, extra cells are ignored, and a later row for the same word
    overrides an earlier one. An empty file yields an empty dict.
    """
    replacements = {}
    #newline='' leaves line-ending handling to the csv module;
    #utf-8-sig reads plain UTF-8 and also drops a leading byte-order mark,
    #which would otherwise hide the header and end up in the first key
    with open(replacement_csv, 'r', encoding='utf-8-sig', newline='') as f:
        for index, row in enumerate(csv.reader(f)):
            if len(row) < 2: #make sure there's at least two columns
                continue
            #only the very first row can be a header
            if index == 0 and row[0].lower() == 'id1' and row[1].lower() == 'id2':
                continue
            replacements[row[0]] = row[1]

    return replacements

#regex and translation table used to normalise tokens, built once instead of per token
_DIGIT_RUN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

#extract word frequencies from a single JSON file, applying replacements
def extract_word_frequencies(file_name, replacements):
    """Return each word's relative frequency, in percent, for one bz2-compressed JSON file.

    The JSON is expected to hold a list at data['features']['pages']. For
    every page whose 'body' is a dict, the 'tokenPosCount' mapping
    (token -> {tag: count}; presumably part-of-speech tags) is read and
    its integer counts are summed per normalised token.

    Normalisation: lowercase, remove digits, remove ASCII punctuation,
    strip surrounding whitespace, then look the result up in
    `replacements`. Tokens that end up empty are dropped.

    Returns a dict word -> count / total * 100, an empty dict when nothing
    was counted, or None when the file could not be read or parsed (the
    error is printed so a batch run can carry on with the next file).
    """
    try:
        #load the compressed json; orjson parses the UTF-8 bytes directly
        with bz2.BZ2File(file_name, 'rb') as input_file:
            data = orjson.loads(input_file.read())

        #get 'features' and make sure 'pages' is in there and is a list
        features = data.get('features', {})
        if 'pages' in features and isinstance(features['pages'], list):
            pages = features['pages']
        else:
            raise ValueError("Invalid format for 'pages' field, expected a list.")

        word_frequencies = defaultdict(int)

        #iterate through every page's body to accumulate word counts
        for page in pages:
            if 'body' not in page:
                continue
            page_body = page['body']
            if not isinstance(page_body, dict):
                continue

            for raw_word, tags_data in page_body.get('tokenPosCount', {}).items():
                #lowercase, remove numbers, punctuation and blank space
                word = _DIGIT_RUN.sub('', raw_word.lower())
                word = word.translate(_PUNCTUATION_TABLE).strip()

                #apply a replacement if the word is in the mapping
                word = replacements.get(word, word)

                if not word or not isinstance(tags_data, dict):
                    continue

                #add up the counts recorded under every tag of this token
                for frequency in tags_data.values():
                    if isinstance(frequency, int):
                        word_frequencies[word] += frequency

        #calculate total word count and compute the relative frequencies
        total_words = sum(word_frequencies.values())
        if total_words == 0:
            #nothing to normalise against; same result as a file with no tokens
            return {}
        return {word: (count / total_words) * 100 for word, count in word_frequencies.items()}

    except Exception as e:
        #deliberately broad: one unreadable or malformed file must not stop the run
        print(f"Error processing {file_name}: {e}")
        return None

#collect one file's target-word frequencies under its author
def accumulate_frequencies_by_author(relative_frequencies, author_name, target_words):
    """Append this file's relative frequency of every target word to the author's lists.

    Only words present in both `relative_frequencies` and `target_words`
    are recorded; the values go into the module-level `author_frequencies`.
    NOTE(review): a word missing from `relative_frequencies` adds nothing
    (not a 0) to its list - confirm that is intended for the later median.
    """
    wanted = (
        (token, value)
        for token, value in relative_frequencies.items()
        if token in target_words
    )
    for token, value in wanted:
        author_frequencies[author_name][token].append(value)

#write aggregated median frequencies to CSV for each author
def write_author_frequencies_to_csv(author_name):
    """Write the author's per-word median relative frequencies to "<author_name>.csv".

    The file is created (or overwritten) in the working directory with a
    header row followed by one row per accumulated word. Afterwards the
    author's entry in the module-level `author_frequencies` is emptied.
    """
    header = ["word_type", "median_relative_frequency"]
    with open(f"{author_name}.csv", mode='w', newline='', encoding='utf-8') as out_file:
        out = csv.writer(out_file)
        out.writerow(header)

        #one row per word type: the median of every value collected for it
        out.writerows(
            [word_type, np.median(samples)]
            for word_type, samples in author_frequencies[author_name].items()
        )

    #clear accumulated data for the author
    author_frequencies[author_name].clear()

#process_json_files_from_csv accumulates frequencies by author and writes the result
def process_json_files_from_csv(csv_file, word_list_file, replacement_csv):
    """Process every file listed in `csv_file` and write one median-frequency CSV per author.

    `csv_file` is read with pandas and must have 'file_path' and 'author'
    columns. `word_list_file` and `replacement_csv` are loaded once up
    front. Files that fail to load or yield no frequencies are skipped.
    Authors are handled in order of first appearance, each one's files in
    the order listed, and each author's CSV is written exactly once.
    """
    #load word replacements and target words
    replacements = load_replacement_map(replacement_csv)
    target_words = load_word_list(word_list_file)

    #read author csv and assign result to df
    df = pd.read_csv(csv_file)

    #group the file paths by author before processing, so an author whose
    #rows are not adjacent in the csv still gets one complete output file
    #rather than an earlier partial file being overwritten by a later one
    files_by_author = {}
    for file_path, author_name in zip(df['file_path'], df['author']):
        files_by_author.setdefault(author_name, []).append(file_path)

    for author_name, file_paths in files_by_author.items():
        for file_path in file_paths:
            #process the JSON file and accumulate relative frequencies for the author
            relative_frequencies = extract_word_frequencies(file_path, replacements)
            if relative_frequencies:
                accumulate_frequencies_by_author(relative_frequencies, author_name, target_words)

        #write this author's accumulated frequencies
        write_author_frequencies_to_csv(author_name)

#specify the input files (paths are relative to the working directory):
#  csv_file        - table with 'file_path' and 'author' columns, one row per JSON file
#  word_list_file  - target word list, one word per line
#  replacement_csv - word replacement pairs (word, replacement)
csv_file = 'author_jsons.csv'
word_list_file = 'mfws.txt'
replacement_csv = 'replacement_words.csv'

#run the main processing function; this executes whenever the file is run
#and writes one "<author>.csv" per author
process_json_files_from_csv(csv_file, word_list_file, replacement_csv)
