# In [1]:
#load the libraries we need
import bz2
import orjson
import csv
import os
import pandas as pd
from collections import defaultdict
import re
import string
import numpy as np

#per-author store: author name -> word -> list of relative frequencies
def _new_word_table():
    #factory for the inner mapping; unseen words start with an empty list
    return defaultdict(list)


author_frequencies = defaultdict(_new_word_table)

#helpers for token clean-up, built once instead of once per token
_DIGIT_RUN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


#lower-case a token, drop digits and ASCII punctuation, trim surrounding whitespace
def _normalize_token(word):
    word = _DIGIT_RUN.sub('', word.lower())
    return word.translate(_PUNCTUATION_TABLE).strip()


#extract word frequencies from a single JSON file
def extract_word_frequencies(file_name):
    """Return the relative word frequencies (in percent) found in one bz2-compressed JSON file.

    The JSON document is expected to hold ``features -> pages`` as a list of
    page objects, each with an optional ``body -> tokenPosCount`` mapping of
    ``token -> {tag: count}``. Counts are summed over all tags and pages after
    the token has been normalised (lower-cased, digits and punctuation removed).

    Args:
        file_name: path to the ``.bz2`` compressed JSON file.

    Returns:
        A dict mapping each normalised word to its share of all counted
        tokens, expressed as a percentage. An empty dict is returned when no
        tokens were counted. ``None`` is returned (after printing the error)
        when the file cannot be read or parsed or ``pages`` is not a list.
    """
    try:
        #load compressed json
        with bz2.BZ2File(file_name, 'rb') as input_file:
            #orjson parses UTF-8 bytes directly, so no intermediate str copy is needed
            data = orjson.loads(input_file.read())

        #extract pages and token frequencies
        features = data.get('features', {})
        if 'pages' in features and isinstance(features['pages'], list):
            pages = features['pages']
        else:
            raise ValueError("Invalid format for 'pages' field, expected a list.")

        word_frequencies = defaultdict(int)

        #iterate through each page's body to accumulate word counts
        for page in pages:
            #skip malformed page entries (e.g. null) rather than discarding the whole file
            if not isinstance(page, dict):
                continue
            page_body = page.get('body')
            if not isinstance(page_body, dict):
                continue
            token_pos_count = page_body.get('tokenPosCount', {})
            if not isinstance(token_pos_count, dict):
                continue

            for raw_word, tags_data in token_pos_count.items():
                word = _normalize_token(raw_word)
                #tokens that were only digits/punctuation normalise to '' and are ignored
                if not word or not isinstance(tags_data, dict):
                    continue
                #aggregate the counts of every part-of-speech tag for this word
                for frequency in tags_data.values():
                    if isinstance(frequency, int):
                        word_frequencies[word] += frequency

        #calculate total word count and compute the relative frequencies
        total_words = sum(word_frequencies.values())
        if total_words == 0:
            #nothing countable: avoid dividing by zero
            return {}
        return {
            word: (count / total_words) * 100
            for word, count in word_frequencies.items()
        }

    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        return None

#fold one file's relative frequencies into the running per-author store
def accumulate_frequencies_by_author(relative_frequencies, author_name):
    """Append each word's relative frequency to that author's list for the word.

    Args:
        relative_frequencies: mapping of word -> relative frequency for one file.
        author_name: key under which the values are stored in ``author_frequencies``.
    """
    for token, share in relative_frequencies.items():
        samples_for_author = author_frequencies[author_name]
        samples_for_author[token].append(share)

#dump one author's per-word medians to "<author_name>.csv", then empty that author's store
def write_author_frequencies_to_csv(author_name):
    """Write the median relative frequency of every word recorded for an author.

    The output file is named ``<author_name>.csv`` in the current working
    directory and has the columns ``word_type`` and
    ``median_relative_frequency``. After writing, the author's entry in
    ``author_frequencies`` is cleared.

    Args:
        author_name: author whose accumulated frequencies are written.
    """
    header = ["word_type", "median_relative_frequency"]
    with open(f"{author_name}.csv", mode='w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        #one row per word: the median over all values collected for it
        sheet.writerows(
            [word_type, np.median(samples)]
            for word_type, samples in author_frequencies[author_name].items()
        )

    #the data is on disk now, so release the in-memory lists
    author_frequencies[author_name].clear()

#process json files from the csv, accumulate frequencies by author, write results for each author separately
def process_json_files_from_csv(csv_file):
    """Process JSON files, accumulate frequencies by author, and write results for each author.

    The CSV must contain the columns ``file_path`` and ``author``. Rows are
    grouped by author before processing, so an author's files do not have to
    be listed contiguously: every author is processed exactly once, with all
    of their files, and written to a single output file. Authors are handled
    in order of first appearance, and only one author's data is held in
    memory at a time.

    Args:
        csv_file: path to the CSV listing the JSON files and their authors.
    """
    #read the csv, assign result to df
    df = pd.read_csv(csv_file)

    #rows without an author cannot be attributed to an output file; groupby drops them
    missing_authors = int(df['author'].isna().sum())
    if missing_authors:
        print(f"Skipping {missing_authors} row(s) with no author")

    #sort=False keeps authors in first-appearance order and rows in file order
    for author_name, author_rows in df.groupby('author', sort=False):
        for file_path in author_rows['file_path']:
            #process the JSON file and accumulate relative frequencies for the author
            relative_frequencies = extract_word_frequencies(file_path)
            #None (failed file) and {} (no tokens) contribute nothing
            if relative_frequencies:
                accumulate_frequencies_by_author(relative_frequencies, author_name)

        #all of this author's files are done: write and release their data
        write_author_frequencies_to_csv(author_name)

#specify CSV file; it must provide the 'file_path' and 'author' columns read by process_json_files_from_csv
csv_file = 'author_jsons.csv'

#off we go: this runs the whole pipeline as soon as the statement is executed
process_json_files_from_csv(csv_file)


# Output of the cell above: KeyboardInterrupt (the run was interrupted before it finished)

# In [6]:
#inspect what is still held in memory; entries of authors already written to CSV have been emptied
print(author_frequencies)

