In [None]:
import bz2
import orjson
import csv
import pandas as pd

# function to extract the publication date, title and first contributor from the metadata section of the json
def extract_metadata(file_name, prefix_length=34):
    """Read one bz2-compressed JSON file and return its basic metadata.

    The file is decompressed, parsed as JSON, and the ``pubDate``, ``title``
    and ``contributor`` entries of its top-level ``metadata`` object are read.

    Args:
        file_name: Path of the ``.bz2`` file to read.
        prefix_length: Number of leading characters dropped from
            ``file_name`` in the returned name. The default of 34 is the
            length of the folder prefix '../../Volumes/My Passport for Mac/'
            that the calling script prepends to every path.

    Returns:
        A tuple ``(short_name, pub_date, title, first_contributor)``, where
        any of the last three may be ``None`` when absent from the metadata,
        or ``None`` if the file could not be read or parsed (the error is
        printed together with the file name).
    """
    try:
        # open and read the compressed file
        with bz2.BZ2File(file_name, 'rb') as input_file:
            raw_bytes = input_file.read()

        # orjson parses UTF-8 bytes directly, so no intermediate str is built
        data = orjson.loads(raw_bytes)

        # extract metadata
        metadata = data.get('metadata', {})

        # get publication date, title and contributors
        pub_date = metadata.get('pubDate')
        title = metadata.get('title')
        contributors = metadata.get('contributor', [])

        # handle different contributor structures (list or dict); for a list
        # only the first entry is considered
        if isinstance(contributors, list) and contributors:
            contributors = contributors[0]

        # anything that is not a dict (empty list, plain string, null, ...)
        # yields no contributor instead of raising and discarding the record
        first_contributor = None
        if isinstance(contributors, dict):
            first_contributor = contributors.get('name', None)

        # Return relevant metadata
        return file_name[prefix_length:], pub_date, title, first_contributor

    except Exception as e:
        # best-effort batch processing: report the failing file and carry on
        print(f"Error processing {file_name}: {e}")

    # return none if this throws
    return None

# load list of english texts (one path per line)
file_path = 'engtexts.txt'
with open(file_path, 'r') as file:
    # split lines
    paths = file.read().splitlines()

# load existing data if it already exists, so an interrupted run can resume
csv_file = 'extracted_pub_dates.csv'
index = 0
# the header is only missing when the csv does not exist yet or is zero bytes;
# deciding this separately from `index` avoids appending a second header to a
# csv that already has one but no data rows
needs_header = True
try:
    dated_texts = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # no usable csv yet: start over from the first path
    pass
else:
    needs_header = False
    if not dated_texts.empty:
        # resume right after the last file that was written out
        last_done = dated_texts['file_name'].iloc[-1]
        try:
            index = paths.index(last_done) + 1
        except ValueError:
            raise ValueError(
                f"Last file_name in '{csv_file}' ({last_done!r}) is not listed "
                f"in '{file_path}'; cannot work out where to resume."
            ) from None

# slice paths to access new files
paths = paths[index:]

# add the folder prefix
prefix = '../../Volumes/My Passport for Mac/'
paths = [prefix + filename for filename in paths]

# open the csv, append new information
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # write the header row
    if needs_header:
        writer.writerow(["file_name", "pub_date", "title", "first_contributor"])

    # iterate through each JSON file path and append the metadata
    for name in paths:
        extracted_metadata = extract_metadata(name)
        if extracted_metadata:
            writer.writerow(extracted_metadata)
            # push each finished row out of the buffer so an interrupted run
            # leaves whole rows behind for the resume logic above
            file.flush()

print(f"CSV file '{csv_file}' has been updated with new metadata (file names, publication dates, titles, contributors, and years).")
