In [None]:
import bz2
import orjson
import csv
import pandas as pd

# Function to extract pubDate, title, contributor, and year from the metadata portion
def extract_metadata(file_name, prefix_len=34):
    """Read one bz2-compressed JSON file and pull out its bibliographic metadata.

    Args:
        file_name: Path to a ``.bz2`` file holding a single JSON document.
        prefix_len: Number of leading characters dropped from ``file_name`` in
            the returned record. The default of 34 is the length of the
            ``'../../Volumes/My Passport for Mac/'`` folder prefix that the
            calling script prepends, so the stored name matches the entry in
            the path list.

    Returns:
        A tuple ``(short_name, pub_date, title, first_contributor, year)`` when
        the document's ``metadata`` object has a truthy ``pubDate``; otherwise
        ``None``. ``None`` is also returned, after printing the error, when the
        file cannot be opened, decompressed or parsed.
    """
    try:
        # Open and read the bz2 compressed file
        with bz2.BZ2File(file_name, 'rb') as input_file:
            raw = input_file.read()

        # orjson.loads accepts UTF-8 bytes directly, so no separate decode step
        data = orjson.loads(raw)

        # `or {}` also covers an explicit `"metadata": null`
        metadata = data.get('metadata') or {}

        # Records without a publication date are skipped
        pub_date = metadata.get('pubDate')
        if not pub_date:
            return None

        return (
            file_name[prefix_len:],
            pub_date,
            metadata.get('title'),
            _first_contributor_name(metadata.get('contributor')),
            metadata.get('year'),
        )

    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a long batch run
        print(f"Error processing {file_name}: {e}")

    # If an error occurred, return None
    return None


def _first_contributor_name(contributors):
    """Return the name of the first contributor, or None when there is none.

    ``contributors`` may be a single entry or a list of entries. An entry that
    is a dict contributes its ``name`` value; an entry that is a plain string
    is used as the name itself. Any other shape yields ``None`` instead of
    raising, so a record with a valid publication date is not discarded.
    """
    if isinstance(contributors, list):
        if not contributors:
            return None
        contributors = contributors[0]
    if isinstance(contributors, dict):
        return contributors.get('name')
    if isinstance(contributors, str):
        return contributors
    return None

# Load the list of file paths from a text file
file_path = 'engtexts.txt'
with open(file_path, 'r') as file:
    # Read and split lines into a list of paths (one path per line)
    paths = file.read().splitlines()

# Work out where to resume and whether the CSV still needs its header row.
# These are tracked separately: a CSV that already has a header but no data
# rows must resume from index 0 WITHOUT getting a second header appended.
csv_file = 'extracted_pub_dates.csv'
index = 0
write_header = True
try:
    dated_texts = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No CSV yet, or a zero-byte one: start from the beginning with a header
    pass
else:
    # read_csv succeeded, so the file already carries a header row
    write_header = False
    if not dated_texts.empty:
        # Resume right after the last file that produced a row
        last_recorded = dated_texts['file_name'].iloc[-1]
        try:
            index = paths.index(last_recorded) + 1
        except ValueError:
            raise ValueError(
                f"Last file_name in '{csv_file}' ({last_recorded!r}) is not listed "
                f"in '{file_path}'; cannot determine where to resume."
            ) from None

# Slice paths to process only new files
paths = paths[index:]

# Add the folder prefix for the file locations.
# NOTE: extract_metadata strips the first 34 characters of each path by
# default, which is exactly len(prefix); keep the two in sync if this changes.
prefix = '../../Volumes/My Passport for Mac/'
paths = [prefix + filename for filename in paths]

# Open the CSV file in append mode and write only new entries
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header row only if the file did not already have one
    if write_header:
        writer.writerow(["file_name", "pub_date", "title", "first_contributor", "year"])

    # Iterate through each JSON file path and append the metadata to the CSV;
    # files that fail or lack a publication date yield None and are skipped
    for name in paths:
        extracted_metadata = extract_metadata(name)
        if extracted_metadata:
            writer.writerow(extracted_metadata)

print(f"CSV file '{csv_file}' has been updated with new metadata (file names, publication dates, titles, contributors, and years).")
