In [3]:
import bz2
import orjson
import csv
import pandas as pd

# extract publication date, title, author and year from the metadata
def _first_contributor_name(contributors):
    """Return the ``name`` of the first contributor entry, or ``None``.

    ``contributors`` may be a list of entries or a single entry. Only dict
    entries are consulted; any other shape (empty list, string, ``None``,
    nested list) yields ``None`` instead of raising.
    """
    if isinstance(contributors, list):
        # only the first listed contributor is reported
        contributors = contributors[0] if contributors else None
    if isinstance(contributors, dict):
        return contributors.get('name')
    return None


def extract_metadata(file_name, prefix_length=34):
    """Read one bz2-compressed JSON file and pull out selected metadata.

    Args:
        file_name: Path of the ``.bz2`` file to read. Its decompressed content
            is parsed as JSON and the fields are taken from the top-level
            ``metadata`` object.
        prefix_length: Number of leading characters of ``file_name`` dropped
            from the name placed in the result. The default of 34 keeps the
            slice this function has always applied.

    Returns:
        A 5-tuple ``(short_name, pub_date, title, first_contributor, year)``.
        Fields absent from the metadata are ``None``. If the file cannot be
        read or parsed, the error is printed and every field except
        ``short_name`` is ``None``.
    """
    short_name = file_name[prefix_length:]

    try:
        # orjson parses UTF-8 bytes directly, so the decompressed payload is
        # handed over without an intermediate str copy
        with bz2.BZ2File(file_name, 'rb') as input_file:
            data = orjson.loads(input_file.read())

        # a missing or null metadata entry is treated as "no fields present"
        metadata = data.get('metadata') or {}

        # the contributor lookup never raises, so one oddly shaped contributor
        # entry can no longer wipe out the other fields
        return (
            short_name,
            metadata.get('pubDate'),
            metadata.get('title'),
            _first_contributor_name(metadata.get('contributor')),
            metadata.get('year'),
        )

    except Exception as e:
        # best-effort batch job: report the problem and keep going
        print(f"Error processing {file_name}: {e}")

    # return placeholder values
    return short_name, None, None, None, None

# load the list of text files written in english (one path per line)
file_path = 'engtexts.txt'
with open(file_path, 'r') as file:
    # read and split lines
    paths = file.read().splitlines()

# resume support: find where a previous run stopped, if there was one
csv_file = 'extracted_metadata.csv'
try:
    # only the file_name column is needed to locate the resume point
    dated_texts = pd.read_csv(csv_file, usecols=['file_name'])
    # start right after the last file already recorded in the CSV;
    # iloc[-1] raises IndexError when the CSV has a header but no rows
    index = paths.index(dated_texts['file_name'].iloc[-1]) + 1
except (FileNotFoundError, IndexError, pd.errors.EmptyDataError):
    # no CSV yet, a zero-byte CSV, or a CSV with no data rows:
    # start from the beginning
    index = 0

# slice paths to process only new files
paths = paths[index:]

# add folder prefix; extract_metadata drops the first 34 characters of each
# path, which is the length of this prefix, so the names written to the CSV
# match the entries of engtexts.txt used for the resume lookup above
prefix = '../../Volumes/My Passport for Mac/'
paths = [prefix + filename for filename in paths]

# open the csv in append
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # write the header only when the file is really empty; keying this on
    # index == 0 would append a second header to a header-only CSV
    if file.tell() == 0:
        writer.writerow(["file_name", "pub_date", "title", "first_contributor", "year"])

    # extract_metadata always returns a 5-tuple (placeholders on failure),
    # so every processed path produces exactly one row
    for name in paths:
        writer.writerow(extract_metadata(name))

print(f"CSV file '{csv_file}' has been updated with new metadata (file names, publication dates, titles, contributors, and years).")


CSV file 'rejected_extracted_pub_dates.csv' has been updated with new metadata (file names, publication dates, titles, contributors, and years).
