# In [None]:
#pull in the libraries we need
import bz2
import orjson
import csv
import pandas as pd

#extract pub_date from metadata portion
def extract_pub_date(file_name, *, prefix_len=34):
    """Read one bz2-compressed JSON file and return its publication date.

    Args:
        file_name: Path of the bz2-compressed JSON file to open.
        prefix_len: Number of leading characters dropped from ``file_name``
            in the returned tuple. The default, 34, is the length of the
            folder prefix ``'../../Volumes/My Passport for Mac/'`` that the
            script prepends to every path, so the returned name is the
            path without that prefix.

    Returns:
        ``(file_name[prefix_len:], pub_date)`` when the parsed JSON object
        has a truthy ``metadata.pubDate`` value, otherwise ``None``.
        Any error raised while opening, decompressing or parsing the file
        is printed and also results in ``None`` instead of propagating.
    """
    try:
        #open and read the compressed file
        with bz2.open(file_name, 'rb') as input_file:
            raw_json = input_file.read()

        #orjson parses UTF-8 bytes directly, so no separate decode step is needed
        data = orjson.loads(raw_json)

        #extract metadata portion, get pub_date
        pub_date = data.get('metadata', {}).get('pubDate')

    except Exception as e:
        #best effort: report the problem and treat the file as undated
        print(f"Error processing {file_name}: {e}")
        return None

    #if its there return with the (prefix-stripped) file name
    if pub_date:
        return file_name[prefix_len:], pub_date

    #if no pubdate is found
    return None

#load list of english texts (one path per line)
file_path = 'engtexts.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    # split lines
    paths = file.read().splitlines()

#load existing data to work out where a previous run stopped
csv_file = 'extracted_pub_dates.csv'
index = 0
#the header is needed only when the CSV has no content at all; deciding this
#from `index == 0` would append a second header to a header-only CSV
write_header = False
try:
    dated_texts = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    #no CSV yet, or a zero-byte one: start from the first path and emit the header
    write_header = True
else:
    recorded_names = dated_texts['file_name']
    #a header-only CSV has no rows: keep index at 0 but do not repeat the header
    if len(recorded_names) > 0:
        last_done = recorded_names.iloc[-1]
        try:
            #find starting index for new paths: just after the last recorded file
            index = paths.index(last_done) + 1
        except ValueError as err:
            #restarting at 0 here would append duplicate rows, so stop with a clear message
            raise ValueError(
                f"Last file recorded in '{csv_file}' ({last_done!r}) is not listed in "
                f"'{file_path}'; cannot tell where to resume."
            ) from err

# slice paths to ensure we're only pulling on new files
paths = paths[index:]

#add folder prefix
prefix = '../../Volumes/My Passport for Mac/'
paths = [prefix + filename for filename in paths]

#open csv file in append mode so earlier results are kept
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    #header row
    if write_header:
        writer.writerow(["file_name", "pub_date"])

    #iterate through and append; files without a pub_date (or unreadable) are skipped
    for name in paths:
        extracted_date = extract_pub_date(name)
        if extracted_date:
            writer.writerow(extracted_date)

print(f"CSV file '{csv_file}' has been updated with new file names and publication dates.")
