<a href="https://colab.research.google.com/github/aliciama16/is262a/blob/main/Internet_Archive_Metadata_API_Pull.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To access and work with Internet Archive metadata, you can use the Metadata API (MDAPI), which allows you to retrieve, add, and update item metadata via HTTP RESTful endpoints. You can also use the ia command-line interface or the internetarchive Python library.  

https://blog.archive.org/2013/07/04/metadata-api/#:~:text=For%20example%2C%20frenchenglishmed00gorduoft%20is%20the,identifier/files/0/name

In [1]:
#import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

#Artist Files to pull: Internet Archive item identifiers, one per line
#(each one is substituted into https://archive.org/metadata/<identifier>)
identifiers = [
    "artistfilebaldessarijohn_202007",
]

#clean HTML tags & extract text
def clean_html(raw_text):
    """Strip HTML from a description and return its plain text.

    A list input is first joined into one string with single spaces.
    If the stripped text contains a "Note: ... Rights:" span (labels matched
    case-insensitively), only the text between the two labels is returned;
    otherwise the whole stripped text is returned. Falsy input (None, an
    empty string, an empty list) yields "No Description".
    """
    text = " ".join(raw_text) if isinstance(raw_text, list) else raw_text
    if not text:
        return "No Description"

    plain = BeautifulSoup(text, "html.parser").get_text(separator=" ").strip()

    #keep only the portion between the "Note:" and "Rights:" labels, if present
    note = re.search(r'(?i)Note:\s*(.*?)\s*Rights:', plain)
    if note is None:
        return plain
    return note.group(1).strip()

#flatten multi-valued metadata fields into one string
def _as_text(value, separator="; "):
    """Return *value* as a single string when it is a list.

    A metadata field may hold a list when an item has several values for it;
    lists are joined with *separator* so the exported cell is plain text
    rather than a Python list repr. Non-list values are returned unchanged.
    """
    if isinstance(value, list):
        return separator.join(str(entry) for entry in value)
    return value


#fetch metadata from the Internet Archive API
def fetch_metadata(identifier, timeout=30):
    """Fetch one item's metadata and map it onto the collection's columns.

    Args:
        identifier: Internet Archive item identifier.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict keyed by column name, or None when the response status is not
        200 or the response carries no metadata for the item.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://archive.org/metadata/{identifier}"
    #a timeout keeps one unresponsive request from stalling the whole pull
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return None

    data = response.json()
    metadata = data.get("metadata", {})
    #archive.org answers unknown identifiers with HTTP 200 and an empty object;
    #treat that as "not found" instead of emitting a row of placeholder values
    if not metadata:
        return None
    files = data.get("files", [])

    #extract digitization details
    scanner = _as_text(metadata.get("scanner", "Unknown Scanner"))
    ppi = _as_text(metadata.get("ppi", "Unknown PPI"))

    #extract file size (size of the first listed file that reports one)
    file_size = next(
        (f"{entry['size']} bytes" for entry in files if "size" in entry),
        "Unknown Size",
    )

    #format digitization specifications
    digitization_specs = f"Scanned using {scanner}, {ppi} ppi. File size: {file_size}."

    #extract and clean description
    raw_description = metadata.get("description", "No Description")
    clean_description = clean_html(raw_description)

    #keep only the date part of the "addeddate" value (text before the first space)
    added_date = metadata.get("addeddate", "")
    date_digital = added_date.split(" ")[0] if added_date else ""

    #metadata columns
    return {
        "Item Type": "URL",
        "URL": f"https://archive.org/embed/{identifier}/",
        "Title": _as_text(metadata.get("title", "Unknown Title")),
        "Creator": _as_text(metadata.get("creator", "Unknown Creator")),
        "Publisher": "Los Angeles County Museum of Art",
        "Place of Publication": "Los Angeles (Calif.)",
        "Type": "Artist File",
        "Description": clean_description,
        "Date": _as_text(metadata.get("date", "Unknown Date")),
        "DCMI Type": "Collection",
        "Subject": _as_text(metadata.get("subject", "No Subject")),
        "Language": _as_text(metadata.get("language", "und")),
        "Digitization Specifications": digitization_specs,
        #library name spelling corrected from "Blach" to "Balch"
        "Repository": "Los Angeles County Museum of Art. Mr. and Mrs. Allan C. Balch Art Research Library",
        "Digital Collection": "LACMA Digital Collections",
        "Date Digital": date_digital,
        "Image Request": "To request the use of any of these images, contact rights@lacma.org",
        "Rights": "Rights are owned by Los Angeles County Museum of Art. Transmission or reproduction of materials protected by copyright beyond that allowed by fair use requires the written permission of the Copyright Holder. In addition, the reproduction of some materials may be restricted by terms of gift or purchase agreements, donor restrictions, privacy and publicity rights, licensing and trademarks. Works not in the public domain cannot be commercially exploited without permission of the copyright owner. Responsibility for any use rests exclusively with the user.",
    }

#first number used when building each Unique Digital Identifier
start_index = 1  # Change this number based on where you want to start (e.g., dc2_007)

#pull every item; each is numbered by its position in `identifiers`,
#and items whose fetch returned nothing are left out
metadata_list = []
for number, identifier in enumerate(identifiers, start=start_index):
    record = fetch_metadata(identifier)
    if not record:
        continue
    record["Unique Digital Identifier"] = f"dc2_{str(number).zfill(3)}"  # Change dc2 to the collection number
    metadata_list.append(record)

#build the DataFrame from the collected rows
metadata_df = pd.DataFrame(metadata_list)

#preview the first 20 rows
metadata_df.head(20)



Unnamed: 0,Item Type,URL,Title,Creator,Publisher,Place of Publication,Type,Description,Date,DCMI Type,Subject,Language,Digitization Specifications,Repository,Digital Collection,Date Digital,Image Request,Rights,Unique Digital Identifier
0,URL,https://archive.org/embed/artistfilebaldessari...,Artist file : John Baldessari : miscellaneous ...,"Baldessari, John, 1931-",Los Angeles County Museum of Art,Los Angeles (Calif.),Artist File,Artist files are miscellaneous uncataloged mat...,Unknown Date,Collection,"Artist files ; Baldessari, John, 1931-",und,Scanned using Internet Archive HTML5 Uploader ...,Los Angeles County Museum of Art. Mr. and Mrs....,LACMA Digital Collections,2020-07-08,"To request the use of any of these images, con...",Rights are owned by Los Angeles County Museum ...,dc2_001


Format to a TXT File

In [None]:
#normalise every text cell in one pass so it fits on a single tab-delimited line
def _clean_cell(value):
    """Return a text cell flattened to one line; non-string cells pass through.

    Tabs become spaces, line breaks become single spaces, and double quotes
    become single quotes (avoids CONTENTdm duplication issues).
    """
    if not isinstance(value, str):
        return value
    flattened = " ".join(value.replace("\t", " ").splitlines())
    return flattened.replace('"', "'")


#DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
#use whichever one the installed pandas provides
if hasattr(pd.DataFrame, "map"):
    metadata_df = metadata_df.map(_clean_cell)
else:
    metadata_df = metadata_df.applymap(_clean_cell)



Export File

In [None]:
import csv

#destination of the export; replace NAMEFILEHERE with the real file name
output_path = "/content/NAMEFILEHERE.txt"

#write the metadata as a tab-delimited text file
metadata_df.to_csv(
    output_path,
    sep="\t",  #tab-separated
    index=False,  #no index column
    encoding="utf-8-sig",  #UTF-8 with a leading byte-order mark
    quoting=csv.QUOTE_NONE,  #prevents added quotes
    escapechar="\\",  #special characters don't break formatting
)

#status message (check mark restored from its mis-decoded form)
print(f"✅ Tab-delimited file saved as: {output_path}")
