In [1]:
%%capture

import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install pandas urllib

In [None]:
# script to download latest FISH vocabularies from heritage-standards.org.uk/fish-vocabularies/
# process them and output JSON files of spaCy NER patterns for use in entity recognition

import json
import os
import requests
import zipfile
import shutil
from slugify import slugify
import pandas
from urllib.parse import urlparse, unquote, quote_plus

# HeritageData used a standardised list of scheme_id for vocabularies, 
# _sometimes_ different to the (numeric) CLA_GR_UID as used in AMIE DB
# Convert a vocabulary ID to the corresponding HeritageData scheme_id
def vocabulary_id_to_heritagedata_scheme_id(vocabulary_id: str) -> str:
    """Translate a vocabulary ID into the corresponding HeritageData scheme_id.

    Args:
        vocabulary_id: vocabulary identifier (the CLA_GR_UID value) as text.

    Returns:
        The HeritageData scheme_id registered for ``vocabulary_id``, or
        ``vocabulary_id`` itself when the lookup table has no entry for it.
    """
    known_scheme_ids = dict([
        ("1", "eh_tmt2"),
        ("92", "eh_evd"),
        ("128", "mda_obj"),
        ("129", "eh_tbm"),
        ("143", "eh_tmc"),
        ("546", "eh_com"),
        ("566", "agl_et"),
    ])
    # identifiers without a dedicated scheme_id are passed through unchanged
    if vocabulary_id in known_scheme_ids:
        return known_scheme_ids[vocabulary_id]
    return vocabulary_id


# download file from URL to output path and return filename including path
# use a previously cached file if it exists, rather than downloading again
def download_file_from_url(url: str, output_path: str=".") -> str:
    """Download a file into ``output_path`` unless a cached copy already exists.

    The local file name is the last path segment of ``url``; any query string
    is ignored.

    Args:
        url: address of the file to fetch.
        output_path: directory to store the file in; it is created if missing.

    Returns:
        Path of the local file (either the cached copy or the fresh download).

    Raises:
        ValueError: if no file name can be derived from ``url``.
        requests.HTTPError: if the server answers with an error status.
    """
    # derive the local file name from the URL, ignoring any query string
    file_name = url.split("?")[0].split("/")[-1]
    if not file_name:
        # otherwise the output directory itself would be mistaken for a cached file
        raise ValueError(f"Cannot derive a file name from URL: {url}")

    # ensure the intended output path exists
    if output_path:
        os.makedirs(output_path, exist_ok=True)
    file_path = os.path.join(output_path, file_name)

    # just use previously cached file if it exists
    if os.path.exists(file_path):
        print(f"Using cached file {file_path}")
        return file_path

    print(f"Downloading file {url} to {file_path}")
    # stream into a temporary name first, so that an interrupted or failed
    # download is never picked up as a valid cached file on the next run
    partial_path = f"{file_path}.part"
    try:
        with requests.get(url, timeout=30, stream=True) as response:
            # fail loudly instead of caching an HTTP error page as the file
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    finally:
        # only present here if the download did not complete
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return file_path


# Zip archives of CSV data, as listed on https://heritage-standards.org.uk/fish-vocabularies/
# NB: the links to the 'latest' archives change over time - check that page when updating this list
remote_file_urls = [
    "https://heritage-standards.org.uk/2025/zip_files/ArchaeologicalObjectsV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/ArchaeologicalSciencesV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/BuildingMaterialsV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/ComponentsV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/EventV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/EvidenceV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/MaritimeCraftV28.zip",
    "https://heritage-standards.org.uk/2025/zip_files/MonumentV28.zip",
]

# local directory used for the downloaded zip files and the generated pattern files
local_directory = "./data/fish/"

# For each vocabulary archive: download (or reuse) the zip, extract it, read the
# term and term-preference CSV files, and write one JSON file of patterns.
for url in remote_file_urls:
    # download and cache the zip file locally
    local_file_path = download_file_from_url(url, local_directory)

    # create directory to extract zip contents into
    local_file_name = url.split("?")[0].split("/")[-1]
    local_data_path = os.path.join(local_directory, slugify(local_file_name))
    os.makedirs(local_data_path, exist_ok=True)

    try:
        # extract zip contents locally to named directory
        with zipfile.ZipFile(local_file_path, 'r') as zf:
            zf.extractall(local_data_path)

        # read only the data files we are interested in for this exercise.
        # Every column is read as text: with inferred dtypes an identifier column
        # containing a blank is parsed as float ("123" becomes "123.0") and a
        # blank TERM becomes the literal label "Nan".
        terms_file_name = os.path.join(local_data_path, "thesaurus_terms.csv")
        df = pandas.read_csv(terms_file_name, dtype=str, keep_default_na=False, encoding_errors="replace")
        terms = df.to_dict(orient="records")

        prefs_file_name = os.path.join(local_data_path, "thesaurus_term_preferences.csv")
        df = pandas.read_csv(prefs_file_name, dtype=str, keep_default_na=False, encoding_errors="replace")
        prefs = df.to_dict(orient="records")
    finally:
        # clean up intermediate files (but keeping cached zip files),
        # also when extraction or reading fails part-way through
        shutil.rmtree(local_data_path, ignore_errors=True)

    # index THE_TE_UID_1 -> THE_TE_UID_2 once, instead of scanning the whole
    # preferences list for every term; setdefault keeps the first row per
    # identifier, which is what a front-to-back search would have found
    preferred_term_ids = {}
    for item in prefs:
        source_id = str(item.get("THE_TE_UID_1", "")).strip()
        if source_id:
            preferred_term_ids.setdefault(source_id, str(item.get("THE_TE_UID_2", "")).strip())

    # generate a list of spaCy patterns for use in the NER
    data = []
    scheme_id = "unknown"

    for term in terms:
        # scheme_id is kept from the last term read and names the output file below
        vocabulary_id = str(term.get("CLA_GR_UID", "unknown")).strip()
        scheme_id = quote_plus(vocabulary_id_to_heritagedata_scheme_id(vocabulary_id))

        # NOTE(review): str.title() also capitalises the letter following an
        # apostrophe (e.g. "Bishop'S") - confirm this is acceptable for matching
        term_label = str(term.get("TERM", "")).strip().title() # title case for diff...
        if not term_label:
            # a blank label would only yield an empty, unusable pattern
            continue

        # point the pattern at the preferred term's identifier when one is listed
        raw_term_id = str(term.get("THE_TE_UID", "unknown")).strip()
        term_id = quote_plus(preferred_term_ids.get(raw_term_id) or raw_term_id)

        concept_id = f"http://purl.org/heritagedata/schemes/{scheme_id}/concepts/{term_id}"
        data.append({"id": concept_id, "pattern": term_label })

    # write out the patterns to a JSON file
    output_file_name = os.path.join(local_directory, f"patterns_en_FISH_{scheme_id}.json")
    with open(output_file_name, "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent=2)



Using cached file ./data/fish//ArchaeologicalObjectsV28.zip
Using cached file ./data/fish//ArchaeologicalSciencesV28.zip
Using cached file ./data/fish//BuildingMaterialsV28.zip
Using cached file ./data/fish//ComponentsV28.zip
Using cached file ./data/fish//EventV28.zip
Using cached file ./data/fish//EvidenceV28.zip
Using cached file ./data/fish//MaritimeCraftV28.zip
Using cached file ./data/fish//MonumentV28.zip
