In [2]:
# find compound terms where the last word doesn't exist in the vocabulary
import json
import re

# read json file to list [ {id, term}, {id, term}, ... ]
def read_data(file_name: str="") -> list:
    """Load a JSON vocabulary file and return its records as id/term dicts.

    Each record of the parsed JSON is reduced to a dict with two keys:
    "id" (taken from the record's "id") and "term" (taken from the
    record's "pattern"). A key missing from the record yields "".
    """
    with open(file_name, mode="r", encoding="utf-8") as handle:
        records = json.load(handle)
    return [
        {"id": record.get("id", ""), "term": record.get("pattern", "")}
        for record in records
    ]


def term_exists(term: str = "", data: list = None) -> bool:
    """Return True if term matches the "term" of any record in data.

    The comparison ignores case and leading/trailing whitespace on both
    sides.

    Args:
        term: the word or phrase to look for.
        data: list of dicts carrying a "term" key; None is treated as an
            empty list (avoids a shared mutable default argument).
    """
    wanted = term.strip().lower()
    for item in data or ():
        # A record whose term is missing or None (JSON null) is compared
        # as an empty string instead of raising AttributeError.
        candidate = item.get("term") or ""
        if candidate.strip().lower() == wanted:
            return True  # stop at the first match
    return False


def is_compound(term: str="") -> bool:
    """Return True when term contains two or more whitespace-separated words."""
    word_count = len(term.split())
    return word_count >= 2


def get_last_word(term: str = "") -> str:
    """Return the last whitespace-separated word of term.

    Returns "" when term is empty or contains only whitespace, where
    indexing the empty split result would raise IndexError.
    """
    words = term.split()
    return words[-1] if words else ""


def clean_term(term: str = "") -> str:
    """Return term with bracketed qualifiers and surrounding spaces removed.

    Qualifiers are non-empty runs enclosed in angle brackets (<...>) or
    round brackets ((...)), together with any whitespace directly before
    them.
    """
    cleaned = term.strip()  # strip leading and trailing spaces
    cleaned = re.sub(r"\s*<[^>]+>", "", cleaned)  # remove angle bracket qualifiers
    cleaned = re.sub(r"\s*\([^)]+\)", "", cleaned)  # remove round bracket qualifiers
    # Removing a qualifier at the start of the term leaves the space that
    # followed it, so strip once more after the substitutions.
    return cleaned.strip()


if __name__ == '__main__':
    # read input data from file (FILE1 is the alternative vocabulary;
    # pass it to read_data instead of FILE2 to analyse it)
    FILE1 = "./rematch2/vocabularies/patterns_en_FISH_ARCHOBJECTS_20210921.json"
    FILE2 = "./rematch2/vocabularies/patterns_en_FISH_MONUMENT_TYPES_20210921.json"
    data = read_data(FILE2)

    # Normalise every vocabulary term once, so each membership test below
    # is a set lookup rather than a rescan of the whole vocabulary.
    known_terms = {item.get("term", "").strip().lower() for item in data}

    # find compound terms where the last word is not in the vocabulary
    for item in data:
        uri = item.get("id", "")
        term = item.get("term", "")
        clean = clean_term(term)
        # Test for a compound first: a term that is empty after cleaning
        # has no last word to extract.
        if not is_compound(clean):
            continue
        last = get_last_word(clean)
        if last.strip().lower() not in known_terms:
            print(f"{uri}\t\"{term}\"\t\"{last}\"")
       



http://purl.org/heritagedata/schemes/eh_tmt2/concepts/70379	"A P Linear System"	"System"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/94047	"Aa Box"	"Box"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/103392	"Abbots Lodging"	"Lodging"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/140311	"Ablutions Block"	"Block"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/71687	"Academy Of Art"	"Art"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/92722	"Academy Of Music"	"Music"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/93370	"Acoustic Detection Post"	"Post"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/93370	"Acoustic Mirror"	"Mirror"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/143374	"Activity Centre"	"Centre"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/69048	"Adit Entrance"	"Entrance"
http://purl.org/heritagedata/schemes/eh_tmt2/concepts/71359	"Administration Block"	"Block"
http://purl.org/heritagedata/schemes/eh_tmt2/