# In [4]:
# find compound terms where the last word doesn't exist in the vocabulary
import json
import re

# read json file to list [ {id, term}, {id, term}, ... ]
def read_data(file_name: str="") -> list:
    """Load a JSON vocabulary file and project each record to {id, term}.

    Args:
        file_name: path of the UTF-8 encoded JSON file to read.

    Returns:
        A list with one dict per record in the file, holding the record's
        "id" under "id" and its "pattern" under "term". A key that is
        missing from a record is replaced by an empty string.
    """
    with open(file_name, mode="r", encoding="utf-8") as handle:
        records = json.load(handle)
    return [
        {"id": record.get("id", ""), "term": record.get("pattern", "")}
        for record in records
    ]


def term_exists(term: str="", data: list=[]) -> bool:
    """Report whether term matches the "term" of any record in data.

    Both sides are compared after stripping surrounding whitespace and
    lower-casing, so the match is case-insensitive.

    Args:
        term: the word or phrase to look for.
        data: list of dicts carrying a "term" key (missing keys count as "").
            The list is only read, never modified.

    Returns:
        True if at least one normalised record term equals the normalised
        term, otherwise False.
    """
    # normalise every record term up front, then the search term
    normalised = [entry.get("term", "").strip().lower() for entry in data]
    wanted = term.strip().lower()
    return wanted in normalised


def is_compound(term: str="") -> bool:
    """Return True when term is made up of two or more whitespace-separated words."""
    words = term.split()
    return len(words) >= 2


def get_last_word(term: str="") -> str:
    """Return the last whitespace-separated word of term.

    Args:
        term: the phrase to take the final word from.

    Returns:
        The final word, or an empty string when term is empty or contains
        only whitespace.
    """
    words = term.split()
    # an empty or whitespace-only term splits to [], and indexing [-1] on
    # that would raise IndexError, so fall back to an empty string
    return words[-1] if words else ""


def clean_term(term: str="") -> str:
    """Strip surrounding whitespace and remove bracketed qualifiers from term.

    A qualifier is a non-empty run of text enclosed in angle brackets or in
    round brackets; any whitespace directly in front of it is removed too.

    Args:
        term: the raw vocabulary term.

    Returns:
        The term without leading/trailing spaces and without qualifiers.
    """
    qualifier_patterns = (
        r"\s*\<[^\>]+\>",  # angle bracket qualifiers
        r"\s*\([^\)]+\)",  # round bracket qualifiers
    )
    result = term.strip()  # drop leading and trailing spaces first
    for pattern in qualifier_patterns:
        result = re.sub(pattern, "", result)
    return result


def get_compound_terms_1():
    """Report compound object terms whose last word is not itself a term.

    Reads the archaeological objects vocabulary, removes bracketed
    qualifiers from each term and, for every cleaned term of two or more
    words, checks whether its last word equals any (uncleaned) term in the
    same vocabulary, ignoring case and surrounding whitespace. Terms that
    fail the check are written to the output file, one per line, as
    tab-separated: id, quoted original term, quoted last word.
    """
    INPUT_FILE = "./rematch2/vocabularies/patterns_en_FISH_ARCHOBJECTS_20210921.json"
    OUTPUT_FILE = "./data/tmp/compound-terms-final-word-not-in-objects-thesaurus.txt"

    # read the input before opening the output, so a failed read does not
    # truncate a previously written report
    data = read_data(INPUT_FILE)

    # normalise the vocabulary once; a set gives constant-time membership
    # tests instead of rescanning every term for every item
    known_terms = {item.get("term", "").strip().lower() for item in data}

    # the input is UTF-8, so write UTF-8 rather than the platform default
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as output:
        for item in data:
            uri = item.get("id", "")
            term = item.get("term", "")
            clean = clean_term(term)
            # test for a compound first: a term that cleans to nothing has
            # no last word to extract
            if not is_compound(clean):
                continue
            last = get_last_word(clean)
            if last.lower() not in known_terms:
                output.write(f"{uri}\t\"{term}\"\t\"{last}\"\n")


def get_compound_terms_2():
    """Report compound monument type terms whose last word is not itself a term.

    Reads the monument types vocabulary, removes bracketed qualifiers from
    each term and, for every cleaned term of two or more words, checks
    whether its last word equals any (uncleaned) term in the same
    vocabulary, ignoring case and surrounding whitespace. Terms that fail
    the check are written to the output file, one per line, as
    tab-separated: id, quoted original term, quoted last word.
    """
    INPUT_FILE = "./rematch2/vocabularies/patterns_en_FISH_MONUMENT_TYPES_20210921.json"
    OUTPUT_FILE = "./data/tmp/compound-terms-final-word-not-in-monuments-thesaurus.txt"

    # read the input before opening the output, so a failed read does not
    # truncate a previously written report
    data = read_data(INPUT_FILE)

    # normalise the vocabulary once; a set gives constant-time membership
    # tests instead of rescanning every term for every item
    known_terms = {item.get("term", "").strip().lower() for item in data}

    # the input is UTF-8, so write UTF-8 rather than the platform default
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as output:
        for item in data:
            uri = item.get("id", "")
            term = item.get("term", "")
            clean = clean_term(term)
            # test for a compound first: a term that cleans to nothing has
            # no last word to extract
            if not is_compound(clean):
                continue
            last = get_last_word(clean)
            if last.lower() not in known_terms:
                output.write(f"{uri}\t\"{term}\"\t\"{last}\"\n")


def get_compound_terms_3():
    """List object terms that contain a bracketed qualifier.

    Reads the archaeological objects vocabulary and writes every term that
    contains an angle-bracket or round-bracket qualifier to the output
    file, one per line, as tab-separated: id, quoted term. Each matching
    term is written once, even when it carries both kinds of qualifier.
    """
    INPUT_FILE = "./rematch2/vocabularies/patterns_en_FISH_ARCHOBJECTS_20210921.json"
    OUTPUT_FILE = "./data/tmp/compound-terms-in-objects-thesaurus-with-bracketed-suffix.txt"

    # one pattern for both qualifier kinds; the round-bracket branch excludes
    # ")" from its content (not ">"), matching the pattern clean_term uses
    qualifier = re.compile(r"\s*(?:\<[^\>]+\>|\([^\)]+\))")

    # read the input before opening the output, so a failed read does not
    # truncate a previously written report
    data = read_data(INPUT_FILE)

    # the input is UTF-8, so write UTF-8 rather than the platform default
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as output:
        for item in data:
            uri = item.get("id", "")
            term = item.get("term", "")
            if qualifier.search(term):
                output.write(f"{uri}\t\"{term}\"\n")


def get_compound_terms_4():
    """List monument type terms that contain a bracketed qualifier.

    Reads the monument types vocabulary and writes every term that
    contains an angle-bracket or round-bracket qualifier to the output
    file, one per line, as tab-separated: id, quoted term. Each matching
    term is written once, even when it carries both kinds of qualifier.
    """
    INPUT_FILE = "./rematch2/vocabularies/patterns_en_FISH_MONUMENT_TYPES_20210921.json"
    OUTPUT_FILE = "./data/tmp/compound-terms-in-monuments-thesaurus-with-bracketed-suffix.txt"

    # one pattern for both qualifier kinds; the round-bracket branch excludes
    # ")" from its content (not ">"), matching the pattern clean_term uses
    qualifier = re.compile(r"\s*(?:\<[^\>]+\>|\([^\)]+\))")

    # read the input before opening the output, so a failed read does not
    # truncate a previously written report
    data = read_data(INPUT_FILE)

    # the input is UTF-8, so write UTF-8 rather than the platform default
    with open(OUTPUT_FILE, 'w', encoding="utf-8") as output:
        for item in data:
            uri = item.get("id", "")
            term = item.get("term", "")
            if qualifier.search(term):
                output.write(f"{uri}\t\"{term}\"\n")


if __name__ == '__main__':
    # when run as a script, produce all four reports in order
    for build_report in (
        get_compound_terms_1,
        get_compound_terms_2,
        get_compound_terms_3,
        get_compound_terms_4,
    ):
        build_report()
       

