In [1]:
%%capture
# load required dependencies
%pip install --upgrade pip
%pip install -r "./requirements.txt"

## Categorisation and parsing of GoTriple keyword values

This notebook demonstrates categorization of GoTriple keywords by comparing them to predefined known patterns. It also demonstrates the parsing of identification codes and/or labels (where possible). A keyword analysis was undertaken on GoTriple resources by Net7 for the ATRIUM project, the results were uploaded to a Google Sheet for review and further processing. It was observed that the 'KEYWORD' column sometimes contains values that are not free-text words or phrases, but instead are codes (or codes combined with terms) representing concepts from an external scheme (e.g. DDC, AGROVOC, MeSH). Some observed examples are listed below

### CNRS CLASSIFICATION
The origin of these codes is "Centre national de la recherche scientifique (CNRS) France"
Examples from the keywords analysis spreadsheet:
* "[SHS.ARCHEO] Humanities and Social Sciences/Archaeology and Prehistory"
* "[SHS.SCIPO] Humanities and Social Sciences/Political science" 

### VLB (Verzeichnis lieferbarer Bücher) genre
Examples from the keywords analysis spreadsheet:
* "(VLB-WN)2115: TB/Belletristik/Anthologien"
* "(VLB-WN)2580: Taschenbuch / Kunst"

### BISAC subject headings
See [https://www.bisg.org/complete-bisac-subject-headings-list](https://www.bisg.org/complete-bisac-subject-headings-list)
eg "(BISAC Subject Heading)FIC031000: FICTION / Thrillers / General" - see https://www.bisg.org/fiction - listed here

### Mixed indexing
Example of a GoTriple record with a mixture of keyword indexing: [https://www.gotriple.eu/documents/dnb_1274564107](https://www.gotriple.eu/documents/dnb_127456410)
The keywords section contains English, German, VLB, BISAC (code), BISAC (code & labels), semicolon delimited values, prefixed values.

In [27]:
# import required library modules
import json
import os.path
import re
import pandas as pd
from collections import Counter
from IPython.display import display, HTML


def normalize_whitespace(s: str="") -> str:
    return " ".join(s.strip().split()) 
    

def get_gotriple_keywords_analysis_data() -> []:
    # read first tab of GoTriple Keywords analysis Google spreadsheet to local CSV file
    # subsequently use as cache, only do remote request if the local file is not present
    # returns array of dict [{KEYWORD, COUNT}, {KEYWORD, COUNT}, ..]
    REMOTE_CSV = "https://docs.google.com/spreadsheets/d/1rI_BrE6BcyWCkipaKqTZ5bQJXcLwFMMkLpLMu1s7gKg/export?format=csv"
    LOCAL_CSV = "./data/gotriple-keywords-analysis.csv"
    if not os.path.exists(LOCAL_CSV):    
        df = pd.read_csv(REMOTE_CSV, skip_blank_lines=True, index_col=0)
        df.to_csv(LOCAL_CSV)      
    else: 
        df = pd.read_csv(LOCAL_CSV, skip_blank_lines=True, index_col=0)

     # set any NaN values to blank string
    df.fillna("", inplace=True)
    return df.to_dict(orient="records") # TODO: need to avoid outputting the 'unnamed' index field..
    

def categorize_by_regex(items: list=[], pattern: str="^.*$", category: str=""):
    reg = re.compile(pattern)
    
    # try matching the given pattern against all currently uncategorised items
    uncategorized_items = filter(lambda item: item.get(category, "") == "", items)
    for item in uncategorized_items:
        keyword = normalize_whitespace(str(item.get("KEYWORD", "")))
        if(keyword != ""):
            match = reg.fullmatch(keyword)
            # if matched, supplement item with additional properties
            if match is not None:
                item["category"] = category
                groups = match.groupdict()            
                item["code"] = groups.get("code", "")
                item["subcode"] = groups.get("subcode", "")
                item["label"] = groups.get("label", "")


def categorize(items: list=[]):
    # try matching keywords against each regular expression pattern. 
    # Patterns contain named groups for parsing of codes and labels
    categorize_by_regex(items, r'^http://aims\.fao\.org/aos/agrovoc/(?P<code>c_\d+)$', "AGROVOC") # 111 categorized
    categorize_by_regex(items, r'^\(BIC subject category\)(?P<code>[A-Z]+)(?::\s)?(?P<label>.*)$', "BIC") # 14 categorized
    categorize_by_regex(items, r'^\(BISAC Subject Heading\)(?P<code>[A-Z]+[0-9]+)(?::\s)?(?P<label>.*)$', "BISAC") # 243 categorized
    categorize_by_regex(items, r'^(?P<code>\d{2}\.\d{2})\s(?P<label>[A-ZÄÖÜß].*)$', "BK") # 124 categorized
    categorize_by_regex(items, r'^ddc:(?P<code>[\d.]+)$', "DDC") # 195 categorized
    categorize_by_regex(items, r'^ddc:(?P<code>[\d]+\.[\d]+)$', "DDC") #  21 categorized
    categorize_by_regex(items, r'^(?P<code>[\d]{3}(?:\.[\d]+)?)\s(?P<label>[A-ZÄÖÜß].+)$', "DDC") # 214 categorized
    categorize_by_regex(items, r'^\(DDC-Sachgruppen der Deutschen Nationalbibliografie\)(?P<code>[\d.]+)$', "DDC") # 9 categorized
    categorize_by_regex(items, r'^info:eu-repo/classification/ddc/(?P<code>[\d.]+).*$', "DDC") # 74 categorized
    categorize_by_regex(items, r'^/dk/atira/pure/core/keywords/(?P<label>[^0-9]+)$', "DK") # 24 categorized
    categorize_by_regex(items, r'^/dk/atira/pure/core/keywords/(?P<code>[0-9]+)$', "DK") # 24 categorized
    categorize_by_regex(items, r'^\[(?P<code>[A-Z\-]{3,}(\.[A-Z\-]+)+)\](?P<label>.*)$', "HAL") # 334 categorized
    categorize_by_regex(items, r'^jel:(?P<code>[A-Z]\d+)$', "JEL") # 329 categorized
    categorize_by_regex(items, r'^JEL:\s(?P<code>[^\s]+)\s-\s(?P<label>.*)$', "JEL") # 196 categorized
    #categorize_by_regex(items, r'^JEL:\s(?P<code>.*)$', "JEL3") # 196 categorized
    categorize_by_regex(items, r'^MESH:\s(?P<label>[A-Z].+)$', "MESH") # 69 categorized
    categorize_by_regex(items, r'^Settore\s(?P<code>[^\s]+)\s-\s(?P<label>[A-Z].*)$', "SSD") # 52 categorized
    categorize_by_regex(items, r'^\(stw\)(?P<label>[A-ZÄÖÜß].+)$', "STW") # 42 categorized
    categorize_by_regex(items, r'^Temporal coverage:(?P<label>.+)$', "TEMPORAL") # 98 categorized
    categorize_by_regex(items, r'^\((?P<code>VLB-[A-ZÄÖÜß]+)\)(?P<subcode>[^:\s]+):\s(?P<label>.*$)', "VLB") # 516 categorized   
    categorize_by_regex(items, r'^\[(?P<code>[A-Z]+-?[A-Z+](?:\.[[A-Z]+-?[A-Z+])*)\](?P<label>.*)$', "CNRS") # 250 categorized
    categorize_by_regex(items, r'^\(Produktform\)(?P<label>.*)$', "UNKNOWN2") # 20 categorized
    categorize_by_regex(items, r'^(?P<code>[A-Z]+[0-9.]+)$', "UNKNOWN3") # 844 categorized
    categorize_by_regex(items, r'^(?P<code>[A-Z]+[0-9]+)\s(?P<label>.*)$', "UNKNOWN4") # 393 categorized
    categorize_by_regex(items, r'^(?P<code>[A-Z]+[0-9.]+-[\d\.]+)$', "UNKNOWN5") # 351 categorized
    categorize_by_regex(items, r'^(?P<code>\d+)$', "NUMBER") # 295 categorized
    categorize_by_regex(items, r'^(?P<code>[A-Z])$', "ALPHA") # 23 categorized
    categorize_by_regex(items, r'^(?:Exact|Close)\sMatch:\s(?P<label>.+)$', "UNKNOWN6") # 19 categorized
    #categorize_by_regex(items, r'^(?P<code>[A-Z0-9]+)$', "ALPHANUM") # 


def summarize(items):
    # summarize count of items per category
    summary = {}
    # calculate counts by category
    values = list(map(lambda i: i.get("category", "(BLANK)"), items))
    counts = list(Counter(values).items()) 
    counts.sort(key=lambda t: t[0])

    # display the calculated counts
    print(f"item count = {len(items)}") # expected = 65,500
    print("categories:")
    for category, count in counts:
        print(f"\"{category}\" : {count}")
 

if __name__ == "__main__":
    # get the input data
    items = get_gotriple_keywords_analysis_data()

    # process the input data
    categorize(items)
    summarize(items)

    # write results to a JSON file
    with open("./data/gotriple-keywords-categorize-results.json", "w") as out_file:
        json.dump(items, out_file)

item count = 65500
categories:
"(BLANK)" : 60899
"AGROVOC" : 111
"ALPHA" : 23
"BIC" : 14
"BISAC" : 243
"BK" : 124
"CNRS" : 250
"DDC" : 492
"DK" : 23
"HAL" : 97
"JEL" : 525
"MESH" : 69
"NUMBER" : 295
"SSD" : 52
"STW" : 42
"TEMPORAL" : 98
"UNKNOWN2" : 20
"UNKNOWN3" : 844
"UNKNOWN4" : 393
"UNKNOWN5" : 351
"UNKNOWN6" : 19
"VLB" : 516
