In [1]:
# load required dependencies
%pip install --upgrade pip
%pip install -r "./requirements.txt"





Collecting pip
  Downloading pip-24.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-24.1
Note: you may need to restart the kernel to use updated packages.
Collecting lxml>=5.2.1 (from -r ./requirements.txt (line 6))
  Using cached lxml-5.2.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Using cached lxml-5.2.2-cp312-cp312-manylinux_2_28_x86_64.whl (4.9 MB)
Installing collected packages: lxml
Successfully installed lxml-5.2.2
Note: you may need to restart the kernel to use updated packages.


In [17]:
# import required library modules
import pprint 
#import csv
import json
import os.path
import re
import pandas as pd
from IPython.display import display, HTML

# not used now?
def try_parse_int(s, base=10, val=None):
  try:
    return int(s, base)
  except ValueError:
    return val



def get_keyword_analysis_data() -> []:
    # read first tab of GoTriple Keywords analysis Google spreadsheet to local CSV file
    # subsequently use as cache, only do remote request if the local file doesn't exist
    #returns array of dict [{KEYWORD, COUNT}, {KEYWORD, COUNT}..]
    REMOTE_CSV = "https://docs.google.com/spreadsheets/d/1rI_BrE6BcyWCkipaKqTZ5bQJXcLwFMMkLpLMu1s7gKg/export?format=csv"
    LOCAL_CSV = "./data/gotriple-keywords-analysis.csv"
    if not os.path.exists(LOCAL_CSV):    
        df = pd.read_csv(REMOTE_CSV, skip_blank_lines=True)
        df.to_csv(LOCAL_CSV)      
    else: 
        df = pd.read_csv(LOCAL_CSV, skip_blank_lines=True)

    #print(df.columns)
    #print(df.head(10)) 
    # set any NaN values to blank string
    df.fillna("", inplace=True)
    items = df.to_dict(orient="records") 
    return items


def filter_by_regex(items: list=[], pattern: str="^.*$"):
    reg = re.compile(pattern)
    return list(filter(lambda item: reg.fullmatch(str(item.get("KEYWORD", ""))), items))


def categorize_by_regex(items: list=[], pattern: str="^.*$", category: str=""):
    matches = filter_by_regex(items, pattern)
    categorized = 0
    for item in matches:
        if item.get(category, "") == "":
            item["category"] = category
            categorized += 1
    print(f"\"{category}\" : {categorized} categorized")
    return matches


items = get_keyword_analysis_data()
print(f"item count = {len(items)}") # expected = 65,500
# categorizing the patterns identified in the keywords
# by categorizing first, we can then treat each keyword appropriately
# longer term - create a series of handlers for the different categories
# e,g, class ddc_handler: handler, 
#   .pattern : str (private), 
#   .category : str
#   .is_match(text: str) -> bool
#   .parse(text: str) -> dict
#
categorize_by_regex(items, r'^http://aims\.fao\.org/aos/agrovoc/(?P<code>c_\d+)$', "AGROVOC") # 111 categorized
categorize_by_regex(items, r'^\(BIC subject category\)(?P<code>[A-Z]+)(?::\s)?(?P<label>.*)$', "BIC") # 14 categorized
categorize_by_regex(items, r'^\(BISAC Subject Heading\)(?P<code>[A-Z]+[0-9]+)(?::\s)?(?P<label>.*)$', "BISAC") # 243 categorized
categorize_by_regex(items, r'^(?P<code>\d{2}\.\d{2})\s(?P<label>[A-ZÄÖÜß].*)$', "BK") # 124 categorized
categorize_by_regex(items, r'^ddc:(?P<code>[\d.]+)$', "DDC1") # 195 categorized
categorize_by_regex(items, r'^(?P<code>[\d]{3}(?:\.[\d]+)?)\s(?P<label>[A-ZÄÖÜß].+)$', "DDC2") # 214 categorized
categorize_by_regex(items, r'^\(DDC-Sachgruppen der Deutschen Nationalbibliografie\)(?P<code>[\d.]+)$', "DDC3") # 9 categorized
categorize_by_regex(items, r'^info:eu-repo/classification/ddc/(?P<code>[\d.]+).*$', "DDC4") # 74 categorized
categorize_by_regex(items, r'^/dk/atira/pure/core/keywords/(?P<label>[^0-9]+)$', "DK1") # 24 categorized
categorize_by_regex(items, r'^/dk/atira/pure/core/keywords/(?P<code>[0-9]+)$', "DK2") # 24 categorized
categorize_by_regex(items, r'^\[(?P<code>[A-Z\-]{3,}(\.[A-Z\-]+)+)\](?P<label>.*)$', "HAL") # 334 categorized
categorize_by_regex(items, r'^jel:(?P<code>[A-Z]\d+)$', "JEL1") # 329 categorized
categorize_by_regex(items, r'^JEL:\s(?P<multi>.*)$', "JEL2") # 196 categorized
categorize_by_regex(items, r'^MESH:\s(?P<label>[A-Z].+)$', "MESH") # 69 categorized
categorize_by_regex(items, r'^Settore\s(?P<code>[^\s]+)\s-\s(?P<label>[A-Z].*)$', "SSD") # 52 categorized
categorize_by_regex(items, r'^\(stw\)(?P<label>[A-ZÄÖÜß].+)$', "STW") # 42 categorized
categorize_by_regex(items, r'^Temporal coverage:(?P<label>.+)$', "TEMPORAL") # 98 categorized
categorize_by_regex(items, r'^(?P<code>\(VLB-[A-ZÄÖÜß]+\)\d+):\s(?P<label>.*)$', "VLB1") # 491 categorized
categorize_by_regex(items, r'^\((?P<code>VLB-[A-ZÄÖÜß]+)\)(?P<label>[A-Z].*$)', "VLB2") # 268 categorized
categorize_by_regex(items, r'^\[(?P<code>[A-Z]+-?[A-Z+](?:\.[[A-Z]+-?[A-Z+])*)\](?P<label>.*)$', "CNRS") # 250 categorized

#categorize_by_regex(items, r'^[A-Z]{1,2}\d+\-\d+$', "HYPHENATED_CODE")
#categorize_by_regex(items, r'^[A-Z]$', "SINGLE_UPPERCASE_LETTER")
#categorize_by_regex(items, r'^[A-Z\s]{2,}$', "ALL_UPPERCASE")
#categorize_by_regex(items, r'^\([A-Z]+\-[A-Z]+\).*$', "PREFIXED_ROUND_BRACKETS_HYPHENATED_CODE")
#reg = re.compile(r'^ddc\:\d{3}$')                    # Compile the regex
#lst = list(filter(lambda item: reg.fullmatch(item.get("keyword")), items))

# data cleansing steps
# TODO strip leading & trailing spaces, normalize remaining spaces 
# TODO normalize separators and punctuation
# TODO extract code(s) and label(s) from phrases
# TODO determine language of labels (if not a code)

# "CNRS_CLASSIFICATION":
# origin of these codes = Centre national de la recherche scientifique (CNRS) France
# Prefix eg "SHS" = "Sciences Humaines et Sociales" (Humanities and Social Sciences)
# "[SHS.ARCHEO] Humanities and Social Sciences/Archaeology and Prehistory"
# "[SHS.SCIPO] Humanities and Social Sciences/Political science" 

# VLB (Verzeichnis lieferbarer Bücher) genre
# e.g. (VLB-WN)

# example record with a mixture of keyword indexing:
# https://www.gotriple.eu/documents/dnb_1274564107
# English, German, VLB, BISAC (code), BISAC (code & labels), semicolon delimited values, prefixed values

# BISAC subject heading:
# see https://www.bisg.org/complete-bisac-subject-headings-list
# eg "(BISAC Subject Heading)FIC031000: FICTION / Thrillers / General"
# see https://www.bisg.org/fiction - listed here
#pprint.pprint(items)
with open("myfile.json", "w") as out_file:
    json.dump(items, out_file)

item count = 65500
"AGROVOC" : 111 categorized
"BIC" : 14 categorized
"BISAC" : 243 categorized
"BK" : 124 categorized
"DDC1" : 195 categorized
"DDC2" : 214 categorized
"DDC3" : 9 categorized
"DDC4" : 74 categorized
"DK1" : 10 categorized
"DK2" : 13 categorized
"HAL" : 334 categorized
"JEL1" : 329 categorized
"JEL2" : 196 categorized
"MESH" : 69 categorized
"SSD" : 52 categorized
"STW" : 42 categorized
"TEMPORAL" : 98 categorized
"VLB1" : 491 categorized
"VLB2" : 268 categorized
"CNRS" : 250 categorized
