x

In [1]:
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
#%pip install ipywidgets
%pip install -U pycld2
%sx python -m spacy download en_core_web_sm
%sx python -m spacy download de_core_news_sm
%sx python -m spacy download fr_core_news_sm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting pycld2
  Downloading pycld2-0.42-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Downloading pycld2-0.42-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m47.5 MB/s[0m  [33m0:00:00[0mm0:00:01[0m
[?25hInstalling collected packages: pycld2
Successfully installed pycld2-0.42
Note: you may need to restart the kernel to use updated packages.


['Collecting fr-core-news-sm==3.8.0',
 '  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)',
 '\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/16.3 MB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m\x1b[91m╸\x1b[0m \x1b[32m16.3/16.3 MB\x1b[0m \x1b[31m218.2 MB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m16.3/16.3 MB\x1b[0m \x1b[31m68.4 MB/s\x1b[0m  \x1b[33m0:00:00\x1b[0m',
 '\x1b[?25h\x1b[38;5;2m✔ Download and installation successful\x1b[0m',
 "You can now load the package via spacy.load('fr_core_news_sm')"]

In [5]:
import os, json
import pandas as pd  # for DataFrame
import spacy # for NER processing
from datetime import datetime as DT # for timestamps
import pycld2 as cld2  # for language detection
from rematch2 import TextNormalizer
from rematch2.Util import *


def detect_language(text: str="") -> str:
    """Detect the main language of ``text`` with cld2.

    Args:
        text: the text to analyse.

    Returns:
        The language code of the first entry in cld2's result details when
        cld2 flags the detection as reliable; otherwise ``"en"``.
        ``"en"`` is also returned for empty input and when cld2 rejects the
        input outright, so one bad value cannot abort a batch run.
    """
    # nothing to analyse - use the default language
    if not text:
        return "en"

    try:
        is_reliable, _text_bytes_found, details = cld2.detect(text)
    except cld2.error:
        # cld2 raises on input it cannot process (e.g. invalid UTF-8 or
        # control characters); treat that as "language unknown"
        return "en"

    if is_reliable:
        return details[0][1]
    return "en"


def _int_or_zero(value) -> int:
    """Convert one CSV cell to ``int``, mapping missing or blank cells to 0."""
    if isinstance(value, str):
        value = value.strip()
        return int(value) if value else 0
    if pd.isna(value):
        return 0
    return int(value)


def get_records_from_csv_file(file_path: str="") -> list[dict]:
    """Parse the GoTriple keywords-analysis CSV file into a list of records.

    The first row of the file is treated as a header and replaced by the fixed
    column names ``index``, ``keyword`` and ``count``.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        ``[{"index": int, "keyword": str, "count": int}, ...]`` - one dict per
        data row. Missing ``index``/``count`` cells become 0, a missing
        ``keyword`` becomes ``""``, and keywords are stripped of surrounding
        whitespace.

    Raises:
        ValueError: if an ``index`` or ``count`` cell holds a non-numeric value.
        Errors raised by ``pd.read_csv`` (e.g. a missing file) are not caught.
    """
    # read the CSV file to a DataFrame
    df: pd.DataFrame = pd.read_csv(file_path, skip_blank_lines=True, header=0, names=["index", "keyword", "count"])

    # Missing cells are handled per value instead of blanket-filling the frame
    # with "": that turned missing numeric cells into "" and int("") raised.
    # NOTE(review): read_csv's default NA handling also turns keywords such as
    # "NA" or "null" into missing values - confirm that is intended.
    records: list[dict] = []
    for item in df.to_dict(orient="records"):
        keyword = item.get("keyword")
        records.append({
            "index": _int_or_zero(item.get("index")),
            "keyword": "" if pd.isna(keyword) else str(keyword).strip(),
            "count": _int_or_zero(item.get("count")),
        })

    return records

In [None]:

def _build_pipeline(model_name: str, periodo_authority_id: str) -> Language:
    """Load a spaCy model and attach the period-annotation components.

    The model's own 'ner' component is disabled. Four components are then
    added by name: "normalize_text" (before the parser), then "yearspan_ruler",
    "periodo_ruler" and "child_span_remover" at the end, in that order.
    NOTE(review): these component names are not registered in this notebook -
    presumably by the rematch2 imports; confirm.

    Args:
        model_name: name of the installed spaCy model to load.
        periodo_authority_id: passed to "periodo_ruler" as its
            "periodo_authority_id" config value.

    Returns:
        The configured pipeline.
    """
    nlp = spacy.load(model_name, disable=['ner'])
    nlp.add_pipe("normalize_text", before="parser")
    nlp.add_pipe("yearspan_ruler", last=True)
    nlp.add_pipe("periodo_ruler", last=True, config={"periodo_authority_id": periodo_authority_id})
    nlp.add_pipe("child_span_remover", last=True)
    return nlp


# prepare English pipeline
# using HE Cultural Periods authority
nlp_en: Language = _build_pipeline("en_core_web_sm", "p0kh9ds")

# prepare French pipeline
# using 'PACTOLS chronology periods used in DOLIA data'
nlp_fr: Language = _build_pipeline("fr_core_news_sm", "p02chr4")

# prepare German pipeline
# using ARIADNE authority (no DAI authority??)
nlp_de: Language = _build_pipeline("de_core_news_sm", "p0qhb66")

# I/O file names and paths
input_directory: str = "./data/ner-input/gotriple"
input_file_name: str = "gotriple-keywords-analysis.csv"
input_file_path: str = os.path.join(input_directory, input_file_name)
output_directory: str = "./data/ner-output/gotriple"
# exist_ok=True creates the directory only when needed, without a separate
# exists() check that could race with another process
os.makedirs(output_directory, exist_ok=True)
# date-stamped output name (YYYYMMDD): a re-run on the same day overwrites the file
timestamp: str = DT.now().strftime('%Y%m%d')
output_file_name: str = f"ner-output-gotriple-keywords-{timestamp}.json"
output_file_path: str = os.path.join(output_directory, output_file_name)

# read and parse records from input file [{index, keyword, count}]
input_records: list[dict] = get_records_from_csv_file(input_file_path)

# Annotate each input record in place with the detected language and any
# spans found by the matching pipeline.

# cap on the number of records to annotate (temporary, for testing);
# set to 0 to process every record. Records beyond the cap stay in
# input_records without "language_detected" / "found" keys.
max_records: int = 10000
# print a progress line every N records
progress_interval: int = 500

# pipelines for specific detected languages; any other language uses English
pipelines_by_language: dict = {"fr": nlp_fr, "de": nlp_de}

record_count: int = len(input_records)
current_record: int = 0
for record in input_records or []:

    # print progress indicator
    if current_record % progress_interval == 0:
        print(f"processing record {current_record} of {record_count}")

    # stop once the configured cap is reached
    if max_records and current_record >= max_records:
        break

    input_text = str(record.get("keyword", "")).strip()

    # records with a blank keyword are skipped and left unannotated
    if input_text:
        # determine main language of input text and process with appropriate pipeline
        # default to English if language cannot be determined
        lang = detect_language(input_text)
        nlp = pipelines_by_language.get(lang, nlp_en)
        doc = nlp(input_text)

        record["language_detected"] = lang

        # extract any spans from the annotated document
        record["found"] = [
            { "id": span.id_, "text": span.text, "label": span.label_ }
            for span in doc.spans.get(DEFAULT_SPANS_KEY, [])
        ]

    # increment record counter
    current_record += 1

# Write the full record list (including any records left unannotated) to the
# output JSON file, indented by 4 spaces.
print(f"writing results to {output_file_path}")
with open(output_file_path, "w") as output_file:
    serialised_records = json.dumps(input_records, indent=4)
    output_file.write(serialised_records)
# end of script


processing record 0 of 65500
processing record 500 of 65500
processing record 1000 of 65500
processing record 1500 of 65500
processing record 2000 of 65500
processing record 2500 of 65500
processing record 3000 of 65500
processing record 3500 of 65500
processing record 4000 of 65500
processing record 4500 of 65500
processing record 5000 of 65500
processing record 5500 of 65500
processing record 6000 of 65500
processing record 6500 of 65500
processing record 7000 of 65500
processing record 7500 of 65500
processing record 8000 of 65500
processing record 8500 of 65500
processing record 9000 of 65500
processing record 9500 of 65500
processing record 10000 of 65500
writing results to ./data/ner-output/gotriple/ner-output-gotriple-keywords-analysis-20250923.json
