# Named entity recognition on GoTriple abstracts (spaCy + rematch2)

In [1]:
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install srsly
#%pip install ipywidgets
%pip install -U pycld2
%sx python -m spacy download en_core_web_sm
#%sx python -m spacy download de_core_news_sm
#%sx python -m spacy download fr_core_news_sm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


['Collecting en-core-web-sm==3.8.0',
 '  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)',
 '\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/12.8 MB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m12.8/12.8 MB\x1b[0m \x1b[31m81.5 MB/s\x1b[0m  \x1b[33m0:00:00\x1b[0m',
 '\x1b[?25h\x1b[38;5;2m✔ Download and installation successful\x1b[0m',
 "You can now load the package via spacy.load('en_core_web_sm')"]

In [2]:
import os, json
#from typing import Iterable
import spacy # for NER processing
#import time # for sleep
import srsly # for JSONL serialization/deserialization
from datetime import datetime as DT # for timestamps
from urllib.parse import quote_plus # to urlencode a string value
from bs4 import BeautifulSoup # to parse data from HTML page responses
import requests # for performing url request
import pycld2 as cld2  # for text language detection
from rematch2 import DocSummary
from rematch2.Util import *
#from IPython.display import display, HTML


# get main language code from text using cld2
def detect_language(text: str="") -> str:
    """Return the language code of the first cld2 detection result for `text`.

    Args:
        text: the text to run through `cld2.detect`.

    Returns:
        The code held in the first entry of the detection details when cld2
        reports the detection as reliable, otherwise the fallback "en".
    """
    reliable, _, candidates = cld2.detect(text)
    # each details entry is indexed positionally; position 1 holds the code
    return candidates[0][1] if reliable else "en"


def get_abstract_from_gotriple_id(id: str="", lang: str="en") -> str:
    """Fetch the abstract in language `lang` for a GoTriple document id.

    The id is URL-encoded with `quote_plus` and appended to the GoTriple
    documents base URL; the lookup itself is delegated to
    `get_abstract_from_gotriple_url`.

    Args:
        id: GoTriple document identifier.
        lang: language code of the abstract to return.

    Returns:
        Whatever `get_abstract_from_gotriple_url` returns for the built URL.
    """
    encoded_id = quote_plus(id)
    document_url = "https://www.gotriple.eu/documents/" + encoded_id
    return get_abstract_from_gotriple_url(document_url, lang)

# parse language-specific abstract from GoTriple URL request response
def get_abstract_from_gotriple_url(url: str="", lang: str="en") -> str:
    """Fetch a GoTriple page and return its abstract text for one language.

    The page is requested with a 30 second timeout and parsed with
    BeautifulSoup. The JSON held in the `<script id="__NEXT_DATA__">` tag is
    read and the list at props -> pageProps -> document -> abstract is searched
    for the first entry whose "lang" equals `lang`.

    Args:
        url: address of the page to request.
        lang: language code of the abstract to return.

    Returns:
        The stripped "text" of the first matching abstract, or "" when the
        script tag is missing or empty, when any level of the JSON path is
        missing / null / of an unexpected type, when no abstract matches, or
        when the matching abstract has no text.

    Raises:
        Whatever `requests.get` raises for network problems, and
        `json.JSONDecodeError` when the script tag does not contain valid JSON.
    """
    response = requests.get(url, timeout=30)
    # parse out the tag containing abstracts from the response
    soup = BeautifulSoup(response.text, features="html.parser")
    tag = soup.find("script", id="__NEXT_DATA__")
    if not tag or not tag.contents:
        return ""

    meta = json.loads(str(tag.contents[0]))

    # walk down the JSON path one key at a time; a `.get(key, {})` default only
    # covers a *missing* key, so an explicit JSON null must be checked for too
    node = meta
    for key in ("props", "pageProps", "document"):
        node = node.get(key) if isinstance(node, dict) else None
    abstracts = node.get("abstract") if isinstance(node, dict) else None
    if not isinstance(abstracts, list):
        return ""

    for entry in abstracts:
        if isinstance(entry, dict) and entry.get("lang", "") == lang:
            text = entry.get("text")
            # a null text must not be turned into the literal string "None"
            return "" if text is None else str(text).strip()
    return ""


# download file from URL to output path and return result filename with path
# if previously cached file exists then use it rather than downloading again
def get_file_from_url(url: str, output_path: str=".") -> str:
    """Download `url` into `output_path`, reusing a previously downloaded copy.

    The local file name is the last path segment of the URL with any query
    string removed. `output_path` is created when it does not exist.

    The download is streamed to a temporary "<file>.part" file and only renamed
    to the final name once it completed successfully, so an HTTP error page or
    an interrupted transfer is never left behind as the "cached" file.

    Args:
        url: address of the file to download.
        output_path: directory the file is stored in.

    Returns:
        The path of the local file, built as "<output_path>/<file name>".

    Raises:
        requests.HTTPError: when the server answers with an error status.
        Other exceptions from `requests` or file I/O propagate unchanged.
    """
    file_name = url.split("?")[0].split("/")[-1]
    file_path = f"{output_path}/{file_name}"
    os.makedirs(output_path, exist_ok=True)

    if os.path.exists(file_path):
        print(f"Using cached file {file_path}")
        return file_path

    print(f"Downloading file {url} to {file_path}")
    partial_path = f"{file_path}.part"
    try:
        # same timeout as the other requests.get call in this notebook;
        # the with-block releases the streamed connection when done
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    finally:
        # only still present when the download failed part-way
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return file_path


# download GoTriple JSONL.GZ file for specified domain to output path and return filename with path
def get_gotriple_jsonl_gz_file_for_domain(domain: str, output_path: str=".") -> str:
    """Fetch the merged GoTriple JSONL.GZ file of one domain from Zenodo.

    Args:
        domain: domain name used as the prefix of the remote file name
            ("<domain>_merged.jsonl.gz").
        output_path: directory handed on to `get_file_from_url`.

    Returns:
        The value returned by `get_file_from_url` for the built download URL.
    """
    remote_name = f"{domain}_merged.jsonl.gz"
    download_url = "https://zenodo.org/records/15784401/files/" + remote_name + "?download=1"
    return get_file_from_url(download_url, output_path)


# retrieve abstracts from a previously downloaded GoTriple JSONL.GZ file for specified language
def get_abstracts_from_gotriple_jsonl_gz_file(file_path: str, language: str="en") -> list[dict]:
    """Collect one abstract per record for `language` from a gzipped JSONL file.

    Each record is expected to carry an "id" and an "abstract" list whose
    entries are dicts with "lang" and "text" keys. For every record that has an
    id, the first abstract whose "lang" equals `language` is kept.

    Args:
        file_path: path of the JSONL.GZ file to read.
        language: language code the abstracts are filtered by.

    Returns:
        A list of {"id", "lang", "text"} dicts, with the id converted to a
        string. Records without an id, without abstracts, or without an
        abstract in the requested language are skipped.
    """
    data: list[dict] = []
    # second positional argument is passed as True as before
    # NOTE(review): presumably srsly's `skip` flag (skip broken lines) - confirm
    for record in srsly.read_gzip_jsonl(file_path, True):
        # test the raw value: str(None) would be the truthy string "None",
        # which would let records without an id slip through
        raw_id = record.get("id")
        if raw_id is None:
            continue

        # "abstract" may be missing or an explicit null
        abstracts = record.get("abstract") or []
        # get abstract only for specified language
        abstract = next(
            (a for a in abstracts if isinstance(a, dict) and a.get("lang", "") == language),
            None,
        )
        if abstract is not None:
            data.append({
                "id": str(raw_id),
                "lang": abstract.get("lang", ""),
                "text": abstract.get("text", "")
            })

    return data


In [3]:

# maximum number of records processed (and written) per language;
# temporary cap for testing - set to None to process every record
MAX_RECORDS_PER_LANGUAGE = 5000


def _load_pipeline(model_name: str, periodo_authority_id: str) -> spacy.language.Language:
    """Load a spaCy model and add the custom components used by this notebook.

    The model is loaded with its 'ner' component disabled. "normalize_text" is
    inserted before the parser; "yearspan_ruler", "periodo_ruler" (configured
    with `periodo_authority_id`) and "child_span_remover" are appended, in that
    order, at the end of the pipeline.

    Args:
        model_name: name of the installed spaCy model to load.
        periodo_authority_id: value passed to the "periodo_ruler" component as
            its "periodo_authority_id" config entry.

    Returns:
        The configured pipeline.
    """
    nlp = spacy.load(model_name, disable=['ner'])
    nlp.add_pipe("normalize_text", before="parser")
    nlp.add_pipe("yearspan_ruler", last=True)
    nlp.add_pipe("periodo_ruler", last=True, config={"periodo_authority_id": periodo_authority_id})
    nlp.add_pipe("child_span_remover", last=True)
    return nlp


# prepare English language spaCy pipeline
# using 'HE Cultural Periods' authority
nlp_en = _load_pipeline("en_core_web_sm", "p0kh9ds")

# prepare French language spaCy pipeline
# using 'PACTOLS chronology periods used in DOLIA data' authority
nlp_fr = _load_pipeline("fr_core_news_sm", "p02chr4")

# prepare Spanish language spaCy pipeline
# using 'SIA+ Chrono-Cultural Categories' authority
nlp_es = _load_pipeline("es_core_news_sm", "p07h9k6")

# pipeline per abstract language code; any other code falls back to English
pipelines = {"en": nlp_en, "fr": nlp_fr, "es": nlp_es}

# prepare I/O paths
input_directory: str = "./data/ner-input/gotriple"
os.makedirs(input_directory, exist_ok=True)
output_directory: str = "./data/ner-output/gotriple"
os.makedirs(output_directory, exist_ok=True)

# download GoTriple data file for domain 'archeo' (archeology, history, art history, cultural heritage)
print("downloading GoTriple data file for domain 'archeo'")
input_file_path = get_gotriple_jsonl_gz_file_for_domain("archeo", input_directory)

# process abstracts for each language
languages = ["en", "fr", "es"]

for language in languages:
    # extract abstracts from the downloaded file
    print(f"extracting abstracts for language '{language}'")
    abstracts = get_abstracts_from_gotriple_jsonl_gz_file(input_file_path, language)

    timestamp: str = DT.now().strftime('%Y%m%d')
    output_file_name: str = f"ner-output-gotriple-abstracts-{language}-{timestamp}.json"
    output_file_path: str = os.path.join(output_directory, output_file_name)

    record_count = len(abstracts)
    print(f"processing {record_count} records ({language}) from GoTriple abstracts data file")

    # only the records that are actually processed go into the output file, so
    # every written record carries a "spans" entry even when the cap applies
    if MAX_RECORDS_PER_LANGUAGE is None:
        output_data = abstracts
    else:
        output_data = abstracts[:MAX_RECORDS_PER_LANGUAGE]

    for current_record_index, record in enumerate(output_data):
        # progress notification every 1000 records
        if current_record_index % 1000 == 0:
            print(f"processing record {current_record_index} of {record_count}")

        lang = record.get("lang", "")
        text = record.get("text", "")
        record["spans"] = []

        # empty (or null) text keeps the empty span list
        if text:
            # run the pipeline for the record's language on the input text
            doc = pipelines.get(lang, nlp_en)(text)

            summary = DocSummary(doc)
            record["spans"] = summary.spans_to_list()

    print (f"writing abstracts for language '{language}' to local file")
    with open(output_file_path, "w") as file:
        json.dump(output_data, file, indent=4)


downloading GoTriple data file for domain 'archeo'
Using cached file ./data/ner-input/gotriple/archeo_merged.jsonl.gz
extracting abstracts for language 'en'
processing 57449 records (en) from GoTriple abstracts data file
processing record 0 of 57449
processing record 1000 of 57449
processing record 2000 of 57449
processing record 3000 of 57449
processing record 4000 of 57449
processing record 5000 of 57449
writing abstracts for language 'en' to local file
extracting abstracts for language 'fr'
processing 1066 records (fr) from GoTriple abstracts data file
processing record 0 of 1066
processing record 1000 of 1066
writing abstracts for language 'fr' to local file
extracting abstracts for language 'es'
processing 2586 records (es) from GoTriple abstracts data file
processing record 0 of 2586
processing record 1000 of 2586
processing record 2000 of 2586
writing abstracts for language 'es' to local file
