In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint shared by the tokenizer and the seq2seq model.
MODEL_NAME = "ibm/knowgl-large"

# Run on CUDA when it is available, otherwise fall back to the CPU.
# (An Apple-silicon branch based on torch.backends.mps.is_available() /
# torch.device("mps") was sketched here but is intentionally disabled.)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    print("GPU available")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

In [3]:
# Sample passage that is run through the extraction pipeline in the cells below.
# Source as recorded by the author:
# https://en.wikipedia.org/wiki/The_Great_Last_Judgement_(Rubens)
# NOTE(review): the passage describes a portrait of a couple married in
# April 1622, which does not look like it belongs to the article linked
# above — confirm the actual source of this text.
input_text = """
This happy, smiling pair sits comfortably close to each other. Posing a couple together in this way was highly unusual at the time. It may have been prompted by the sitters’ friendship with the painter and the occasion for the commission – their marriage in April 1622. The painting thus contains references to love and devotion, such as the garden of love at right, and at left an eryngium thistle, known in Dutch as 'mannentrouw’, or male fidelity. """

In [4]:
import nltk

# Fetch the Punkt data that sent_tokenize relies on.
# NOTE(review): recent NLTK releases may look for the "punkt_tab" resource
# instead of "punkt" — confirm against the installed NLTK version.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

# Split the input text into sentences using the NLTK tokenizer.
# input_text is a triple-quoted literal, so it starts with a newline and ends
# with a space; trim that (and drop empty fragments) so stray whitespace is
# not passed on to the model as part of a sentence.
sentences = [
    sentence.strip()
    for sentence in sent_tokenize(input_text.strip())
    if sentence.strip()
]
print(sentences)

[nltk_data] Downloading package punkt to /Users/julian/nltk_data...


['\nThis happy, smiling pair sits comfortably close to each other.', 'Posing a couple together in this way was highly unusual at the time.', 'It may have been prompted by the sitters’ friendship with the painter and the occasion for the commission – their marriage in April 1622.', "The painting thus contains references to love and devotion, such as the garden of love at right, and at left an eryngium thistle, known in Dutch as 'mannentrouw’, or male fidelity."]


[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Generation settings; identical for every sentence.
num_beams = 15
max_length = 1000


def _generate_for_sentence(text):
    """Run the seq2seq model on one sentence and return the decoded string.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    generated = model.generate(**encoded, max_length=max_length, num_beams=num_beams)
    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0].to(device), skip_special_tokens=True)


# One decoded output string per input sentence, in sentence order.
decoded_outputs = [_generate_for_sentence(sentence) for sentence in sentences]

print(decoded_outputs)

['[(Happy, smiling pair#Happy, smiling pair#photograph)|instance of|(pair#Photograph#work)]', '[(Mozart#Wolfgang Amadeus Mozart#human)|notable work|(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)]$[(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)|composer|(Mozart#Wolfgang Amadeus Mozart#human)]$[(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)|composer|(Ludwig van Beethoven#Ludwig van Beethoven#human)]$[(Ludwig van Beethoven#Ludwig van Beethoven#human)|notable work|(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)]$[(Ludwig van Beethoven#Ludwig van Beethoven#human)|notable work|(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)]$[(Ludwig van Beethoven#Ludwig van Beethoven#human)|notable work|(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)]$[(The Marriage of Figaro#The Marriage of Figaro#dramatico-musical work)|composer|(Ludwig van Beethoven#Ludwig van Beeth

In [6]:
def _strip_wrapper(text, opening, closing):
    """Remove at most one leading ``opening`` and one trailing ``closing``."""
    if text.startswith(opening):
        text = text[len(opening):]
    if text.endswith(closing):
        text = text[: -len(closing)]
    return text


def _parse_entity(part):
    """Turn ``(mention#label#type)`` into a mention/label/type dict."""
    fields = _strip_wrapper(part, "(", ")").split("#")
    return {
        "mention": fields[0],
        "label": fields[1],
        "type": fields[2],
    }


def parse_string(s):
    """Parse one linearized statement into subject / relation / object.

    Expected shape of ``s``::

        [(mention#label#type)|relation|(mention#label#type)]

    Only a single enclosing ``[...]`` around the statement and a single
    enclosing ``(...)`` around each entity are removed, so parentheses that
    belong to a mention, label or type (e.g. ``(500) Days of Summer``) are
    kept intact. ``str.strip("()")`` would remove all of them.

    Args:
        s: The statement text. Surrounding whitespace is ignored.

    Returns:
        A dict with ``"subject"`` and ``"object"`` entries (each a dict with
        ``"mention"``, ``"label"`` and ``"type"``) and a ``"relation"`` entry
        (a dict with ``"label"``). If ``s`` has fewer than three
        ``|``-separated parts, only the parts that are present are returned.

    Raises:
        IndexError: If an entity part has fewer than three ``#``-separated
            fields (e.g. empty or truncated input).
    """
    s = _strip_wrapper(s.strip(), "[", "]")
    # Split into subject, relation, object
    parts = s.split("|")
    result = {}
    for i, part in enumerate(parts):
        if i == 0:
            result["subject"] = _parse_entity(part)
        elif i == 1:
            # The relation is not wrapped in parentheses; keep it verbatim.
            result["relation"] = {"label": part.split("#")[0]}
        else:
            result["object"] = _parse_entity(part)
    return result


# Every statement kept for the later cells must carry all three parts.
REQUIRED_STATEMENT_KEYS = ("subject", "relation", "object")

# Each decoded output holds zero or more statements separated by "$".
statements = []
for line in decoded_outputs:
    for statement_text in line.split("$"):
        # An empty model output would otherwise make parse_string raise.
        if not statement_text.strip():
            continue
        try:
            parsed_statement = parse_string(statement_text)
        except IndexError:
            # Generation is length-capped, so the last statement of an output
            # may be cut off in the middle of an entity.
            print("Skipping malformed statement:", statement_text)
            continue
        if not all(key in parsed_statement for key in REQUIRED_STATEMENT_KEYS):
            # Later cells index "subject", "relation" and "object" directly.
            print("Skipping incomplete statement:", statement_text)
            continue
        statements.append(parsed_statement)
        print(parsed_statement)

{'subject': {'mention': 'Happy, smiling pair', 'label': 'Happy, smiling pair', 'type': 'photograph'}, 'relation': {'label': 'instance of'}, 'object': {'mention': 'pair', 'label': 'Photograph', 'type': 'work'}}
{'subject': {'mention': 'Mozart', 'label': 'Wolfgang Amadeus Mozart', 'type': 'human'}, 'relation': {'label': 'notable work'}, 'object': {'mention': 'The Marriage of Figaro', 'label': 'The Marriage of Figaro', 'type': 'dramatico-musical work'}}
{'subject': {'mention': 'The Marriage of Figaro', 'label': 'The Marriage of Figaro', 'type': 'dramatico-musical work'}, 'relation': {'label': 'composer'}, 'object': {'mention': 'Mozart', 'label': 'Wolfgang Amadeus Mozart', 'type': 'human'}}
{'subject': {'mention': 'The Marriage of Figaro', 'label': 'The Marriage of Figaro', 'type': 'dramatico-musical work'}, 'relation': {'label': 'composer'}, 'object': {'mention': 'Ludwig van Beethoven', 'label': 'Ludwig van Beethoven', 'type': 'human'}}
{'subject': {'mention': 'Ludwig van Beethoven', 'lab

In [7]:
import json

# Persist the statements so the two most resource-intensive steps can be run
# separately:
#   1. knowledge extraction from the raw text
#   2. knowledge reconciliation with Wikidata
file_path = "statements.json"
statements_json = json.dumps(statements, indent=4)

with open(file_path, "w") as out_file:
    out_file.write(statements_json)

print("Statements saved to file:", file_path)

Statements saved to file: statements.json


In [8]:
import json

# Read the previously saved statements back in, so the reconciliation step
# can start from the file.
file_path = "statements.json"

with open(file_path) as statements_file:
    statements = json.load(statements_file)

In [9]:
import requests
import json

RECONCILE_URL = "https://wikidata.reconci.link/en/api"
PROPERTY_SUGGEST_URL = "https://wikidata.reconci.link/en/suggest/property"
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled request

# The same labels recur across statements; remember answers per lookup so
# each distinct query is sent only once.
_reconcile_cache = {}  # (label, type_id) -> list of candidate dicts
_property_cache = {}  # label -> first suggested property, or None


def _reconcile(label, type_id=None):
    """Return the list of reconciliation candidates (at most one) for ``label``.

    Args:
        label: Text to reconcile.
        type_id: Optional identifier used to constrain the candidate type.

    Returns:
        The ``result`` list of the ``q0`` query; empty when nothing matched.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    cache_key = (label, type_id)
    if cache_key not in _reconcile_cache:
        query = {"query": label}
        if type_id is not None:
            query["type"] = type_id
        query["limit"] = 1
        # Let requests URL-encode the JSON payload; concatenating the raw
        # label into the URL breaks on characters such as '"', '&' or '#'.
        response = requests.get(
            RECONCILE_URL,
            params={"queries": json.dumps({"q0": query})},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        _reconcile_cache[cache_key] = response.json().get("q0", {}).get("result", [])
    return _reconcile_cache[cache_key]


def _suggest_property(label):
    """Return the first property suggested for ``label``, or ``None``.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    if label not in _property_cache:
        response = requests.get(
            PROPERTY_SUGGEST_URL,
            params={"prefix": label},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        candidates = response.json().get("result", [])
        _property_cache[label] = candidates[0] if candidates and candidates[0] else None
    return _property_cache[label]


# Attach a "result" entry to every subject, relation and object in place.
for statement in statements:
    for key, value in statement.items():
        if key == "subject" or key == "object":
            # Reconcile type first:
            type_id = None
            if value.get("type"):
                type_matches = _reconcile(value["type"])
                # An unmatched type yields an empty list; fall back to an
                # unconstrained entity lookup in that case.
                if type_matches and type_matches[0].get("id") is not None:
                    type_id = type_matches[0]["id"]

            matches = _reconcile(value["label"], type_id)
            if matches:
                best_match = matches[0]
                value["result"] = {
                    "description": best_match.get("description", ""),
                    "id": best_match["id"],
                    "wd_name": best_match.get("name", ""),
                }
                print(
                    "Querying Wikidata for: "
                    + value["label"]
                    + " with result: "
                    + value["result"]["description"]
                )
            else:
                print("No matching entity found for: " + value["label"])
                value["result"] = {
                    "description": "",
                    "id": "",
                    "wd_name": "",
                }
        elif key == "relation":
            suggestion = _suggest_property(value["label"])
            if suggestion:
                value["result"] = suggestion
            else:
                print("No matching property found for: " + value["label"])
                value["result"] = ""

No matching entity found for: Happy, smiling pair
Querying Wikidata for: Photograph with result: image created by light falling on a light-sensitive surface
Querying Wikidata for: Wolfgang Amadeus Mozart with result: Austrian composer of the Classical period (1756-1791)
Querying Wikidata for: The Marriage of Figaro with result: opera by Wolfgang Amadeus Mozart
Querying Wikidata for: The Marriage of Figaro with result: opera by Wolfgang Amadeus Mozart
Querying Wikidata for: Wolfgang Amadeus Mozart with result: Austrian composer of the Classical period (1756-1791)
Querying Wikidata for: The Marriage of Figaro with result: opera by Wolfgang Amadeus Mozart
Querying Wikidata for: Ludwig van Beethoven with result: German composer (1770–1827)
Querying Wikidata for: Ludwig van Beethoven with result: German composer (1770–1827)
Querying Wikidata for: The Marriage of Figaro with result: opera by Wolfgang Amadeus Mozart
Querying Wikidata for: Ludwig van Beethoven with result: German composer (177

In [10]:
# Show the statements, including the attached results, as indented JSON.
statements_pretty = json.dumps(statements, indent=4)
print(statements_pretty)

[
    {
        "subject": {
            "mention": "Happy, smiling pair",
            "label": "Happy, smiling pair",
            "type": "photograph",
            "result": {
                "description": "",
                "id": "",
                "wd_name": ""
            }
        },
        "relation": {
            "label": "instance of",
            "result": {
                "description": "that class of which this subject is a particular example and member; different from P279 (subclass of); for example: K2 is an instance of mountain; volcano is a subclass of mountain (and an instance of volcanic landform)",
                "id": "P31",
                "name": "instance of"
            }
        },
        "object": {
            "mention": "pair",
            "label": "Photograph",
            "type": "work",
            "result": {
                "description": "image created by light falling on a light-sensitive surface",
                "id": "Q125191",
             

In [11]:
from pyvis.network import Network

def _node_for(entity):
    """Return ``(node_id, node_label)`` for a subject or object entry.

    The Wikidata id and name from ``entity["result"]`` are used when present.
    Unresolved entities carry an empty id and name there; falling back to the
    extracted label keeps them as separate, labeled nodes instead of merging
    all of them into one node whose id is the empty string.
    """
    result = entity.get("result") or {}
    fallback = entity.get("label") or entity.get("mention") or ""
    node_id = result.get("id") or fallback
    node_label = result.get("wd_name") or fallback
    return node_id, node_label


graph = Network(height="800px", width="100%", notebook=True)

# One node per distinct subject/object, one labeled edge per statement.
for statement in statements:
    subject_id, subject_label = _node_for(statement["subject"])
    object_id, object_label = _node_for(statement["object"])
    graph.add_node(subject_id, label=subject_label)
    graph.add_node(object_id, label=object_label)
    graph.add_edge(
        subject_id,
        object_id,
        label=statement["relation"]["label"],
    )

graph.show("knowledge_graph_manet.html")

knowledge_graph_manet.html
