In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Pick the compute device once: CUDA when PyTorch reports it, otherwise CPU.
# (The Apple MPS backend is intentionally not selected here; that branch was
# disabled in the original cell.)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("GPU available" if use_cuda else "Falling back to CPU")

In [None]:
# Tokenizer and model are loaded from the same checkpoint, so name it once.
model_name = "ibm/knowgl-large"

# This can take a while (download, and moving the model to `device`)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
#import articles as json
import json

# Load the article records. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's default text encoding.
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# One entry per record; every record is expected to carry a 'text' field.
texts = [item['text'] for item in data]

# The sentence splitter further down takes a single string, not a list, so the
# article texts are joined into one document (blank line between articles).
input_text = "\n\n".join(texts)

In [None]:
# Demo input: replaces any `input_text` assigned earlier with a hard-coded passage
# about Henri Matisse's painting "The Open Window".
# NOTE(review): this comment previously cited
# https://en.wikipedia.org/wiki/The_Great_Last_Judgement_(Rubens), which does not
# match the passage below; presumably the text was taken from
# https://en.wikipedia.org/wiki/The_Open_Window_(Matisse) - confirm.
input_text = """
The Open Window, also known as Open Window, Collioure, is a painting by Henri Matisse. The work, an oil on canvas, was painted in 1905 and exhibited at the Salon d'Automne in Paris the same year. It was bequeathed in 1998 by the estate of Mrs. John Hay Whitney to the National Gallery of Art, Washington, DC.[1]

It is an example of the Fauvist style of painting that Matisse became famous for, and for which he was a leader, roughly between the years 1900–1909.[2] The Open Window depicts the view out the window of his apartment in Collioure, on the Southern coast of France. We see sailboats on the water, as viewed from Matisse's hotel window overlooking the harbour. He returned frequently to the theme of the open window in Paris and especially during the years in Nice and Etretat, and in his final years, particularly during the late 1940s.

Henri Matisse loved painting open windows and painted them throughout his career. 
"""

In [None]:
import nltk

# Fetch the Punkt sentence-tokenizer data. Older NLTK releases load the pickled
# "punkt" resource, while recent releases (3.9+) look for "punkt_tab" instead,
# so both are requested; downloading the one that ends up unused is harmless.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize

# Split the input text into sentences using nltk tokenizer; each sentence is
# later passed to the model separately.
sentences = sent_tokenize(input_text)
print(sentences)

In [None]:
# Generation settings do not change between sentences, so they are set once
# instead of on every loop iteration.
num_beams = 15
max_length = 1000

# One decoded model output string per input sentence, in sentence order.
decoded_outputs = []

for sentence in sentences:
    # Tokenize a single sentence and move the tensors to the model's device.
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    # This can take a while too
    output = model.generate(**inputs, max_length=max_length, num_beams=num_beams)

    # Only the first returned sequence is decoded. The ids are decoded as
    # returned by generate(); the extra `.to(device)` hop before decoding was
    # dropped because decoding to text does not depend on the tensor's device.
    decoded_outputs.append(tokenizer.decode(output[0], skip_special_tokens=True))

print(decoded_outputs)

In [None]:
def parse_string(s):
    """Parse one bracketed statement string into a nested dict.

    Expected shape of ``s``::

        [(mention#label#type)|relation label|(mention#label#type)]

    Args:
        s: A single statement string (one fragment between ``$`` separators).

    Returns:
        A dict with up to three keys:

        * ``"subject"`` / ``"object"``: ``{"mention", "label", "type"}``
        * ``"relation"``: ``{"label"}``

        Keys are only present for the ``|``-separated parts that exist in
        ``s``; a blank input yields ``{}``. When an entity has fewer than three
        ``#`` fields, ``label`` falls back to the mention and ``type`` is
        ``None`` (the reconciliation step in this file already checks
        ``type is not None``). If more than three parts are present, the last
        one is used as the object.
    """

    def _strip_once(text, opening, closing):
        # Remove surrounding whitespace plus at most ONE opening and ONE
        # closing delimiter. str.strip("()") would eat every trailing ")" and
        # corrupt values such as "human (biology)".
        text = text.strip()
        if text.startswith(opening):
            text = text[len(opening):]
        if text.endswith(closing):
            text = text[: -len(closing)]
        return text.strip()

    def _entity(part):
        # Split "(mention#label#type)" into its fields, tolerating missing ones.
        fields = [field.strip() for field in _strip_once(part, "(", ")").split("#")]
        mention = fields[0]
        label = fields[1] if len(fields) > 1 and fields[1] else mention
        entity_type = fields[2] if len(fields) > 2 and fields[2] else None
        return {"mention": mention, "label": label, "type": entity_type}

    body = _strip_once(s, "[", "]")
    if not body:
        # Nothing to parse (e.g. an empty fragment after splitting on "$").
        return {}

    result = {}
    # Split into subject, relation, object
    for i, part in enumerate(body.split("|")):
        if i == 0:
            result["subject"] = _entity(part)
        elif i == 1:
            # The relation is a bare label; anything after a "#" is ignored.
            result["relation"] = {"label": part.split("#")[0].strip()}
        else:
            result["object"] = _entity(part)
    return result


# Flat list of parsed statements collected from every decoded model output.
statements = []
for line in decoded_outputs:
    # One decoded output can hold several statements separated by "$".
    for statement_text in line.split("$"):
        if not statement_text.strip():
            # Empty output or a stray "$" leaves a blank fragment; there is
            # nothing to parse in it.
            continue
        parsed_statement = parse_string(statement_text)
        if not all(key in parsed_statement for key in ("subject", "relation", "object")):
            # Later cells index all three parts unconditionally, so an
            # incomplete triple is reported and left out instead of failing there.
            print(f"Skipping incomplete statement: {statement_text!r}")
            continue
        statements.append(parsed_statement)
        print(parsed_statement)

In [None]:
import json

# Checkpoint between the two most resource-intensive stages:
#   1. knowledge extraction from the raw text (everything above), and
#   2. knowledge reconciliation with Wikidata (everything below).
# Writing the statements to disk lets stage 2 be re-run without repeating stage 1.
file_path = "statements.json"
statements_json = json.dumps(statements, indent=4)

with open(file_path, "w") as out_file:
    out_file.write(statements_json)

print("Statements saved to file:", file_path)

In [None]:
import json

# Read the saved statements back from disk so the cells below can run without
# repeating the extraction above.
file_path = "statements.json"

with open(file_path, "r") as in_file:
    statements = json.loads(in_file.read())

In [None]:
import requests


def get_wikidata_id(label, type_id=None):
    """Look up the best Wikidata match for ``label`` via the reconciliation service.

    Args:
        label: Text to reconcile.
        type_id: Optional type identifier used to restrict the match. If the
            typed lookup returns nothing, the lookup is repeated once without
            a type.

    Returns:
        The first candidate dict returned by the service for the query, or
        ``None`` when there is no candidate.

    Raises:
        requests.RequestException: On connection problems or when the service
            does not answer within the timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    query = {"query": label, "limit": 1}
    if type_id:
        query["type"] = type_id

    # Serialize the query with json.dumps and let requests URL-encode it.
    # Interpolating the label straight into the URL breaks for labels that
    # contain quotes, backslashes, "&" or "#".
    response = requests.get(
        "https://wikidata.reconci.link/en/api",
        params={"queries": json.dumps({"q0": query})},
        timeout=30,  # do not hang the notebook forever on a stalled request
    )
    data = json.loads(response.text)

    # A "q0" entry without a "result" list is treated as "no candidates".
    candidates = data.get("q0", {}).get("result") or []
    if candidates:
        return candidates[0]
    if type_id:
        # Try again without type
        return get_wikidata_id(label)
    return None


def get_wikidata_property(query):
    """Suggest a Wikidata property for a relation.

    Args:
        query: Dict whose ``"label"`` entry is used as the search prefix.

    Returns:
        The first suggestion returned by the service, or ``None`` when the
        response has no suggestions.

    Raises:
        requests.RequestException: On connection problems or when the service
            does not answer within the timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Pass the prefix through `params` so requests URL-encodes it; a label
    # containing "&", "#" or "+" would otherwise corrupt the query string.
    response = requests.get(
        "https://wikidata.reconci.link/en/suggest/property",
        params={"prefix": query["label"]},
        timeout=30,  # do not hang the notebook forever on a stalled request
    )
    data = json.loads(response.text)

    # A response without a "result" list is treated as "no suggestion".
    suggestions = data.get("result") or []
    return suggestions[0] if suggestions else None


def process_statements(statements):
    """Attach a Wikidata reconciliation result to every part of every statement.

    Each statement is a dict; entries under ``"subject"`` / ``"object"`` are
    reconciled as entities, entries under ``"relation"`` as properties, and any
    other key is left alone. The outcome is stored under ``"result"`` on the
    entry itself, so the input is modified in place.

    Args:
        statements: List of statement dicts.

    Returns:
        The same ``statements`` list that was passed in.
    """
    for statement in statements:
        for role, value in statement.items():
            if role == "relation":
                match = get_wikidata_property(value)
                if match:
                    value["result"] = match
                else:
                    value["result"] = ""
                    print(f"No matching property found for: {value['label']}")
                continue

            if role not in ("subject", "object"):
                continue

            # Resolve the entity's type first (when one is given) so the entity
            # lookup can be restricted to that type.
            type_match = None
            if value["type"] is not None:
                type_match = get_wikidata_id(value["type"])

            if type_match:
                match = get_wikidata_id(value["label"], type_match["id"])
            else:
                match = get_wikidata_id(value["label"])

            if not match:
                print(f"No matching entity found for: {value['label']}")
                # Placeholder entry for an entity that could not be matched.
                value["result"] = {
                    "description": "",
                    "id": "",
                    "wd_name": "*New: " + value["label"],
                }
                continue

            value["result"] = {
                "description": match.get("description", ""),
                "id": match["id"],
                "wd_name": match.get("name", ""),
            }
    return statements


# Reconcile every statement against Wikidata. process_statements adds a "result"
# entry to the dicts inside `statements` in place and returns that same list, so
# `reconciled_statements` and `statements` refer to the same object.
reconciled_statements = process_statements(statements)

In [None]:
# Show the reconciled statements as indented JSON for manual inspection.
print(json.dumps(reconciled_statements, indent=4))

In [None]:
from pyvis.network import Network

# Render the reconciled statements as an interactive graph: one node per
# entity, one labelled edge per statement.
graph = Network(height="800px", width="100%", notebook=True)

for statement in reconciled_statements:
    subject_result = statement["subject"]["result"]
    object_result = statement["object"]["result"]

    # Entities without a Wikidata match carry an empty id. Using that empty
    # string as the node id would merge every unmatched entity into a single
    # node, so fall back to the display name for those.
    subject_node = subject_result["id"] or subject_result["wd_name"]
    object_node = object_result["id"] or object_result["wd_name"]

    graph.add_node(subject_node, label=subject_result["wd_name"])
    graph.add_node(object_node, label=object_result["wd_name"])
    graph.add_edge(
        subject_node,
        object_node,
        label=statement["relation"]["label"],
    )

graph.show("knowledge_graph_manet.html")