### TRY

In [2]:
!pip install requests SPARQLWrapper wikipedia transformers networkx


Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.4-py3-none-any.whl (565 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=37110299088093e5284c7863a45a4464be45eca667895fc2a923e76e3adcf4bb
  Stored in directory: /root/.cache/pip/wheels/8f/ab/cb/45ccc40522d3a1c41e1d2ad53b8f33a62f394011ec38cd71c6
Successfully built wikipedia
Installing collected packages: rdflib, w

In [3]:
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import wikipedia
from transformers import AutoTokenizer, AutoModel
import torch
import networkx as nx

# Multilingual embedding model (XLM-Roberta-base)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base")

# Function to embed text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

# Function to extract Wikidata information
def query_wikidata(qid):
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?itemLabel ?itemDescription ?instanceOfLabel ?countryOfOriginLabel ?cultureLabel ?mainSubjectLabel WHERE {{
        wd:{qid} rdfs:label ?itemLabel.
        OPTIONAL {{ wd:{qid} schema:description ?itemDescription FILTER(LANG(?itemDescription) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P31 ?instanceOf. ?instanceOf rdfs:label ?instanceOfLabel FILTER(LANG(?instanceOfLabel) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P495 ?countryOfOrigin. ?countryOfOrigin rdfs:label ?countryOfOriginLabel FILTER(LANG(?countryOfOriginLabel) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P2596 ?culture. ?culture rdfs:label ?cultureLabel FILTER(LANG(?cultureLabel) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P921 ?mainSubject. ?mainSubject rdfs:label ?mainSubjectLabel FILTER(LANG(?mainSubjectLabel) = "en"). }}
        FILTER(LANG(?itemLabel) = "en")
    }} LIMIT 1
    """
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    results = endpoint.query().convert()
    return results['results']['bindings'][0]

# Function to get Wikipedia content and categories
def get_wikipedia_data(name):
    try:
        page = wikipedia.page(name)
        content = page.content[:1000]  # first 1000 chars as semantic embedding input
        categories = page.categories
        lang_count = len(page.langlinks)  # number of language variants
        return content, categories, lang_count
    except Exception as e:
        print(f"Wikipedia error for {name}: {e}")
        return "", [], 0

# Example dataset row (Wikidata ID: Q811389)
qid = "Q811389"  # Bauhaus Archive

# Extract Wikidata data
wikidata_info = query_wikidata(qid)

item_name = wikidata_info['itemLabel']['value']
description = wikidata_info.get('itemDescription', {}).get('value', '')
instance_of = wikidata_info.get('instanceOfLabel', {}).get('value', '')
country_origin = wikidata_info.get('countryOfOriginLabel', {}).get('value', '')
culture = wikidata_info.get('cultureLabel', {}).get('value', '')
main_subject = wikidata_info.get('mainSubjectLabel', {}).get('value', '')

# Print Wikidata features
print(f"Item Name: {item_name}")
print(f"Description: {description}")
print(f"Instance of: {instance_of}")
print(f"Country of Origin: {country_origin}")
print(f"Culture: {culture}")
print(f"Main Subject: {main_subject}")

# Extract Wikipedia data
wiki_content, wiki_categories, lang_variants = get_wikipedia_data(item_name)

print(f"Categories: {wiki_categories}")
print(f"Language Variants: {lang_variants}")

# Embed Wikipedia textual data
embedding = get_embedding(wiki_content)

# Combine features
features = {
    "name": item_name,
    "description": description,
    "instance_of": instance_of,
    "country_origin": country_origin,
    "culture": culture,
    "main_subject": main_subject,
    "categories": wiki_categories,
    "language_variants": lang_variants,
    "embedding": embedding
}

# Build Graph with NetworkX
G = nx.Graph()

# Add main node
G.add_node(qid, **features)

# Example: Adding semantic edges (to country, main subject, instance_of)
semantic_relations = {
    'country_origin': country_origin,
    'main_subject': main_subject,
    'instance_of': instance_of
}

# Add nodes and edges based on relations
for relation, entity in semantic_relations.items():
    if entity:
        # Add entity node (simplified; in practice extract further features for these nodes)
        entity_node_id = f"{relation}_{entity.replace(' ', '_')}"
        G.add_node(entity_node_id, name=entity, type=relation)

        # Add edge from main node to entity node
        G.add_edge(qid, entity_node_id, relation=relation)

# Visualization (optional sanity check)
print(f"Nodes: {G.nodes(data=True)}")
print(f"Edges: {G.edges(data=True)}")

# Now 'G' contains structured nodes with features and semantic edges


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Item Name: Bauhaus Archive
Description: museum and archive in Berlin
Instance of: collection
Country of Origin: 
Culture: 
Main Subject: 
Wikipedia error for Bauhaus Archive: 'WikipediaPage' object has no attribute 'langlinks'
Categories: []
Language Variants: 0
Nodes: [('Q811389', {'name': 'Bauhaus Archive', 'description': 'museum and archive in Berlin', 'instance_of': 'collection', 'country_origin': '', 'culture': '', 'main_subject': '', 'categories': [], 'language_variants': 0, 'embedding': array([ 7.22414851e-02,  9.90853384e-02,  2.79786568e-02, -2.09546648e-02,
        5.22477329e-02, -2.55161710e-02, -5.57560846e-03,  9.06962715e-03,
        6.40976205e-02, -1.07032537e-01,  3.85049768e-02,  6.42324761e-02,
       -7.17509072e-03, -1.35866543e-02, -2.28862632e-02,  4.61671725e-02,
       -3.48128937e-02, -4.80359793e-02,  7.54665062e-02,  6.96600080e-02,
        4.35983166e-02,  7.69041106e-03,  9.32531133e-02,  8.21319968e-02,
       -6.65654428e-03,  8.85444321e-03, -1.6437787