# In [None]:
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import wikipedia
from transformers import AutoTokenizer, AutoModel
import torch
import networkx as nx

# Multilingual embedding backbone: tokenizer and encoder are loaded from the
# same XLM-RoBERTa base checkpoint, named once so the two cannot drift apart.
_EMBEDDING_CHECKPOINT = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(_EMBEDDING_CHECKPOINT)

# Function to embed text
def get_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    The input is tokenized with truncation to 512 tokens and padding enabled,
    run through the module-level ``model`` without gradient tracking, and the
    token vectors are averaged over the sequence dimension.

    The average is weighted by the attention mask so that padding positions
    do not contribute. For a single string (no padding is added) this is the
    same as a plain mean; for a batch of texts with different lengths it
    keeps the shorter texts' embeddings from being diluted by pad tokens.

    Args:
        text: Text to embed; passed straight to the tokenizer.

    Returns:
        A NumPy array holding the pooled embedding, with singleton dimensions
        squeezed out (so a single string yields a 1-D vector).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for an all-masked row.
    token_count = mask.sum(dim=1).clamp(min=1)
    embeddings = (token_sum / token_count).squeeze().numpy()
    return embeddings

# Function to extract Wikidata information
def query_wikidata(qid):
    """Fetch one row of English labels for a Wikidata item via SPARQL.

    The query asks the public Wikidata endpoint for the item's English label
    plus, where present, its English description and the English labels of
    its "instance of" (P31), "country of origin" (P495), "culture" (P2596)
    and "main subject" (P921) values. Only the first result row is returned
    (``LIMIT 1``), so multi-valued properties yield a single value.

    Args:
        qid: Wikidata item identifier such as ``"Q811389"``.

    Returns:
        The first SPARQL JSON binding row: a dict mapping variable names
        (``itemLabel``, ``itemDescription``, ...) to ``{"value": ...}`` dicts.
        Optional variables are absent from the dict when unbound.

    Raises:
        ValueError: If ``qid`` is not of the form ``Q<positive integer>``.
        IndexError: If the endpoint returns no row (e.g. the item does not
            exist or has no English label).
    """
    import re  # local import: keeps this block independent of the file header

    # The identifier is spliced into the query text, so reject anything that
    # is not a plain Q-number before it can alter the SPARQL.
    if not isinstance(qid, str) or not re.fullmatch(r"Q[1-9][0-9]*", qid):
        raise ValueError(f"Invalid Wikidata QID: {qid!r}")

    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?itemLabel ?itemDescription ?instanceOfLabel ?countryOfOriginLabel ?cultureLabel ?mainSubjectLabel WHERE {{
        wd:{qid} rdfs:label ?itemLabel.
        OPTIONAL {{ wd:{qid} schema:description ?itemDescription FILTER(LANG(?itemDescription) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P31 ?instanceOf. ?instanceOf rdfs:label ?instanceOfLabel FILTER(LANG(?instanceOfLabel) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P495 ?countryOfOrigin. ?countryOfOrigin rdfs:label ?countryOfOriginLabel FILTER(LANG(?countryOfOriginLabel) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P2596 ?culture. ?culture rdfs:label ?cultureLabel FILTER(LANG(?cultureLabel) = "en"). }}
        OPTIONAL {{ wd:{qid} wdt:P921 ?mainSubject. ?mainSubject rdfs:label ?mainSubjectLabel FILTER(LANG(?mainSubjectLabel) = "en"). }}
        FILTER(LANG(?itemLabel) = "en")
    }} LIMIT 1
    """
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    results = endpoint.query().convert()

    bindings = results['results']['bindings']
    if not bindings:
        # Same exception type as the bare [0] lookup, but with a usable message.
        raise IndexError(f"No English-labelled Wikidata result for {qid}")
    return bindings[0]

# Helper: count the language editions linked from a Wikipedia page
def _count_language_links(page):
    """Return the number of inter-language links of ``page``.

    If the page object exposes a ``langlinks`` attribute it is used directly.
    Otherwise the count is read from the MediaWiki API (``prop=langlinks``)
    of the wiki that ``page.url`` points at, following result continuation.
    """
    native_links = getattr(page, "langlinks", None)
    if native_links is not None:
        return len(native_links)

    from urllib.parse import urlsplit  # local import: block stays self-contained

    site = urlsplit(page.url)
    api_url = f"{site.scheme}://{site.netloc}/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "langlinks",
        "titles": page.title,
        "lllimit": "max",
    }
    # Wikimedia asks API clients to identify themselves with a descriptive
    # User-Agent; generic library defaults may be throttled or rejected.
    headers = {"User-Agent": "wikidata-graph-features/0.1 (python-requests)"}

    total = 0
    while True:
        response = requests.get(api_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        payload = response.json()
        for entry in payload.get("query", {}).get("pages", {}).values():
            total += len(entry.get("langlinks", []))
        if "continue" not in payload:
            return total
        # More results pending: resend the query with the continuation tokens.
        params.update(payload["continue"])


# Function to get Wikipedia content and categories
def get_wikipedia_data(name):
    """Fetch text, categories and language-edition count for a Wikipedia page.

    The page fetch and the language-link count are handled separately so that
    a failure to obtain the count (the ``wikipedia`` page object does not
    provide ``langlinks`` itself) no longer discards the content and
    categories that were retrieved successfully.

    Args:
        name: Page title to look up with ``wikipedia.page``.

    Returns:
        A ``(content, categories, lang_count)`` tuple: the first 1000
        characters of the page text, the page's category list, and the number
        of inter-language links. If the page lookup fails the result is
        ``("", [], 0)``; if only the language-link lookup fails,
        ``lang_count`` is 0. Errors are printed, not raised (best effort).
    """
    try:
        page = wikipedia.page(name)
        content = page.content[:1000]  # first 1000 chars as semantic embedding input
        categories = page.categories
    except Exception as e:
        print(f"Wikipedia error for {name}: {e}")
        return "", [], 0

    try:
        lang_count = _count_language_links(page)  # number of language variants
    except Exception as e:
        print(f"Wikipedia language-link error for {name}: {e}")
        lang_count = 0
    return content, categories, lang_count

# Example dataset row (Wikidata ID: Q811389)
qid = "Q811389"  # Bauhaus Archive

# Pull the item's SPARQL binding row from Wikidata
wikidata_info = query_wikidata(qid)

# The label is indexed directly (no fallback); every other binding is
# optional and degrades to an empty string when it is missing from the row.
item_name = wikidata_info['itemLabel']['value']
description, instance_of, country_origin, culture, main_subject = (
    wikidata_info.get(binding, {}).get('value', '')
    for binding in (
        'itemDescription',
        'instanceOfLabel',
        'countryOfOriginLabel',
        'cultureLabel',
        'mainSubjectLabel',
    )
)

# Report the Wikidata features, one per line
print(
    f"Item Name: {item_name}",
    f"Description: {description}",
    f"Instance of: {instance_of}",
    f"Country of Origin: {country_origin}",
    f"Culture: {culture}",
    f"Main Subject: {main_subject}",
    sep="\n",
)

# Look the item up on Wikipedia by its label
wiki_content, wiki_categories, lang_variants = get_wikipedia_data(item_name)

print(
    f"Categories: {wiki_categories}",
    f"Language Variants: {lang_variants}",
    sep="\n",
)

# Turn the Wikipedia text into a dense vector
embedding = get_embedding(wiki_content)

# Gather everything known about the item into one attribute mapping
features = dict(
    name=item_name,
    description=description,
    instance_of=instance_of,
    country_origin=country_origin,
    culture=culture,
    main_subject=main_subject,
    categories=wiki_categories,
    language_variants=lang_variants,
    embedding=embedding,
)

# Graph construction with NetworkX: the item itself is the central node and
# carries all collected features as node attributes.
G = nx.Graph()
G.add_node(qid, **features)

# Example: relations that become semantic edges (country, main subject, instance_of)
semantic_relations = dict(
    country_origin=country_origin,
    main_subject=main_subject,
    instance_of=instance_of,
)

# One neighbour node plus one labelled edge per relation that has a value
for relation, entity in semantic_relations.items():
    if not entity:
        continue  # relation absent for this item; nothing to link

    # Entity node (simplified; in practice extract further features for these nodes)
    entity_node_id = f"{relation}_{entity.replace(' ', '_')}"
    G.add_node(entity_node_id, name=entity, type=relation)

    # Connect the main node to the entity node
    G.add_edge(qid, entity_node_id, relation=relation)

# Sanity check (optional): dump nodes and edges with their attributes
print(
    f"Nodes: {G.nodes(data=True)}",
    f"Edges: {G.edges(data=True)}",
    sep="\n",
)

# At this point 'G' holds the feature-carrying item node and its semantic edges
