Install the Python libraries (Transformers, PEFT/LoRA, bitsandbytes for 4-bit quantization, TRL, and SPARQLWrapper) so Colab can load the model and talk to Wikidata.

In [1]:
!pip -q install "transformers>=4.39" peft "bitsandbytes>=0.43" accelerate torch \
                sentencepiece datasets "trl>=0.8.6" SPARQLWrapper


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25h

Import the libraries, load the Occiglot text→SPARQL model in 4-bit together with its tokenizer, and define a helper that pulls the SPARQL query out of the model’s text output.

In [2]:
import torch, re, json
from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM

# Hugging Face Hub repository that is loaded below as a PEFT model and tokenizer.
model_id = "julioc-p/occiglot_txt_sparql_en_v2"

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# device_map="auto" lets the library decide where to place the weights.
model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Without a pad token, fall back to the EOS token and tell the model about it,
# so that padding during generation has a defined token id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# Fenced block: ```sparql ... ``` (the language tag is optional).
_SPARQL_FENCE = re.compile(r"```(?:sparql)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)
# Query-form keywords, matched as whole words so that e.g. "task" is not "ASK".
_SPARQL_FORM = re.compile(r"\b(?:SELECT|ASK|CONSTRUCT|DESCRIBE)\b", re.IGNORECASE)
# Prologue declarations that may precede the query form.
_SPARQL_PROLOGUE = re.compile(r"\b(?:PREFIX\s+[\w.-]*:\s*<|BASE\s*<)", re.IGNORECASE)
# A WHERE group following an already closed group (CONSTRUCT { } WHERE { }).
_SPARQL_WHERE = re.compile(r"\s*WHERE\s*\{", re.IGNORECASE)

# One ordering/grouping term: a variable, or an (optionally named) bracketed
# expression with up to three levels of parentheses, e.g. DESC(COUNT(?x)).
_SPARQL_PAREN2 = r"\([^()]*\)"
_SPARQL_PAREN1 = r"\((?:[^()]|" + _SPARQL_PAREN2 + r")*\)"
_SPARQL_PAREN0 = r"\((?:[^()]|" + _SPARQL_PAREN1 + r")*\)"
_SPARQL_TERM = r"(?:[?$]\w+|(?:\w+\s*)?" + _SPARQL_PAREN0 + r")"
# Solution modifiers that may trail the final closing brace.
_SPARQL_MODIFIERS = re.compile(
    r"(?:\s*(?:"
    r"(?:GROUP|ORDER)\s+BY(?:\s*" + _SPARQL_TERM + r")+"
    r"|HAVING(?:\s*" + _SPARQL_TERM + r")+"
    r"|(?:LIMIT|OFFSET)\s+\d+"
    r"))*",
    re.IGNORECASE,
)


def _sparql_group_end(s: str, pos: int) -> int:
    """Return the index just past the '}' closing the first '{' at/after pos.

    Braces inside single- or double-quoted string literals are ignored.
    Returns -1 when no balanced group is found.
    """
    depth = 0
    quote = None
    i = pos
    n = len(s)
    while i < n:
        ch = s[i]
        if quote is not None:
            if ch == "\\":
                # Skip the escaped character so \" does not end the literal.
                i += 2
                continue
            if ch == quote:
                quote = None
        elif ch in "\"'":
            quote = ch
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return i + 1
            if depth < 0:
                return -1
        i += 1
    return -1


def extract_sparql(text: str) -> str:
    """Extract a single SPARQL query from free-form model output.

    The content of the first fenced block (```sparql ... ```) is preferred;
    without a fence the whole text is searched. The query starts at the first
    SELECT/ASK/CONSTRUCT/DESCRIBE keyword (or at PREFIX/BASE declarations that
    precede it) and runs to the brace that *balances* the first opening brace,
    so nested groups such as ``SERVICE ... { }`` or ``OPTIONAL { }`` are kept
    intact. A following ``WHERE { ... }`` group and trailing GROUP BY / HAVING /
    ORDER BY / LIMIT / OFFSET modifiers are included as well.

    Args:
        text: Raw text that may contain a SPARQL query.

    Returns:
        The extracted query with surrounding whitespace removed, or "" when
        the text is empty or contains no query.
    """
    if not text:
        return ""

    fence = _SPARQL_FENCE.search(text)
    candidate = fence.group(1) if fence else text

    form = _SPARQL_FORM.search(candidate)
    if form is None:
        return ""

    # Keep PREFIX/BASE lines that come before the query form.
    prologue = _SPARQL_PROLOGUE.search(candidate, 0, form.start())
    begin = prologue.start() if prologue else form.start()

    end = _sparql_group_end(candidate, form.start())
    if end == -1:
        # Braces do not balance (e.g. truncated output): fall back to cutting
        # at the first closing brace, or give up if there is none.
        first_close = candidate.find("}", form.start())
        if first_close == -1:
            return ""
        return candidate[begin:first_close + 1].strip()

    # CONSTRUCT { template } WHERE { pattern }: pull in the WHERE group too.
    while _SPARQL_WHERE.match(candidate, end):
        next_end = _sparql_group_end(candidate, end)
        if next_end == -1:
            break
        end = next_end

    # The modifier pattern can match the empty string, so this never fails.
    end = _SPARQL_MODIFIERS.match(candidate, end).end()
    return candidate[begin:end].strip()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/859 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


adapter_model.safetensors:   0%|          | 0.00/3.73G [00:00<?, ?B/s]

Look up Wikidata entity and property IDs and package them into a small JSON document that the model then uses to ground its query.

In [3]:
import requests

_WD_API_URL = "https://www.wikidata.org/w/api.php"
# Identify the client to the Wikidata API; same agent string as the SPARQL
# endpoint call elsewhere in this notebook.
_WD_HEADERS = {"User-Agent": "occiglot-colab-demo/1.0 (your_email@example.com)"}


def _wd_search(label, language, entity_type=None):
    """Return the ID of the top ``wbsearchentities`` hit for *label*, or None.

    ``entity_type`` is sent as the API's ``type`` parameter when given and
    omitted otherwise.
    """
    # Nothing to search for: skip the network round trip.
    if not label or not str(label).strip():
        return None

    params = {
        "action": "wbsearchentities",
        "search": label,
        "language": language,
        "format": "json",
        "limit": 1,
    }
    if entity_type:
        params["type"] = entity_type

    r = requests.get(_WD_API_URL, params=params, headers=_WD_HEADERS, timeout=20)
    # Surface HTTP errors (rate limiting, blocked client, ...) explicitly
    # instead of failing later while decoding a non-JSON error page.
    r.raise_for_status()
    hits = r.json().get("search")
    return hits[0]["id"] if hits else None


def wd_search_entity(label, language="en"):
    """Look up the best-matching Wikidata entity ID for *label*.

    Args:
        label: Text to search for.
        language: Language code passed to the search API.

    Returns:
        The ID of the first search hit, or None if the label is empty or
        nothing was found.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
    """
    return _wd_search(label, language)


def wd_search_property(label, language="en"):
    """Look up the best-matching Wikidata property ID for *label*.

    Args:
        label: Text to search for.
        language: Language code passed to the search API.

    Returns:
        The ID of the first property hit, or None if the label is empty or
        nothing was found.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
    """
    return _wd_search(label, language, entity_type="property")

def build_context(entities: dict, relationships: dict) -> str:
    """Build the JSON context string handed to the model.

    Args:
        entities: Maps a label to its entity ID, e.g. {"Canada": "Q16"}.
            A falsy ID (such as None) is resolved via ``wd_search_entity``.
        relationships: Maps a label to its property ID, e.g.
            {"capital": "P36"}. A falsy ID is resolved via
            ``wd_search_property``.

    Returns:
        A pretty-printed JSON object with the keys "entities" and
        "relationships"; non-ASCII characters are kept as-is.
    """
    resolved_entities = {
        name: (qid if qid else wd_search_entity(name))
        for name, qid in entities.items()
    }
    resolved_relations = {
        name: (pid if pid else wd_search_property(name))
        for name, pid in relationships.items()
    }
    payload = {"entities": resolved_entities, "relationships": resolved_relations}
    return json.dumps(payload, ensure_ascii=False, indent=2)


Tokenize the prompt, call the model to generate text, then slice off the prompt and extract only the SPARQL string.

In [4]:
def generate_sparql(question: str, context_json_str: str,
                    max_new_tokens=384, temperature=0.7, top_p=0.9) -> str:
    """Generate a SPARQL query for *question* with the loaded model.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        question: The user question in natural language.
        context_json_str: Context inserted into the system prompt (the JSON
            string produced by ``build_context``).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 (or None) switches to
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The SPARQL query found in the model output, or "" if none was found.
    """
    # NOTE(review): the wording (including its typos) presumably mirrors the
    # prompt the adapter was fine-tuned with -- verify before "fixing" it.
    system_message_template = (
        "You are an expert text to SparQL query translator. "
        "Users will ask you questions in English and you will generate a SparQL query "
        "based on the provided context encloses in ```sparql <respose_query>```.\n"
        "CONTEXT:\n{context}"
    )
    system_msg = system_message_template.format(context=context_json_str)

    chat = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": question},
    ]

    # Render the chat template to a string first and tokenize it ourselves;
    # the template already contains the special tokens, so none are added.
    prompt_str = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    enc = tokenizer(prompt_str, return_tensors="pt", add_special_tokens=False)
    enc = {k: v.to(model.device) for k, v in enc.items()}

    # Sampling needs a strictly positive temperature; otherwise decode greedily
    # and do not pass the sampling-only parameters at all.
    use_sampling = temperature is not None and temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": use_sampling,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if use_sampling:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    with torch.no_grad():
        outputs = model.generate(**enc, **gen_kwargs)

    # Decode only the newly generated part (everything after the prompt).
    prompt_len = enc["input_ids"].shape[-1]
    text_full = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Single source of truth for pulling the query out of the model output.
    return extract_sparql(text_full)


Example: pass the SAE question “What is the capital of Canada?” together with the IDs for Canada (Q16) and capital (P36) to the generator and print the produced SPARQL.

In [5]:
# Natural-language question to translate into SPARQL.
question = "What is the capital of Canada?"

# IDs are given explicitly here (Q16 = Canada, P36 = capital), which avoids a
# lookup; passing None as a value lets build_context resolve the ID by search.
ctx = build_context(entities={"Canada": "Q16"}, relationships={"capital": "P36"})

sparql = generate_sparql(question, ctx)
print("Generated SPARQL:\n", sparql if sparql else "(empty)")


Generated SPARQL:
 (empty)


Send the generated SPARQL to the Wikidata endpoint and return the result as a pandas table.

In [6]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

def run_wikidata_query(sparql_query: str) -> pd.DataFrame:
    """Run *sparql_query* against the Wikidata SPARQL endpoint.

    Args:
        sparql_query: The query text. An empty or whitespace-only query is
            not sent to the endpoint.

    Returns:
        A DataFrame with one row per result binding and one column per bound
        variable. For an ASK query the frame has a single "boolean" column
        holding the answer. An empty DataFrame is returned for an empty query
        or when there are no bindings.
    """
    if not sparql_query or not sparql_query.strip():
        return pd.DataFrame()

    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql",
                             agent="occiglot-colab-demo/1.0 (your_email@example.com)")
    endpoint.setTimeout(30)
    endpoint.setQuery(sparql_query)
    endpoint.setReturnFormat(JSON)
    res = endpoint.query().convert()

    # ASK results carry a top-level "boolean" member instead of bindings;
    # without this branch the answer would be dropped as an empty frame.
    if "boolean" in res:
        return pd.DataFrame([{"boolean": res["boolean"]}])

    rows = res.get("results", {}).get("bindings", [])
    # Flatten each binding {"var": {"type": ..., "value": ...}} to {"var": value}.
    flat = [{k: v.get("value") for k, v in row.items()} for row in rows]
    return pd.DataFrame(flat)

# Run the query held in the module-level `sparql` variable and keep the result.
df = run_wikidata_query(sparql)
# Last expression of the cell: preview at most the first 10 result rows.
df.head(10)


Example: take the SAE question “Which universities did Angela Merkel attend?” with the IDs for Angela Merkel (Q567) and “educated at” (P69), generate the SPARQL, run it, and display the answer table.

In [7]:
question = "Which universities did Angela Merkel attend?"

# Q567 = Angela Merkel, P69 = "educated at"; both IDs are given explicitly.
ctx = build_context(entities={"Angela Merkel": "Q567"}, relationships={"educated at": "P69"})

sparql = generate_sparql(question, ctx)
print("Generated SPARQL:\n", sparql)

# Last expression of the cell: show at most the first 10 result rows.
run_wikidata_query(sparql).head(10)


Generated SPARQL:
 SELECT (COUNT(?obj) AS ?value ) { wd:Q567 wdt:P69 ?obj }


Unnamed: 0,value
0,4
