In [2]:
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import openai

from sentence_transformers import util, SentenceTransformer
import json
import torch
from dotenv import load_dotenv
import os

# Run python-dotenv's loader first so the lookup below can see any
# configuration it provides, then read the inference-service key.
load_dotenv()

# None when UNI_API_KEY is not set in the environment.
api_key = os.environ.get("UNI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the question templates. The functions below read each value of this
# mapping through its 'question' and 'sparql' keys.
# The encoding is pinned to UTF-8 so non-ASCII characters in the questions do
# not depend on the platform's default locale encoding.
with open('template_representatives.json', 'r', encoding='utf-8') as f:
    q_templates = json.load(f)

In [4]:

def get_similar_question(question, corpus):
    """Print and return the corpus questions closest to ``question``.

    Every value of ``corpus`` is read through its ``'question'`` key. The
    texts and the query are encoded with the ``all-MiniLM-L6-v2`` sentence
    transformer (instantiated on every call) and ranked with
    ``util.semantic_search``.

    Side effect: prints score and text for up to the first five hits.

    Returns:
        list[str]: the question text of up to the first two hits.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    candidate_texts = [entry['question'] for entry in corpus.values()]
    corpus_vectors = encoder.encode(candidate_texts, convert_to_tensor=True)
    query_vector = encoder.encode(question, convert_to_tensor=True)

    # semantic_search yields one hit list per query; there is a single query.
    ranked = util.semantic_search(query_vector, corpus_vectors)[0]

    for match in ranked[:5]:
        print("\t{:.3f}\t{}".format(match['score'], candidate_texts[match['corpus_id']]))

    best_two = []
    for match in ranked[:2]:
        best_two.append(candidate_texts[match['corpus_id']])
    return best_two

In [5]:
def get_similar_questions(question, q_templates):
    """Return the templates whose questions are most similar to ``question``.

    Args:
        question: natural-language question to match.
        q_templates: mapping whose values are dicts with ``'question'`` and
            ``'sparql'`` keys.

    Returns:
        list[tuple[str, str]]: ``(question, sparql)`` pairs for the two most
        similar templates, most similar first. Fewer pairs are returned when
        fewer than two templates exist (an empty list for an empty mapping).
    """
    # Materialise the values once; indexing below reuses this list.
    templates = list(q_templates.values())
    if not templates:
        return []

    model = SentenceTransformer('all-MiniLM-L6-v2')

    questions_text = [template['question'] for template in templates]
    embeddings = model.encode(questions_text, convert_to_tensor=True)
    new_embedding = model.encode(question, convert_to_tensor=True)

    similarities = util.pytorch_cos_sim(new_embedding, embeddings)[0]

    # torch.topk selects the k LARGEST scores, i.e. the most similar templates.
    # k is clamped because topk raises when k exceeds the number of scores.
    k = min(2, len(templates))
    top_indices = torch.topk(similarities, k=k).indices.tolist()

    return [(templates[idx]['question'], templates[idx]['sparql']) for idx in top_indices]

In [6]:
get_similar_questions("The author Sameh S. Askar is primarily affiliated to which institution?", q_templates)

[("Do the authors of 'Assessing the veracity of identity assertions via OSNs' have Duke University as their primary affiliation?",
  "ASK { <https://dblp.org/rec/conf/comsnets/SirivianosKGY12> <https://dblp.org/rdf/schema#authoredBy> ?x . ?x <https://dblp.org/rdf/schema#primaryAffiliation> 'Duke University, Durham, USA' }"),
 ("How many different affiliations do the authors of 'The Poetess Archive Database' have?",
  'SELECT (COUNT(DISTINCT ?answer) AS ?count) WHERE { <https://dblp.org/rec/journals/dhq/Mandell09> <https://dblp.org/rdf/schema#authoredBy> ?x . ?x <https://dblp.org/rdf/schema#primaryAffiliation> ?answer }')]

In [7]:
def get_similar_question_embeddings(question, corpus, model):
    """Return the two corpus entries most similar to ``question``.

    Similarity is the cosine similarity between embeddings produced by the
    ``OpenAIEmbeddings`` client pointed at the inference endpoint below.

    Args:
        question: natural-language question to match.
        corpus: mapping whose values are dicts with ``'question'`` and
            ``'sparql'`` keys.
        model: name of the embedding model passed to ``OpenAIEmbeddings``.

    Returns:
        list[tuple[str, str]]: up to two ``(question, sparql)`` pairs, most
        similar first; an empty list when ``corpus`` is empty.
    """
    # Materialise the values once instead of rebuilding the list per lookup.
    entries = list(corpus.values())
    if not entries:
        return []

    embeddings = OpenAIEmbeddings(
        openai_api_base="https://llms-inference.innkube.fim.uni-passau.de",
        api_key=api_key,
        model=model,
    )
    questions_text = [entry['question'] for entry in entries]
    # Embed the whole corpus in one batched call rather than issuing one
    # request per question.
    corpus_embeddings = embeddings.embed_documents(questions_text)
    query_embedding = embeddings.embed_query(question)
    cos_sim = cosine_similarity([query_embedding], corpus_embeddings)[0]
    top_n = sorted(enumerate(cos_sim), key=lambda x: x[1], reverse=True)[:2]

    return [(entries[idx]['question'], entries[idx]['sparql']) for idx, _score in top_n]
    


In [8]:
import re
import codecs

def preprocess_text(text):
    r"""Normalise a raw question string before entity extraction.

    Steps, in order:

    1. Decode literal escape sequences such as ``\u00F3`` into the character
       they denote, leaving characters that are already non-ASCII unchanged.
    2. Remove escaped periods (a backslash followed by ``.``) entirely.
    3. Replace a comma between two word-like tokens with a single space,
       e.g. ``Eick, Angela`` becomes ``Eick Angela``.

    Args:
        text: the question string.

    Returns:
        str: the normalised text. If the escape sequences are malformed the
        decoding step is skipped and the remaining steps still run.
    """
    try:
        # Decoding a str directly with 'unicode_escape' mangles characters
        # that are already non-ASCII (they come back as mojibake). Encoding to
        # Latin-1 with 'backslashreplace' first turns every character into a
        # byte or an escape that the decoder maps back to the same character.
        text = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeError:
        pass  # Malformed escape sequence: keep the text as it is.
    # Drop escaped periods (backslash + '.') completely.
    text = text.replace('\\.', '')
    # Remove comma between two name-like words
    text = re.sub(r'(\b[\w\-\']+\b),\s*(\b[\w\-\']+\b)', r'\1 \2', text)
    return text

In [45]:
import requests

# Exploratory query against the DBLP author-search API to inspect the shape of
# the JSON it returns. A timeout keeps the cell from hanging indefinitely if
# the service does not answer.
response = requests.get(
    'https://dblp.org/search/author/api',
    params={'q': "Eick Angela", 'format': 'json'},
    timeout=10,
)
data = response.json()
data['result']['hits']['hit']

[{'@score': '2',
  '@id': '833159',
  'info': {'author': 'Angela A. Eick', 'url': 'https://dblp.org/pid/155/2756'},
  'url': 'URL#833159'}]

In [46]:
len(data['result']['hits']['hit'])

1

In [49]:
import requests
def get_author_id(author_name):
    """Look up an author through the DBLP author-search API.

    Args:
        author_name: name to search for.

    Returns:
        tuple: ``(url, n_hits)`` where ``url`` is the ``info.url`` of the first
        hit and ``n_hits`` is the number of hits in the response. When the
        request does not return HTTP 200, or the response contains no hits,
        ``(None, 0)`` is returned so callers can always unpack two values.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the request exceeds the timeout.
    """
    response = requests.get(
        'https://dblp.org/search/author/api',
        params={'q': author_name, 'format': 'json'},
        timeout=10,  # do not hang forever on an unresponsive service
    )
    if response.status_code != 200:
        print("Request failed with status:", response.status_code)
        return None, 0

    # The 'hit' list may be missing or empty when nothing matches, so every
    # level is read defensively instead of indexing straight into it.
    hits = response.json().get('result', {}).get('hits', {}).get('hit', [])
    if not hits:
        return None, 0
    return hits[0]['info']['url'], len(hits)

In [66]:
def build_regex_pattern(name: str) -> str:
    """Build the regex-matching hint used when no DBLP id is supplied.

    The name is lower-cased, stripped and split on whitespace; periods are
    removed from each part and the parts are joined with ``.*``. The resulting
    pattern is embedded in a fixed hint string containing a SPARQL
    ``FILTER(REGEX(...))`` example.

    Args:
        name: author name, e.g. ``"K. Cheng"``.

    Returns:
        str: the hint text with the pattern filled in (``k.*cheng`` for the
        example above).
    """
    tokens = []
    for token in name.lower().strip().split():
        tokens.append(token.replace(".", ""))
    pattern = ".*".join(tokens)
    return f'No id is available, use regex matching, exp: ?author rdfs:label ?name .\nFILTER(REGEX(?name, "{pattern}", "i"))'
    

In [67]:
build_regex_pattern("K. Cheng")

'No id is available, use regex matching, exp: ?author rdfs:label ?name .\nFILTER(REGEX(?name, "k.*cheng", "i"))'

In [68]:
import spacy
nlp = spacy.load("en_core_web_lg")

def extract_author_dblp_ids(text):
    """Map each PERSON entity found in ``text`` to a DBLP id or a regex hint.

    The text is run through the module-level spaCy pipeline ``nlp``. For every
    entity labelled ``PERSON`` the author is looked up with ``get_author_id``:

    * if an id is found and the name is unambiguous (or the text contains a
      single entity), the value is the id returned by the lookup;
    * if the lookup yields several matches while the text contains more than
      one entity, or if no id could be obtained at all, the value is the hint
      produced by ``build_regex_pattern``.

    Args:
        text: the (pre-processed) question.

    Returns:
        dict[str, str]: entity text -> DBLP id or regex hint. Empty when no
        PERSON entity is detected.
    """
    authors = {}
    doc = nlp(text)
    # NOTE(review): this counts every entity, not only PERSON ones - confirm
    # that this is the intended ambiguity criterion.
    nbr_ent = len(doc.ents)
    for ent in doc.ents:
        if ent.label_ != 'PERSON':
            continue

        # A lookup without hits used to surface as KeyError/IndexError, and a
        # failed request as None; both are treated as "no id available".
        try:
            lookup = get_author_id(ent.text)
        except (KeyError, IndexError):
            lookup = None
        match, nbr_matches = lookup if lookup is not None else (None, 0)

        if match is None or (nbr_matches > 1 and nbr_ent > 1):
            authors[ent.text] = build_regex_pattern(ent.text)
        else:
            authors[ent.text] = match
    return authors
            

In [62]:
extract_author_dblp_ids("Which papers did Ji Shaoxiong publish in the last 8 years?")

{'Ji Shaoxiong': 'https://dblp.org/pid/227/0291'}

In [52]:
def extract_paper_titles(text):
    """Extract single-quoted paper titles from a question.

    Two case-insensitive patterns are tried in order and the first one that
    matches wins:

    1. a comparison question: two quoted titles together with the phrase
       "which one" followed by "published" or "authors", with the titles
       either before or after that phrase;
    2. a single quoted title directly preceded by "paper" or "authors of".

    Args:
        text: the question to scan.

    Returns:
        list[str] | None: the captured titles, or ``None`` when neither
        pattern matches.
    """
    comparison_rx = r"'([^']+)'.*?'([^']+)'.*?which one.*?(?:published|authors)|which one.*?(?:published|authors).*?'([^']+)'.*?'([^']+)'"
    single_title_rx = r"(?:\bpaper\s+|\bauthors\s+of\s+)'([^']+)'"

    for rx in (comparison_rx, single_title_rx):
        found = re.search(rx, text, re.IGNORECASE)
        if found is None:
            continue
        # Groups belonging to the alternative that did not match are None.
        return [title for title in found.groups() if title is not None]
    return None

In [None]:
def get_paper_id(paper_title):
    """Look up a publication through the DBLP publication-search API.

    Args:
        paper_title: title to search for.

    Returns:
        The ``info.url`` of the first hit, or ``None`` when the request does
        not return HTTP 200 or the response contains no hits.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the request exceeds the timeout.
    """
    response = requests.get(
        'https://dblp.org/search/publ/api',
        params={'q': paper_title, 'format': 'json'},
        timeout=10,  # do not hang forever on an unresponsive service
    )
    if response.status_code != 200:
        print("Request failed with status:", response.status_code)
        return None

    # The 'hit' list may be missing or empty when nothing matches, so every
    # level is read defensively instead of indexing straight into it.
    hits = response.json().get('result', {}).get('hits', {}).get('hit', [])
    if not hits:
        return None
    return hits[0]['info']['url']

In [16]:
def extract_paper_ids(question):
    """Resolve every paper title quoted in ``question`` to a DBLP id.

    Titles are found with ``extract_paper_titles`` and each one is looked up
    with ``get_paper_id``.

    Args:
        question: the (pre-processed) question.

    Returns:
        dict | None: title -> value returned by ``get_paper_id`` for ALL
        extracted titles, or ``None`` when no title is found.
    """
    papers = extract_paper_titles(question)
    if not papers:
        return None
    # The mapping is built for every title before returning; previously the
    # return sat inside the loop, so only the first title was ever resolved.
    return {paper: get_paper_id(paper) for paper in papers}

In [69]:
def prompt(question, q_templates):
    """Assemble the user prompt for SPARQL generation.

    The prompt contains the original question, the two similar template
    questions returned by ``get_similar_questions`` with their SPARQL queries,
    and the paper / author ids extracted from the pre-processed question.

    Args:
        question: the natural-language question.
        q_templates: template mapping forwarded to ``get_similar_questions``.

    Returns:
        str: the formatted prompt text.
    """
    # Exactly two (question, sparql) pairs are expected here.
    (first_question, first_sparql), (second_question, second_sparql) = get_similar_questions(question, q_templates)

    # Entity extraction works on the cleaned text; the prompt itself keeps the
    # question exactly as it was asked.
    cleaned_question = preprocess_text(question)
    author_ids = extract_author_dblp_ids(cleaned_question)
    paper_ids = extract_paper_ids(cleaned_question)

    return f"Question: {question}\nSimilar Question 1: {first_question}\n{first_sparql}\nSimilar Question 2: {second_question}\n{second_sparql}\nPaper ids: {paper_ids}\nAuthor ids: {author_ids}"

In [70]:
print(prompt("Find the paper written by Carlee Joe-Wong with Liang Zheng that was published in INFOCOM.", q_templates))

Question: Find the paper written by Carlee Joe-Wong with Liang Zheng that was published in INFOCOM.
Similar Question 1: Report the count of papers that Fanggang Wang has published in IEEE Wirel. Commun..
SELECT (COUNT(DISTINCT ?answer) AS ?count) WHERE { ?answer <https://dblp.org/rdf/schema#authoredBy> <https://dblp.org/pid/84/2752> . ?answer <https://dblp.org/rdf/schema#publishedIn> 'IEEE Wirel. Commun.' }
Similar Question 2: When was the paper on Manufacturing knowledge models by Loughborough University published?
SELECT DISTINCT ?answer WHERE { <https://dblp.org/rec/journals/ao/ChungooraY11> <https://dblp.org/rdf/schema#authoredBy> ?x . ?x <https://dblp.org/rdf/schema#primaryAffiliation> 'Loughborough University, UK' . <https://dblp.org/rec/journals/ao/ChungooraY11> <https://dblp.org/rdf/schema#yearOfPublication> ?answer }
Paper ids: None
Author ids: {'Carlee Joe-Wong': 'https://dblp.org/pid/40/9937', 'Liang Zheng': 'No id is available, use regex matching, exp: ?author rdfs:label ?n

In [79]:
def nl2sparql(question, templates, model="qwen2.5"):
    """Translate a natural-language question into a SPARQL query via an LLM.

    The user message is built with ``prompt(question, templates)`` and sent,
    together with a fixed system instruction, to the chat-completions endpoint
    of the inference service using ``api_key``.

    Args:
        question: the natural-language question.
        templates: template mapping forwarded to ``prompt``.
        model: name of the chat model to query.

    Returns:
        The content of the first choice in the completion response.
    """
    # Build the user message before creating the client.
    user_message = prompt(question, templates)

    llm_client = openai.OpenAI(
        api_key=api_key,
        base_url="https://llms-inference.innkube.fim.uni-passau.de",
    )

    # Same text as a three-line block string: the line breaks and the leading
    # four spaces of the continuation lines are part of the instruction.
    system_message = (
        "You are an expert in sparql query generation. Given a question, a similar question template, \n"
        "    its sparql query and DBLP entity ids if available, generate a sparql query that answers the question. If you can't generate the query\n"
        "    return Nan. Provide only the sparql query without any explanation or additional text."
    )

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    completion = llm_client.chat.completions.create(
        temperature=0.6,
        model=model,
        messages=chat_messages,
    )

    return completion.choices[0].message.content


In [80]:
nl2sparql("Find the paper written by Carlee Joe-Wong with Liang Zheng that was published in INFOCOM.", q_templates)

'SELECT DISTINCT ?paper WHERE { ?paper <https://dblp.org/rdf/schema#authoredBy> <https://dblp.org/pid/40/9937> . ?paper <https://dblp.org/rdf/schema#authoredBy> ?author . ?author rdfs:label ?name . FILTER(REGEX(?name, "liang.*zheng", "i")) . ?paper <https://dblp.org/rdf/schema#publishedIn> \'INFOCOM\' }'