In [105]:
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore

from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from IPython.display import Markdown, display


In [106]:
file_path = r"C:\Users\AAI47\poc2\flask-api\code\rdf\rdf.txt"

reader = SimpleDirectoryReader(input_files=[file_path])
documents = reader.load_data()
print(f"Loaded {len(documents)} documents")


Loaded 1 documents


In [107]:
import requests
from typing import Optional, List, Mapping, Any

from llama_index.core import SimpleDirectoryReader, SummaryIndex
from llama_index.core.callbacks import CallbackManager
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core import Settings


class LLM(CustomLLM):
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "custom"
    api_url: str = "http://px101.prod.exalead.com:8110/v1/chat/completions"
    base_prompt = "Convert this natural language query to a SPARQL query based on the rdf/owl provided. map each element in the NLQ to It correspond ontology/class and relationship with properties defined in the rdf. Please respond only with SparQL Query "
    headers = {
        'Authorization': 'Bearer vtYvpB9U+iUQwl0K0MZIj+Uo5u6kilAZJdgHGVBEhNc=',
        'Content-Type': 'application/json'
    }
    base_prompt: str = """
You are an expert in transforming natural language queries into UQL queries using a specified ontology. Based on user questions about data stored in an RDF graph, employ the provided ontology, documentation, and steps to understand the UQL syntax. Then, generate the UQL query equivalent for the given natural language query. The response should contain only the UQL query without any explanations or additional text.

** UQL Documentation **
    UQL queries are expressed in a pseudo UQL format with operators like AND, OR, NOT. Attribute names should be placed in square brackets. Special characters in attribute names need to be escaped with '\\\\'. 
    1. **Basic UQL Structure**:
        - Attribute names cannot contain characters like .:%#[]$;{{}}.
        - A mapping service translates the names exposed to the user and the names used by Cloudview.
        - Use square brackets for predicate names. If a predicate is unknown, replace it with #false.
        - Escape the first square bracket ‘[’ with ‘\\\\’ to cancel the attribute name mapping, or use quotes ‘“’ to disable it inside quotes.

**Detailed UQL Query Construction Process **
    1. **Identify Relevant Classes and Properties**: Review the ontology to determine which classes or properties are relevant to the query.
    2. **Map Natural Language to RDF Classes**: Use the ontology to correlate the identified natural language elements with the appropriate RDF classes and predicates.Translate terms like 'physical products' and 'products' directly to their RDF class equivalents based on their definitions or equivalences in the ontology.
    3. **Construct the UQL Query**: Formulate the UQL query based on these mappings, ensuring to use only those RDF classes and properties directly relevant or defined as equivalent.

**Example UQL Query Construction**
    Given the natural language query: "Show me all documents created 1 juin 2024 , by John Doe"
    - **Step 1: Parse the Natural Language Query**
        - Subject: Documents .
        - Predicate: Created by and date of creation .
        - Object: John Doe and 1 juin 2024 .

    - **Step 2: Map to RDF Concepts**
        - "Documents" corresponds to instances of the 'Document' class.
        - "Created by" maps to the 'ds6w:lastModifiedBy' or 'ds6w:responsible'.
        - "Created 1 juin 2024 " maps to 'ds6w:created' .
        - "John Doe" corresponds to instances of the 'Person' class

    - **Step 3: Formulate the UQL Query**
        -[ds6w:created]>=\"2024-06-01T00:00:00.000Z\" AND [ds6w:created]<=\"2024-06-01T23:59:59.000Z\" AND [ds6w:type]:\"Document\" AND (([ds6w:lastModifiedBy]:\"John Doe\" OR [ds6w:responsible]:\"John Doe\")
    
    ** Example UQL Queries **
   - Example 1:
        - Natural Language: give me products that are created between 2024-05-01 to 2024-05-28 by : MCM OCDxComplianceUser
        - UQL: [ds6w:created]>="2024-05-01T00:00:00.000Z" AND [ds6w:created]<="2024-05-28T23:59:59.000Z" AND [ds6w:type]:"VPMReference"  AND (([ds6w:lastModifiedBy]:"MCM OCDxComplianceUser" OR [ds6w:responsible]:"MCM OCDxComplianceUser") 

Based on the natural language query "{query}" and the current date if needed, generate the corresponding UQL query using the ontology and RDF relationships. Ensure the output strictly adheres to the syntax and ontology requirements without adding or assuming types not explicitly defined.

Please respond ONLY with the valid UQL query.
"""

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self,prompt: str, **kwargs: Any) -> CompletionResponse:
        full_prompt = self.base_prompt + prompt

        messages = [{"role": "user", "content": full_prompt}]
        payload = {
            "model":"meta-llama/Meta-Llama-3-8B-Instruct",
            "messages": messages,
            "max_tokens": 1000,
            "top_p": 1,
            "stop": ["string"],
            "response_format": {
                "type": "text", 
                "temperature": 0.7
            }
        }
        response = requests.post(self.api_url, headers=self.headers, json=payload)
        if response.status_code == 200:
            generated_response = response.json()['choices'][0]['message']['content'].strip()
            return CompletionResponse(text=generated_response)
        else:
            return CompletionResponse(text="Error: API request failed")

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        full_prompt = self.base_prompt + prompt

        messages = [{"role": "user", "content": full_prompt}]
        payload = {
            "model":"meta-llama/Meta-Llama-3-8B-Instruct",
            "messages": messages,
            "max_tokens": 1500,
            "top_p": 1,
            "stop": ["string"],
            "response_format": {
                "type": "text", 
                "temperature": 0.7
            }
        }
        response = requests.post(self.api_url, headers=self.headers, json=payload)
        if response.status_code == 200:
            generated_response = response.json()['choices'][0]['message']['content'].strip()
            for token in generated_response:
                yield CompletionResponse(text=token, delta=token)
        else:
            yield CompletionResponse(text="Error", delta="Error")

In [108]:
import numpy as np
from typing import List
from llama_index.core.embeddings import BaseEmbedding
import requests

class CustomAPIEmbeddings(BaseEmbedding):
    _api_key: str = "vtYvpB9U+iUQwl0K0MZIj+Uo5u6kilAZJdgHGVBEhNc="
    _embeddings_url: str = "http://px101.prod.exalead.com:8110/v1/embeddings"
    _headers = {
        'Authorization': 'Bearer ' + _api_key,
        'Content-Type': 'application/json'
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @classmethod
    def class_name(cls) -> str:
        return "custom_api"

    def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_embeddings([query], "Represent a document for semantic search:")[0]

    def _get_query_embedding(self, query: str) -> List[float]:
        return self._get_embeddings([query], "Represent a document for semantic search:")[0]

    def _get_text_embedding(self, text: str) -> List[float]:
        return self._get_embeddings([text], "Represent a document for semantic search:")[0]

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        return self._get_embeddings(texts, "Represent a document for semantic search:")

    def _get_embeddings(self, texts: List[str], instruction: str) -> List[List[float]]:
        payload = {
            "model": "BAAI/bge-large-en-v1.5",
            "input": texts,
            "encoding_format": "float",
            "instruct": instruction,
        }
        response = requests.post(self._embeddings_url, headers=self._headers, json=payload)
        if response.status_code == 200:
            response_data = response.json()
            embeddings_list = [item['embedding'] for item in response_data['data']]
            return np.array(embeddings_list).tolist()  
        else:
            raise Exception(f"Failed to get embeddings: {response.status_code}, {response.text}")


In [109]:
Settings.llm = LLM()
Settings.chunk_size = 512
embed_model = CustomAPIEmbeddings(embed_batch_size=2)
Settings.embed_model = embed_model

In [110]:
from llama_index.core import StorageContext

graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

index =KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=20,
    include_embeddings=True,
)

In [116]:
chunks = index.get_chunks()


AttributeError: 'KnowledgeGraphIndex' object has no attribute 'get_chunks'

: 

In [115]:
query_engine = index.as_query_engine(
    include_text=False, response_mode="tree_summarize"
)
response = query_engine.query(
    "give me  type persons",
)
print(response)

[Owl:class]:"Pno:person"
