<a href="https://colab.research.google.com/github/amitca71/colab/blob/main/generic_cypher_phi3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai neo4j --upgrade

Collecting openai
  Downloading openai-1.35.14-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting neo4j
  Downloading neo4j-5.22.0-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.5/293.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
%%capture
# %%capture must stay on the first line of the cell; it hides the pip output.
# Install Unsloth straight from its GitHub repository with the "colab-new" extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install these packages without resolving their dependencies;
# xformers and trl are restricted to the version ranges given here.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# NOTE(review): torch is upgraded last, after the packages above were installed
# without dependency resolution — confirm the resulting versions are compatible.
!pip install torch --upgrade

In [3]:
# Inference settings; the model-loading cell forwards these values to
# FastLanguageModel.from_pretrained.
inference_model_name = "amitca71/lora_model_text2cypher_unsloth_phi-3-mini-4k-instruct"
max_seq_length = 2048   # passed as max_seq_length
dtype = None            # presumably lets the loader pick the dtype — TODO confirm
load_in_4bit = False    # passed as load_in_4bit

In [4]:
from google.colab import drive
import os,subprocess
# Mount Google Drive so the env file holding the credentials becomes readable.
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/"
# os.path.join avoids the doubled slash that naive string concatenation
# produced because root_dir already ends with "/".
env_file = os.path.join(root_dir, "Colab Notebooks", "env")
def get_env_vars(conf_file_path):
    """Load ``KEY=VALUE`` pairs from a shell-style env file.

    Every parsed pair is returned and also exported into ``os.environ``.

    Parsing rules:
      * double quotes are removed from the whole file content;
      * blank lines, ``#`` comment lines and lines without ``=`` are skipped;
      * a leading ``export`` keyword in front of the key is dropped;
      * surrounding whitespace is stripped from keys and values.

    Args:
        conf_file_path: Path of the env file to read.

    Returns:
        dict mapping variable names to their string values. The dict is empty
        when the file cannot be read (a message is printed in that case).
    """
    variables = {}
    try:
        # Read the file directly instead of shelling out to `cat`.
        with open(conf_file_path, encoding="utf-8") as conf_file:
            content = conf_file.read()
    except OSError as err:
        # Best effort: an unreadable file yields no variables instead of a crash.
        print(f"Could not read env file {conf_file_path!r}: {err}")
        return variables

    for raw_line in content.replace('"', '').splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        # Drop only a leading `export` keyword; a blanket replace would also
        # corrupt keys that merely contain the substring "export".
        words = key.split(None, 1)
        if len(words) == 2 and words[0] == "export":
            key = words[1]
        key = key.strip()
        value = value.strip()
        if key:
            variables[key] = value
            os.environ[key] = value
    return variables
# Read the credentials once; a missing key raises KeyError right here instead
# of failing later inside a driver or model call.
env_vars = get_env_vars(env_file)
hf_token = env_vars['HUGGINGFACE_TOKEN']
neo4j_username = env_vars['NEO4J_USERNAME']
neo4j_password = env_vars['NEO4J_PASSWORD']
neo4j_uri = env_vars['NEO4J_URI']
# NOTE(review): the key is spelled OPEN_API_KEY (not OPENAI_API_KEY) — confirm
# this matches the name used in the env file.
openai_key = env_vars['OPEN_API_KEY']

Mounted at /content/gdrive


In [5]:
# Cypher statements used to introspect the database schema. All three call
# apoc.meta.data() and differ only in the rows they keep and the map they return.

# Node properties: keeps rows with elementType "node" whose type is not
# "RELATIONSHIP", collects the property names per label and returns
# {labels, properties}.
node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE NOT type = "RELATIONSHIP" AND elementType = "node"
WITH label AS nodeLabels, collect(property) AS properties
RETURN {labels: nodeLabels, properties: properties} AS output

"""

# Relationship properties: keeps rows with elementType "relationship" and type
# "RELATIONSHIP" and returns {type, properties}. The alias is still called
# nodeLabels here although it is emitted under the key "type".
rel_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE type = "RELATIONSHIP" AND elementType = "relationship"
WITH label AS nodeLabels, collect(property) AS properties
RETURN {type: nodeLabels, properties: properties} AS output
"""

# Relationships: keeps rows with elementType "node" and type "RELATIONSHIP" and
# returns {source: label, relationship: property, target: other}.
rel_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
WHERE type = "RELATIONSHIP" AND elementType = "node"
RETURN {source: label, relationship: property, target: other} AS output
"""

In [12]:
from neo4j import GraphDatabase
from neo4j.exceptions import CypherSyntaxError
import openai
from transformers import TextStreamer

def schema_text(node_props, rel_props, rels):
    """Render the three schema query results as one block of prompt text.

    Args:
        node_props: Value shown under "Node properties".
        rel_props: Value shown under "Relationship properties".
        rels: Value shown under the source-to-target relationship heading.

    Returns:
        The schema description string; each argument is inserted using its
        default string formatting.
    """
    template = """
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  {node_props}
  Relationship properties are the following:
  {rel_props}
  Relationship point from source to target nodes
  {rels}
  Make sure to respect relationship types and directions
  """
    return template.format(node_props=node_props, rel_props=rel_props, rels=rels)


class Neo4jGPTQuery:
    """Answer natural-language questions by generating and running Cypher.

    On construction the database schema is read through the module-level
    introspection queries and embedded into the system prompt. ``run`` asks
    the language model for a Cypher statement, executes it, and retries once
    with the error message when Neo4j reports a syntax error.

    Args:
        url: Neo4j connection URI handed to ``GraphDatabase.driver``.
        user: Neo4j user name.
        password: Neo4j password.
        model: Object exposing ``generate(input_ids=..., streamer=...,
            max_new_tokens=..., use_cache=...)``.
        tokenizer: Object exposing ``apply_chat_template`` and ``decode``.
        max_new_tokens: Upper bound on generated tokens per request.
    """

    def __init__(self, url, user, password, model, tokenizer, max_new_tokens=128):
        self.driver = GraphDatabase.driver(url, auth=(user, password))
        self.tokenizer = tokenizer
        self.model = model
        self.max_new_tokens = max_new_tokens
        # Build the schema description once; call refresh_schema() after
        # the database structure changes.
        self.schema = self.generate_schema()

    def close(self):
        """Close the underlying Neo4j driver and release its connections."""
        self.driver.close()

    def generate_schema(self):
        """Query node properties, relationship properties and relationships
        and format them with ``schema_text``."""
        node_props = self.query_database(node_properties_query)
        rel_props = self.query_database(rel_properties_query)
        rels = self.query_database(rel_query)
        return schema_text(node_props, rel_props, rels)

    def refresh_schema(self):
        """Re-read the schema from the database and cache it on the instance."""
        self.schema = self.generate_schema()

    def get_system_message(self):
        """Return the system prompt, including the cached schema text."""
        return f"""
        Task: Generate Cypher queries to query a Neo4j graph database based on the provided schema definition.
        Instructions:
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema:
        {self.schema}

        Note: Do not include any explanations or apologies in your responses.
        """

    def query_database(self, neo4j_query, params=None):
        """Run ``neo4j_query`` and return ``[keys, row_values, ...]``.

        Args:
            neo4j_query: Cypher statement to execute.
            params: Optional query parameters; ``None`` means no parameters.

        Returns:
            A list whose first element is the result keys, followed by one
            entry per record holding that record's values.
        """
        # A None default avoids sharing one mutable dict between calls.
        if params is None:
            params = {}
        with self.driver.session() as session:
            result = session.run(neo4j_query, params)
            output = [record.values() for record in result]
            output.insert(0, result.keys())
            return output

    def construct_cypher(self, question, history=None):
        """Ask the model for a Cypher statement answering ``question``.

        Args:
            question: The user's natural-language question.
            history: Optional extra chat messages appended after the
                question (used by the retry flow in ``run``).

        Returns:
            The generated text after the last "Cypher query:" marker, or the
            whole completion when the marker is absent, stripped of
            surrounding whitespace.
        """
        messages = [
            {"role": "system", "content": self.get_system_message()},
            {"role": "user", "content": question},
        ]
        # Used for Cypher healing flows
        if history:
            messages.extend(history)
        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")

        # Stream with the instance's tokenizer rather than a notebook global.
        text_streamer = TextStreamer(self.tokenizer)
        outputs = self.model.generate(
            input_ids=inputs,
            streamer=text_streamer,
            max_new_tokens=self.max_new_tokens,
            use_cache=True,
        )
        # The generated sequence starts with the prompt tokens; decode only
        # the completion so the prompt text can never be returned as Cypher.
        prompt_length = inputs.shape[-1]
        completion = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        return completion.split("Cypher query:")[-1].strip()

    def run(self, question, history=None, retry=True):
        """Generate Cypher for ``question``, execute it and return the rows.

        Args:
            question: The user's natural-language question.
            history: Optional chat messages forwarded to ``construct_cypher``.
            retry: Whether one self-healing retry is still allowed.

        Returns:
            The ``query_database`` result, or the string
            ``"Invalid Cypher syntax"`` when the retry also fails to parse.
        """
        cypher = self.construct_cypher(question, history)
        print(cypher)
        try:
            return self.query_database(cypher)
        except CypherSyntaxError as e:
            # Out of retries: report the failure instead of looping.
            if not retry:
                return "Invalid Cypher syntax"
            # Self-healing flow: show the model its own query together with
            # the database error and ask once more.
            print("Retrying")
            return self.run(
                question,
                [
                    {"role": "assistant", "content": cypher},
                    {
                        "role": "user",
                        "content": f"""This query returns an error: {str(e)}
                        Give me a improved query that works without any explanations or apologies""",
                    },
                ],
                retry=False
            )


In [7]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from unsloth import FastLanguageModel
# Load the checkpoint named in the config cell together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=inference_model_name,  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


adapter_config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [13]:
# Connect to Neo4j with the credentials from the env file and attach the
# loaded model and tokenizer; the constructor also reads the database schema.
gds_db = Neo4jGPTQuery(
    url=neo4j_uri,
    user=neo4j_username,
    password=neo4j_password,
    model=model,
    tokenizer=tokenizer,
)

In [18]:
# Example question: generate the Cypher, run it and print whatever run() returns.
output = gds_db.run("how many doors in the project?")
print(output)

<|system|> Task: Generate Cypher queries to query a Neo4j graph database based on the provided schema definition.
        Instructions:
        Use only the provided relationship types and properties.
        Do not use any other relationship types or properties that are not provided.
        If you cannot generate a Cypher statement based on the provided schema, explain the reason to the user.
        Schema:
        
  This is the schema representation of the Neo4j database.
  Node properties are the following:
  [['output'], [{'labels': 'IfcLocalPlacement', 'properties': ['nid']}], [{'labels': 'IfcCartesianPoint', 'properties': ['nid', 'Coordinates']}], [{'labels': 'IfcShapeRepresentation', 'properties': ['nid', 'RepresentationIdentifier', 'RepresentationType']}], [{'labels': 'IfcProductDefinitionShape', 'properties': ['nid']}], [{'labels': 'IfcPolyline', 'properties': ['nid']}], [{'labels': 'IfcColourRgb', 'properties': ['nid', 'Red', 'Blue', 'Green']}], [{'labels': 'IfcPresentatio