# Agent Import

In [134]:
%load_ext dotenv
%dotenv
import os

from IPython.display import display, Markdown

os.getenv("OPENAI_API_KEY")[:5]

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


'sk-oD'

In [135]:
# from langchain_openai import ChatOpenAI
# llm = ChatOpenAI()

from langchain_community.chat_models import ChatOllama

# Chat model used by the schema chains: Ollama's "llama3" with an explicit
# stop sequence and a sampling temperature of 0.
llm = ChatOllama(
    model="llama3",
    stop=["<|eot_id|>"],
    temperature=0,
)

In [136]:
from random import shuffle


def reduce_dict(original):
  """Return a down-sampled copy of a nested JSON-like structure.

  Lists are cut down to at most two randomly chosen elements, dictionaries
  are rebuilt with every value reduced recursively, and any other value is
  returned unchanged.  The input itself is not modified.

  Args:
    original: A value built from dicts, lists and scalar values.

  Returns:
    A structure of the same shape in which no list has more than two items.
  """
  if isinstance(original, list):
    # Shuffle a copy so the caller's list keeps its order and contents.
    candidates = list(original)
    shuffle(candidates)
    return [reduce_dict(item) for item in candidates[:2]]

  if isinstance(original, dict):
    # Every value is reduced, including the last one; an empty dict gives {}.
    return {key: reduce_dict(value) for key, value in original.items()}

  return original

# reduce_dict({
#   "a": 1,
#   "b": {
#     "c": 2,
#     "d": 3
#   },
#   "e": [1, 2, 3, 4, 5],
#   "f": [
#     {"g": [1,2,3,4], "h": [1,2,3,4]},
#     {"i": [1,2,3,4], "j": [1,2,3,4]},
#   ],
# })

In [137]:
# Directory holding the JSON documents to sample.
# NOTE(review): machine-specific absolute path, and the name shadows the
# built-in `dir` — consider a configurable constant instead.
dir = "/Users/adam/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-1a58e29c-f0c0-4786-8f41-c1508e581c68/import/tests_json/"

import os
from json import loads, dumps

# Reduced copies of every document that could be read and parsed.
dicts = []

for filename in os.listdir(dir):
  if not filename.endswith(".json"):
    continue

  try:
    # The context manager closes the handle even if reading or parsing fails.
    with open(os.path.join(dir, filename), "r", encoding="utf-8") as handle:
      d = loads(handle.read())

    reduced = reduce_dict(d)
    dicts.append(reduced)
  except Exception as err:
    # Best effort: report the offending file together with the reason and
    # carry on, without swallowing KeyboardInterrupt/SystemExit the way a
    # bare `except:` does.
    print(f"{filename}: {err!r}")

# Randomise the order so that index-based picks further down are random.
shuffle(dicts)

In [43]:
from langchain_core.prompts import PromptTemplate

# Prompt that asks the model to infer a JSON schema from the text supplied as
# `{json}`.  The doubled braces in the example are literal braces in the
# rendered prompt, not template variables.
summarise = PromptTemplate.from_template("""
Given the following JSON document,
create a single JSON schema specification that represents the structure of the document.

{json}

Respond with only the JSON schema specification.
All properties should include a description with an example value - for example {{"name": {{"type": "string", "description": "The name of the person - eg. Adam Cowley"}}}}.
""")


In [45]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

# Model handed to JsonOutputParser through `pydantic_object` in this notebook.
# Documented with comments rather than a docstring so the model definition
# itself stays untouched.
class JSONSchemaSpecification(BaseModel):
    # Text of the generated JSON schema.
    jsonschema: str = Field(description="The JSON schema specification")
    # Free-form remarks about the schema.
    notes: str = Field(description="Any notes or comments about the schema")

# Pipeline: render the prompt, call the model, parse the reply as JSON.
structured_chain = (
    summarise
    | llm
    | JsonOutputParser(pydantic_object=JSONSchemaSpecification)
)

# Seed schema built from the first sampled document (sent as a one-element list).
schema = structured_chain.invoke(
    {"json": dumps(dicts[:1])}
)

In [46]:
# Inspect the schema produced for the first sampled document.
schema

{'$schema': 'http://json-schema.org/draft-07/schema#',
 'title': 'Cricket Match Data',
 'type': 'object',
 'properties': {'overs': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'over': {'type': 'integer',
      'description': 'The number of the over - eg. 1'},
     'deliveries': {'type': 'array',
      'items': {'type': 'object',
       'properties': {'batter': {'type': 'string',
         'description': 'The name of the batter - eg. Sachin Tendulkar'},
        'bowler': {'type': 'string',
         'description': 'The name of the bowler - eg. Brett Lee'},
        'non_striker': {'type': 'string',
         'description': 'The name of the non-striker - eg. Rohit Sharma'},
        'runs': {'type': 'object',
         'properties': {'batter': {'type': 'integer',
           'description': 'The number of runs scored by the batter - eg. 2'},
          'extras': {'type': 'integer',
           'description': 'The number of extras scored - eg. 1'}}},
        'wickets': {'type

In [47]:
# Next step: show the model further records so it can extend the existing
# schema to cover them.

# Prompt with two inputs: `{json}` (a new document) and `{jsonschema}` (the
# schema built so far).
revise_prompt = PromptTemplate.from_template("""
Given the following JSON document, create a single JSON schema specification that represents the structure of the document.

{json}

Apply any changes to the following JSON schema specification to include the new structure.
You may modify the schema but you must not remove any existing fields.  Instead, you should modify any existing fields to mark them as optional if they do not exist in the new schema.

Schema: {jsonschema}
""")

# Prompt -> model -> JSON parser, reusing the model and parser settings of the
# initial schema chain.
revise_chain = revise_prompt | llm | JsonOutputParser(pydantic_object=JSONSchemaSpecification)

In [48]:
# Fold the next nine sampled documents (indices 1-9) into the schema, one at a
# time.  Slicing rather than indexing means a sample of fewer than ten
# documents no longer raises IndexError.
for document in dicts[1:10]:
  schema = revise_chain.invoke({
    "json": dumps(document),
    # Serialise the running schema so the prompt contains valid JSON instead
    # of the Python repr of a dict.
    "jsonschema": dumps(schema),
  })

schema

{'$schema': 'http://json-schema.org/draft-07/schema#',
 'title': 'Cricket Match Data',
 'type': 'object',
 'properties': {'overs': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'over': {'type': 'integer',
      'description': 'The number of the over - eg. 1'},
     'deliveries': {'type': 'array',
      'items': {'type': 'object',
       'properties': {'batter': {'type': 'string',
         'description': 'The name of the batter - eg. Sachin Tendulkar'},
        'bowler': {'type': 'string',
         'description': 'The name of the bowler - eg. Brett Lee'},
        'non_striker': {'type': ['string', 'null'],
         'description': 'The name of the non-striker - eg. Rohit Sharma'},
        'runs': {'type': 'object',
         'properties': {'batter': {'type': 'integer',
           'description': 'The number of runs scored by the batter - eg. 2'},
          'extras': {'type': 'integer',
           'description': 'The number of extras scored - eg. 1'}}},
        'wicket

In [130]:
from langchain_core.output_parsers import StrOutputParser

# Prompt that turns the JSON schema plus one example record into a Neo4j data
# model.  Template inputs are `{jsonschema}`, `{example}` and `{additional}`;
# `additional` is pre-filled with an empty string via `.partial(...)`, so
# callers only supply the first two.
# (An earlier draft assigned to the same name was dead code — it was
# overwritten before ever being used — and has been dropped.)
model_prompt = PromptTemplate.from_template("""
You are graph database developer creating a data model for a Neo4j database from a JSON schema.

Schema:

```
{jsonschema}
```

Example record:
```
{example}
```

Identify every noun and proper noun in the schema, for example: Person, Director or Movie. These will be nodes in the graph.

Nodes and relationships contain properties, which are key-value pairs.  The only valid property types are string, number, boolean, date, time, and datetime.
You must convert any nested objects to a node with a corresponding relationship.
All arrays must be converted to relationships to another node.

Nodes may have multiple labels, so attempt to collapse node definitions into a single node with multiple labels.
For example, a Person may be both an Actor and a Director, so the main label will be :Person, and additional labels should be added for :Actor and :Director.

If a property is listed as a string, use your judgement to determine if it should be a property or a node in its own right with a relationship.
For example, `{{"person": {{"type": "string"}}}}` should be a node with label `:Person` and a `name` property.

If the pattern may be repeated, then you must use this information to create a node definition rather than a property.


{additional}

Do not include any properties that can be easily derived from the graph structure.  For example, it is not necessary to store a `moviesCount` property in a Person node.

Return a list of Nodes, along with their properties and sub-labels.  Start your list with the nodes that you think will have the most occurrences.

You cannot have an array of nodes as a property, these will be relationships in the graph.

Labels should be upper-camel-case, for example UpperCamelCase.
Relationship types should be upper-snake-case, for example UPPER_SNAKE_CASE.
Property keys should be lower-camel-case, for example lowerCamelCase.

Respond with only a JSON object containing keys for `nodes` and `relationships`, both containing an array of JSON schemas.

DO NOT RETURN ANY OTHER TEXT, EXPLAINATION OR MARKDOWN.
""").partial(additional="")

# Second model instance for the modelling step, with a sampling temperature
# of 0.2.
temp_llm = ChatOllama(model="llama3", stop=["<|eot_id|>"], temperature=0.2)

# Prompt -> model -> JSON parser (no pydantic model is attached here).
model_chain = model_prompt | temp_llm | JsonOutputParser()

# Both inputs are serialised with `dumps` so the prompt holds valid JSON; the
# schema used to be interpolated as the Python repr of a dict while the
# example record was already JSON.
model = model_chain.invoke({
  "jsonschema": dumps(schema),
  "example": dumps(dicts[0]),
})

model

{'nodes': [{'label': ':Match',
   'properties': {'id': {'type': 'string'}, 'date': {'type': 'datetime'}}},
  {'label': ':Player',
   'properties': {'name': {'type': 'string'}, 'runs': {'type': 'number'}}},
  {'label': ':Bowler', 'properties': {'name': {'type': 'string'}}},
  {'label': ':Over',
   'properties': {'number': {'type': 'number'},
    'deliveries': {'type': 'array',
     'items': {'label': ':Delivery',
      'properties': {'batter': {'type': 'string'},
       'bowler': {'type': 'string'},
       'non_striker': {'type': 'string'},
       'runs': {'type': 'object'}}}}}},
  {'label': ':Delivery',
   'properties': {'batter': {'type': 'string'},
    'bowler': {'type': 'string'},
    'non_striker': {'type': 'string'},
    'runs': {'type': 'object'}}}],
 'relationships': [{'type': 'DELIVERED', 'from': ':Player', 'to': ':Delivery'},
  {'type': 'BOWLED', 'from': ':Bowler', 'to': ':Delivery'},
  {'type': 'WICKET_FELL', 'from': ':Player', 'to': ':Match'}]}

In [155]:
from langchain_core.prompts import PromptTemplate

# Prompt asking the model for a GraphQL schema describing the document passed
# as `{json}`.  The text is wrapped in `<s>` / `[INST]` markers.
# (An earlier draft assigned to the same name was dead code — it was
# overwritten before ever being used — and has been dropped.)
graphql_summarise_prompt = PromptTemplate.from_template("""
<s>[INST]
You are a GraphQL expert designing an GraphQL API.
You have been provided with the following JSON file:
[/INST]
{json}
</s>

<s>[INST]
Your task is to create a GraphQL schema that accurately represents the structure of this document. Follow these steps:

1. **Identify the nouns:**  Nouns will describe the entites or types.
2. **Identify the verbs:** Verbs will indicate a relationship between two entities.
   Even if a property is not a relationship, you should consider whether it could be a relationship.
3. **Consider :** Make sure you inclue ALL properties mentioned in the document.
4. **Include all properties:** Make sure you inclue ALL properties mentioned in the document.

Respond with only a GraphQL schema.  Do not include any comments or preamble.
[/INST]</s>


""")

# Prompt -> model -> plain-text reply.
graphql_summarise_chain = (
    graphql_summarise_prompt
    | temp_llm
    | StrOutputParser()
)

# Generate a GraphQL schema for the second sampled document.
gql = graphql_summarise_chain.invoke(
    {"json": dumps(dicts[1])}
)

# Render the reply inside a fenced block so it is shown verbatim.
display(Markdown(f"\n```\n{gql}\n```\n"))



```
type CricketMatch {
  overs: [Over]
}

type Over {
  over: Int!
  deliveries: [Delivery]
}

type Delivery {
  batter: String!
  bowler: String!
  nonStriker: String!
  runs: RunInfo!
}

type RunInfo {
  batter: Int
  extras: Int
  total: Int
}

type Wicket {
  kind: String!
  playerOut: String!
}

input CricketMatchInput {
  overs: [OverInput]
}

type CricketMatchResult {
  wickets: [Wicket]
}

schema {
  query: Query
}
```


In [163]:
# Review prompt with two inputs: `{json}` (the source document) and `{schema}`
# (the GraphQL schema to check).  The JSON section used to interpolate
# `{schema}` as well, so the document passed as `json` never reached the model
# and it was shown the GraphQL schema twice.
# NOTE(review): step 3 of the prompt stops mid-sentence — complete or drop it.
revise_graphql_prompt = PromptTemplate.from_template("""
You are reviewing a GraphQL schema that was created from a JSON documen for accuracy.

JSON:
```
{json}
```

GraphQL Schema:
```
{schema}
```

Perform the following steps:
1. Check for missing properties and append them to the schema.  Include dates as appropriate.
2. Check the property keys for nouns and verbs and ensure that these are updated to be types.
   For example: `Movie {{ director: String }}` should be updated to `type Director {{ name: String }} type Movie {{ director: Director }}`.
3. Check for any property with a type of string that
""")

# Prompt -> model -> plain-text reply.
revise_graphql_chain = revise_graphql_prompt | temp_llm | StrOutputParser()

# Review the generated GraphQL schema against the third sampled document.
revised = revise_graphql_chain.invoke({"schema": gql, "json": dumps(dicts[2])})

display(Markdown(revised))

Based on the provided JSON schema, I will perform the following steps:

**Step 1: Check for missing properties and append them to the schema**

After reviewing the JSON schema, I noticed that there are no explicit date-related properties defined. To ensure accuracy, I will add a `date` property to each type that requires it.

* `CricketMatch`: Add a `date` property with type `Date!`.
* `Over`: Add a `date` property with type `Date!`.
* `Delivery`: Add a `date` property with type `Date!`.
* `RunInfo`: No date-related properties are required.
* `Wicket`: No date-related properties are required.

Here's the updated GraphQL schema:
```graphql
type CricketMatch {
  overs: [Over]
  date: Date!
}

type Over {
  over: Int!
  deliveries: [Delivery]
  date: Date!
}

type Delivery {
  batter: String!
  bowler: String!
  nonStriker: String!
  runs: RunInfo!
  date: Date!
}

type RunInfo {
  batter: Int
  extras: Int
  total: Int
}

type Wicket {
  kind: String!
  playerOut: String!
}

input CricketMatchInput {
  overs: [OverInput]
}

type CricketMatchResult {
  wickets: [Wicket]
}
```
**Step 2: Check the property keys for nouns and verbs and ensure that these are updated to be types**

After reviewing the JSON schema, I noticed that some property keys are not properly defined as types. Here's the corrected GraphQL schema:

* `CricketMatch`: Update `overs` to be an array of `Over` objects.
* `Over`: Update `deliveries` to be an array of `Delivery` objects.
* `Delivery`: Update `runs` to be a `RunInfo` object.

Here's the updated GraphQL schema:
```graphql
type CricketMatch {
  overs: [Over]
}

type Over {
  over: Int!
  deliveries: [Delivery]
}

type Delivery {
  batter: String!
  bowler: String!
  nonStriker: String!
  runs: RunInfo!
}

type RunInfo {
  batter: Int
  extras: Int
  total: Int
}

type Wicket {
  kind: String!
  playerOut: String!
}

input CricketMatchInput {
  overs: [OverInput]
}

type CricketMatchResult {
  wickets: [Wicket]
}
```
The updated GraphQL schema should now accurately reflect the JSON schema.