## Document Chunking

In [5]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [6]:
from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    api_version="2024-10-21",
    azure_deployment="text-embedding-ada-002"
)

In [3]:
embeddings.embed_query("Hello world")

[-0.005540425889194012,
 0.0047363233752548695,
 -0.015009919181466103,
 -0.027093535289168358,
 -0.015173893421888351,
 0.015173893421888351,
 -0.0176082756370306,
 0.009554633870720863,
 -0.00942219328135252,
 -0.030801868066191673,
 0.02631150558590889,
 0.011169145815074444,
 -0.023397814482450485,
 -0.009510486386716366,
 0.007700467016547918,
 0.010450183413922787,
 0.027572842314839363,
 -0.012581843882799149,
 0.012783657759428024,
 0.014845944941043854,
 -0.007164398208260536,
 -0.003342545125633478,
 0.0026251592207700014,
 0.007183318492025137,
 -0.019777776673436165,
 -0.003979520406574011,
 0.010633077472448349,
 -0.017456915229558945,
 0.0280773788690567,
 -0.030928000807762146,
 0.003411918645724654,
 -0.006385522428900003,
 -0.007643706630915403,
 -0.019626416265964508,
 0.00947895273566246,
 -0.01697760634124279,
 0.002305094851180911,
 -0.013332339935004711,
 0.020067883655428886,
 -0.017847929149866104,
 0.007240078877657652,
 0.009636620059609413,
 0.012178216129541

#### Python Code Chunking

In [6]:
from langchain_community.document_loaders import DirectoryLoader, PythonLoader

source_code_loader = DirectoryLoader(
    "../vanna", glob="**/*.py", loader_cls=PythonLoader
)
source_code_docs = source_code_loader.load()

In [7]:
len(source_code_docs)

79

In [8]:
source_code_docs[0]

Document(metadata={'source': '../vanna/tests/test_imports.py'}, page_content='def test_regular_imports():\n    from vanna.anthropic.anthropic_chat import Anthropic_Chat\n    from vanna.azuresearch.azuresearch_vector import AzureAISearch_VectorStore\n    from vanna.base.base import VannaBase\n    from vanna.bedrock.bedrock_converse import Bedrock_Converse\n    from vanna.chromadb.chromadb_vector import ChromaDB_VectorStore\n    from vanna.cohere.cohere_chat import Cohere_Chat\n    from vanna.cohere.cohere_embeddings import Cohere_Embeddings\n    from vanna.faiss.faiss import FAISS\n    from vanna.google.bigquery_vector import BigQuery_VectorStore\n    from vanna.google.gemini_chat import GoogleGeminiChat\n    from vanna.hf.hf import Hf\n    from vanna.local import LocalContext_OpenAI\n    from vanna.marqo.marqo import Marqo_VectorStore\n    from vanna.milvus.milvus_vector import Milvus_VectorStore\n    from vanna.mistral.mistral import Mistral\n    from vanna.ollama.ollama import Ollama

In [5]:
from langchain_text_splitters import (
    Language, RecursiveCharacterTextSplitter
)

In [8]:
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=500,
    chunk_overlap=50
)


In [None]:
python_docs = python_splitter.split_documents(source_code_docs)
len(python_docs)

21478

In [15]:
python_docs[2]

Document(metadata={'source': '../vanna/tests/test_imports.py'}, page_content='Anthropic_Chat')

#### JSON Chunking

In [20]:
from langchain_community.document_loaders import DirectoryLoader, JSONLoader

json_doc_loader = DirectoryLoader(
    "../vanna", glob="**/*.json", loader_cls=JSONLoader
)
json_docs = json_doc_loader.load()

Error loading file ../vanna/training_data/snowflake-cost/questions.json


TypeError: JSONLoader.__init__() missing 1 required positional argument: 'jq_schema'

In [19]:
json_docs

[Document(metadata={'source': '../vanna/training_data/snowflake-cost/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/cybersyn-financial-data/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/cybersyn-data-commons/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/sample-fraud/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/sample-retention/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/cybersyn-us-global-public/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/sample-imdb/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/tpc-h/questions.json'}, page_content=''),
 Document(metadata={'source': '../vanna/training_data/fivetran-ads-snowflake/questions.json'}, page_content=''),
 Document(metadata={'source': '.

#### GithubFileLoader

In [3]:
from langchain_community.document_loaders import GithubFileLoader

In [None]:
loader = GithubFileLoader(
    repo="vanna-ai/vanna",
    branch="main",
    access_token=os.getenv('GITHUB_ACCESS_TOKEN'),
    github_api_url="https://api.github.com",
    file_filter=lambda file_path: file_path.endswith(".md"),
)
documents = loader.load()

In [23]:
documents

[Document(metadata={'path': '.github/ISSUE_TEMPLATE/bug_report.md', 'sha': '977810a19a617b270c84db3815b90542fd21903f', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/.github/ISSUE_TEMPLATE/bug_report.md'}, page_content='---\r\nname: Bug report\r\nabout: Create a report to help us improve\r\ntitle: \'\'\r\nlabels: ["bug"]\r\nassignees: \'\'\r\n\r\n---\r\n\r\n**Describe the bug**\r\nA clear and concise description of what the bug is.\r\n\r\n**To Reproduce**\r\nSteps to reproduce the behavior:\r\n1. Go to \'...\'\r\n2. Click on \'....\'\r\n3. Scroll down to \'....\'\r\n4. See error\r\n\r\n**Expected behavior**\r\nA clear and concise description of what you expected to happen.\r\n\r\n**Error logs/Screenshots**\r\nIf applicable, add logs/screenshots to give more information about the issue.\r\n\r\n**Desktop (please complete the following information where):**\r\n - OS: [e.g. Ubuntu]\r\n - Version: [e.g. 20.04]\r\n - Python: [3.9]\r\n - Vanna: [2.8.0]\r\n\r\n**Additional context*

In [42]:
chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
).split_documents(documents)

In [43]:
len(chunks)

44

In [7]:
python_loader = GithubFileLoader(
    repo="vanna-ai/vanna",
    branch="main",
    access_token=os.getenv('GITHUB_ACCESS_TOKEN'),
    github_api_url="https://api.github.com",
    file_filter=lambda file_path: file_path.endswith(".py"),
)

In [8]:
python_docs = python_loader.load()

In [9]:
len(python_docs)

79

In [11]:
original = python_docs[0].page_content

In [12]:
original

'import re\nfrom typing import List\n\nimport pandas as pd\nfrom zhipuai import ZhipuAI\n\nfrom ..base import VannaBase\n\n\nclass ZhipuAI_Chat(VannaBase):\n    def __init__(self, config=None):\n        VannaBase.__init__(self, config=config)\n        if config is None:\n            return\n        if "api_key" not in config:\n            raise Exception("Missing api_key in config")\n        self.api_key = config["api_key"]\n        self.model = config["model"] if "model" in config else "glm-4"\n        self.api_url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"\n\n    # Static methods similar to those in ZhipuAI_Chat for message formatting and utility\n    @staticmethod\n    def system_message(message: str) -> dict:\n        return {"role": "system", "content": message}\n\n    @staticmethod\n    def user_message(message: str) -> dict:\n        return {"role": "user", "content": message}\n\n    @staticmethod\n    def assistant_message(message: str) -> dict:\n        return {

In [13]:
python_docs[0].page_content = "This is a new content"

In [14]:
python_docs[0]

Document(metadata={'path': 'src/vanna/ZhipuAI/ZhipuAI_Chat.py', 'sha': 'c9181b0295e6a54763075579f82eee8113091b06', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/src/vanna/ZhipuAI/ZhipuAI_Chat.py'}, page_content='This is a new content')

#### DocString and Comment Line Addition

In [30]:
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

In [29]:
gpt4o_mini_model = AzureChatOpenAI(
    api_version="2024-10-21",
    azure_deployment="gpt-4o-mini-2024-07-18",
    temperature=0,
    max_tokens=2000
)

In [31]:
doc_string_sys_message = "You are a DocString and commentline generator. " \
    "You will be given a code snippet and you will generate a docstring and comment lines for the code. " \
    "Return the provided code snippet with the docstring and comment lines added. " \
    "The docstring should be in the format of a Python docstring. " \
    "The comment lines should be in the format of Python comments. "

In [32]:
python_docs[0].page_content

'import re\nfrom typing import List\n\nimport pandas as pd\nfrom zhipuai import ZhipuAI\n\nfrom ..base import VannaBase\n\n\nclass ZhipuAI_Chat(VannaBase):\n    def __init__(self, config=None):\n        VannaBase.__init__(self, config=config)\n        if config is None:\n            return\n        if "api_key" not in config:\n            raise Exception("Missing api_key in config")\n        self.api_key = config["api_key"]\n        self.model = config["model"] if "model" in config else "glm-4"\n        self.api_url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"\n\n    # Static methods similar to those in ZhipuAI_Chat for message formatting and utility\n    @staticmethod\n    def system_message(message: str) -> dict:\n        return {"role": "system", "content": message}\n\n    @staticmethod\n    def user_message(message: str) -> dict:\n        return {"role": "user", "content": message}\n\n    @staticmethod\n    def assistant_message(message: str) -> dict:\n        return {

In [33]:
messages = [
    SystemMessage(content=doc_string_sys_message),
    HumanMessage(content=python_docs[0].page_content)
]

In [35]:
response = gpt4o_mini_model.invoke(messages)

In [37]:
response

AIMessage(content='```python\nimport re\nfrom typing import List\n\nimport pandas as pd\nfrom zhipuai import ZhipuAI\n\nfrom ..base import VannaBase\n\n\nclass ZhipuAI_Chat(VannaBase):\n    """\n    A class to interact with the ZhipuAI API for generating SQL queries and follow-up questions based on user input.\n    \n    This class extends the VannaBase class and provides methods to format messages, \n    add context to prompts, and generate SQL and Plotly code based on user questions.\n    \n    Attributes:\n        api_key (str): The API key for authenticating with the ZhipuAI service.\n        model (str): The model to be used for generating responses.\n        api_url (str): The URL for the ZhipuAI API endpoint.\n    """\n\n    def __init__(self, config=None):\n        """\n        Initializes the ZhipuAI_Chat instance with the provided configuration.\n\n        Args:\n            config (dict, optional): Configuration dictionary containing \'api_key\' and \'model\'.\n        \n   

In [40]:
from langchain_huggingface import HuggingFaceEmbeddings
embedding_python = HuggingFaceEmbeddings(
        model_name="microsoft/codebert-base"
    )

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name microsoft/codebert-base. Creating a new one with mean pooling.


In [23]:
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=500,
    chunk_overlap=50
)

In [24]:
python_chunks = python_splitter.split_documents(python_docs)

In [25]:
len(python_chunks)

2180

In [26]:
python_chunks[1].page_content

'class ZhipuAI_Chat(VannaBase):\n    def __init__(self, config=None):\n        VannaBase.__init__(self, config=config)\n        if config is None:\n            return\n        if "api_key" not in config:\n            raise Exception("Missing api_key in config")\n        self.api_key = config["api_key"]\n        self.model = config["model"] if "model" in config else "glm-4"\n        self.api_url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"'

In [38]:
messages = [
    SystemMessage(content=doc_string_sys_message),
    HumanMessage(content=python_chunks[1].page_content)
]
response = gpt4o_mini_model.invoke(messages)

In [39]:
response

AIMessage(content='```python\nclass ZhipuAI_Chat(VannaBase):\n    """\n    ZhipuAI_Chat is a subclass of VannaBase that initializes a chat model \n    using the provided configuration. It checks for the presence of an API key \n    and sets default values for the model if not specified.\n\n    Attributes:\n        api_key (str): The API key for authenticating requests.\n        model (str): The model to be used for chat completions, defaults to "glm-4".\n        api_url (str): The URL endpoint for the chat completions API.\n    """\n\n    def __init__(self, config=None):\n        """\n        Initializes the ZhipuAI_Chat instance.\n\n        Args:\n            config (dict, optional): A configuration dictionary that may contain \n                                      \'api_key\' and \'model\'. If None, defaults \n                                      will be used.\n\n        Raises:\n            Exception: If \'api_key\' is not present in the configuration.\n        """\n        # Call

In [41]:
embedding_python.embed_query(python_chunks[1].page_content)

[-0.3975783586502075,
 0.2779819965362549,
 0.26995208859443665,
 -0.04889015108346939,
 -0.46771231293678284,
 -0.7078947424888611,
 -0.1644677221775055,
 0.38149964809417725,
 0.5058804154396057,
 0.6304805874824524,
 -0.32642263174057007,
 0.8396518230438232,
 -0.30033883452415466,
 -0.1489625871181488,
 0.8165978193283081,
 -0.033967651426792145,
 0.2845640778541565,
 0.41337040066719055,
 -0.07939557731151581,
 0.30950579047203064,
 -0.3646925091743469,
 -0.23856252431869507,
 0.6692246794700623,
 -0.5027076601982117,
 0.6458678841590881,
 0.4917256534099579,
 -0.12268109619617462,
 0.8833966255187988,
 -0.5978009700775146,
 0.7823993563652039,
 -0.2443632185459137,
 0.014729096554219723,
 1.1373494863510132,
 0.3672619163990021,
 0.4696651101112366,
 -0.396048367023468,
 -0.36577120423316956,
 0.22262990474700928,
 0.04493819922208786,
 -0.43121054768562317,
 -0.1105307936668396,
 0.5286081433296204,
 -0.8674601912498474,
 -0.10306952893733978,
 0.5286516547203064,
 0.35676631331

In [44]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [45]:
vector_store.add_documents(chunks)

['97f6ef08-274d-4ab5-b395-7c1031d7d9fb',
 'f2198186-3ab8-465d-9873-046f8f0bbb0e',
 '07e8bc10-0b39-4ff0-ada7-bd092e3f5797',
 '467d59c9-ea1a-49d7-b41d-e41e1c021868',
 '321a90d8-8f71-43e5-854b-0609f50dd5cf',
 'd0d31a85-5432-4247-b3e3-0df21f4d819a',
 '1183de97-961b-4d69-9d71-93e479feca67',
 '6a85ba4a-db1a-49f3-9d49-a92e515f7090',
 '6a32ee94-70f8-484f-9c3a-144649080fd0',
 '91ed8d32-1615-4872-9c59-b29ca52e60cd',
 '174c7cb2-5689-493a-af85-d628269a35ed',
 'dd3620d1-f6cc-4240-a2e7-7f2637396fc3',
 '9c8df319-1ac4-45fc-a688-50d62e07c895',
 '15034c5c-e032-462c-8394-0258297f705e',
 'b8864c80-105a-4385-b4a9-029a3e7174c5',
 '34bdfdb9-61cc-4d9f-acc7-29ea15f88c23',
 '793585b7-91e4-41fb-9360-295ecf4776e2',
 '3bea3df6-58a3-4c2d-8cb1-426750c4ca72',
 '42c6a900-e14f-46e0-be35-0c6f9145a58a',
 'a18a56cc-30a8-4109-bfe9-4f9c08fae88b',
 'ef55b695-368e-4e7d-9070-0ee71999d839',
 'd6b6a648-4539-4075-8abb-3bf665d422ba',
 '3b48d321-e060-4713-9bb5-5aa134bbd861',
 '386329cc-81eb-4ba2-9e9c-08abd4a4c30e',
 '8aa8001b-5dc5-

In [46]:
vector_store.similarity_search("What is Vanna?")

[Document(id='467d59c9-ea1a-49d7-b41d-e41e1c021868', metadata={'path': 'README.md', 'sha': 'dd252217715486451a6a90d5cce2af2cb83c5918', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/README.md'}, page_content='| GitHub | PyPI | Documentation | Gurubase |\n| ------ | ---- | ------------- | -------- |\n| [![GitHub](https://img.shields.io/badge/GitHub-vanna-blue?logo=github)](https://github.com/vanna-ai/vanna) | [![PyPI](https://img.shields.io/pypi/v/vanna?logo=pypi)](https://pypi.org/project/vanna/) | [![Documentation](https://img.shields.io/badge/Documentation-vanna-blue?logo=read-the-docs)](https://vanna.ai/docs/) | [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20Vanna%20Guru-006BFF)](https://gurubase.io/g/vanna) |\n\n# Vanna\nVanna is an MIT-licensed open-source Python RAG (Retrieval-Augmented Generation) framework for SQL generation and related functionality.\n\nhttps://github.com/vanna-ai/vanna/assets/7146154/1901f47a-515d-4982-af50-f12761a3b2ce\n\n![vanna-quadra

In [47]:
retriever = vector_store.as_retriever()

In [56]:
docs = retriever.invoke("What is Vanna?")

In [57]:
docs

[Document(id='467d59c9-ea1a-49d7-b41d-e41e1c021868', metadata={'path': 'README.md', 'sha': 'dd252217715486451a6a90d5cce2af2cb83c5918', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/README.md'}, page_content='| GitHub | PyPI | Documentation | Gurubase |\n| ------ | ---- | ------------- | -------- |\n| [![GitHub](https://img.shields.io/badge/GitHub-vanna-blue?logo=github)](https://github.com/vanna-ai/vanna) | [![PyPI](https://img.shields.io/pypi/v/vanna?logo=pypi)](https://pypi.org/project/vanna/) | [![Documentation](https://img.shields.io/badge/Documentation-vanna-blue?logo=read-the-docs)](https://vanna.ai/docs/) | [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20Vanna%20Guru-006BFF)](https://gurubase.io/g/vanna) |\n\n# Vanna\nVanna is an MIT-licensed open-source Python RAG (Retrieval-Augmented Generation) framework for SQL generation and related functionality.\n\nhttps://github.com/vanna-ai/vanna/assets/7146154/1901f47a-515d-4982-af50-f12761a3b2ce\n\n![vanna-quadra

## LangGraph Agent Development

In [52]:
from langchain import hub

In [53]:
prompt = hub.pull("rlm/rag-prompt")



In [54]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [81]:
from langgraph.graph import START, END, StateGraph, MessagesState
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.runnables import RunnableConfig
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import AzureChatOpenAI
from typing import TypedDict

In [85]:
llm = AzureChatOpenAI(
    api_version="2024-10-21",
    azure_deployment="gpt-4o"
)

class State(TypedDict):
    question: str
    answer: str


def call_model(state: State, config: RunnableConfig):
    # Get the latest message
    latest_message = state["question"]
    
    # Otherwise, call the model with the current state
    context = retriever.invoke(latest_message)
    messages = prompt.invoke({"question": latest_message, "context": context})
    response = llm.invoke(messages, config)
    # Return the response
    return {"answer": response, "question": latest_message}

def build_graph(checkpointer: MemorySaver = None):
    # Graph
    builder = StateGraph(State)
    
    # Define nodes: these do the work
    builder.add_node("assistant", call_model)
    #builder.add_node("tools", ToolNode(self.tools))
    #builder.add_node("tools", self.tool_node)
    #builder.add_node("summarize_conversation", self.summarize_conversation)

    # Define edges: these determine the control flow
    builder.add_edge(START, "assistant")
    #builder.add_conditional_edges(
    #    "assistant",
    #    # If the latest message (result) from assistant is a tool call -> tools_condition routes to tools
    #    # If the latest message (result) from assistant is a not a tool call -> tools_condition routes to END
    #    self.should_continue,
    #    #tools_condition
    #    ["tools", "summarize_conversation", END]
    #)
    #builder.add_edge("tools", "assistant")
    builder.add_edge("assistant", END)
    
    #graph = builder.compile(checkpointer=self.mongodb_saver)
    graph = builder.compile(checkpointer=checkpointer)
    #graph = builder.compile(checkpointer=self.postgres_saver)
    #graph = builder.compile(checkpointer=checkpointer)
    return graph


In [86]:
graph = build_graph(MemorySaver())

In [87]:
config = {"configurable": {"thread_id": "1"}}
question = "What is Vanna?"
response = graph.invoke({"question": question}, config)

In [88]:
response

{'question': 'What is Vanna?',
 'answer': AIMessage(content='Vanna is an open-source Python framework for Retrieval-Augmented Generation (RAG), focused on SQL generation and related functionalities. It can connect to various databases, large language models (LLMs), and vector databases, and enables training a RAG model on data to generate SQL queries. It is MIT-licensed and designed for flexibility and portability.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 70, 'prompt_tokens': 1333, 'total_tokens': 1403, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-11-20', 'system_fingerprint': 'fp_ee1d74bde0', 'id': 'chatcmpl-BKnbt88LsemXCGPL6AhPOhTpOJHwH', 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {}}, id='run-0cc4c1c0-af0c-4f20-92b2-9cf

In [89]:
response["answer"].content

'Vanna is an open-source Python framework for Retrieval-Augmented Generation (RAG), focused on SQL generation and related functionalities. It can connect to various databases, large language models (LLMs), and vector databases, and enables training a RAG model on data to generate SQL queries. It is MIT-licensed and designed for flexibility and portability.'

### Chroma DB Load

In [17]:
from langchain_chroma import Chroma
from langchain_openai import AzureOpenAIEmbeddings

In [18]:
embedding_model = AzureOpenAIEmbeddings(
        api_version="2024-10-21",
        azure_deployment="text-embedding-3-small-1"
    )
vector_store = Chroma(
    collection_name="test-task-collection",
    embedding_function=embedding_model,
    persist_directory="./chroma_db"
)


In [19]:
query = "What is the purpose of the Vanna project?"
results = vector_store.similarity_search(query, k=5)

In [20]:
results

[Document(id='5a56c34c-a297-4390-97a8-23d9cfe2c6c7', metadata={'path': 'README.md', 'sha': 'dd252217715486451a6a90d5cce2af2cb83c5918', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/README.md'}, page_content='## Why Vanna?'),
 Document(id='a92a03b5-01a2-466c-a2bb-dbab28de73d6', metadata={'path': 'README.md', 'sha': 'dd252217715486451a6a90d5cce2af2cb83c5918', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/README.md'}, page_content="## Extending Vanna\nVanna is designed to connect to any database, LLM, and vector database. There's a [VannaBase](https://github.com/vanna-ai/vanna/blob/main/src/vanna/base/base.py) abstract base class that defines some basic functionality. The package provides implementations for use with OpenAI and ChromaDB. You can easily extend Vanna to use your own LLM or vector database. See the [documentation](https://vanna.ai/docs/) for more details.\n\n## Vanna in 100 Seconds\n\nhttps://github.com/vanna-ai/vanna/assets/7146154/eb90ee1e-aa05-47

### List Format

In [21]:
a = "[Document(id='61e6653a-2ee8-4dca-90f5-2e974cd1c70f', metadata={'path': 'src/vanna/chromadb/chromadb_vector.py', 'sha': '7fa682f48c2977a2e1440ffec11512182ecd1a1c', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/src/vanna/chromadb/chromadb_vector.py'}, page_content='if curr_client == \"persistent\":\\n            self.chroma_client = chromadb.PersistentClient(\\n                path=path, settings=Settings(anonymized_telemetry=False)\\n            )\\n        elif curr_client == \"in-memory\":\\n            self.chroma_client = chromadb.EphemeralClient(\\n                settings=Settings(anonymized_telemetry=False)\\n            )\\n        elif isinstance(curr_client, chromadb.api.client.Client):\\n            # allow providing client directly'), Document(id='e02d2657-d2e1-4f05-87e9-a3decc87ee86', metadata={'path': 'src/vanna/base/base.py', 'sha': '16c6469dcbb0f3a99d27de22a1c391e6456e297f', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/src/vanna/base/base.py'}, page_content='uses the OpenAI API to generate SQL and Plotly code. `vanna.chromadb_vector.ChromaDB_VectorStore` uses ChromaDB to store training data and generate embeddings.'), Document(id='f61a8a14-29c7-43e4-a6ec-ea580584c3bf', metadata={'path': 'src/vanna/chromadb/chromadb_vector.py', 'sha': '7fa682f48c2977a2e1440ffec11512182ecd1a1c', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/src/vanna/chromadb/chromadb_vector.py'}, page_content=')\\n        self.sql_collection = self.chroma_client.get_or_create_collection(\\n            name=\"sql\",\\n            embedding_function=self.embedding_function,\\n            metadata=collection_metadata,\\n        )'), Document(id='be28c938-3ef2-4c01-bfc3-d0f2377c7f76', metadata={'path': 'src/vanna/base/base.py', 'sha': '16c6469dcbb0f3a99d27de22a1c391e6456e297f', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/src/vanna/base/base.py'}, page_content='subgraph ChromaDB_VectorStore\\n        generate_embedding\\n        add_question_sql\\n        add_ddl\\n        add_documentation\\n        get_similar_question_sql\\n        get_related_ddl\\n        get_related_documentation\\n    end\\n```\\n\\n\"\"\"\\n\\nimport json\\nimport os\\nimport re\\nimport sqlite3\\nimport traceback\\nfrom abc import ABC, abstractmethod\\nfrom typing import List, Tuple, Union\\nfrom urllib.parse import urlparse'), Document(id='bf3bd06f-8a33-45c3-bdcc-bb435b6ec951', metadata={'path': 'src/vanna/chromadb/chromadb_vector.py', 'sha': '7fa682f48c2977a2e1440ffec11512182ecd1a1c', 'source': 'https://api.github.com/vanna-ai/vanna/blob/main/src/vanna/chromadb/chromadb_vector.py'}, page_content='self.documentation_collection = self.chroma_client.get_or_create_collection(\\n            name=\"documentation\",\\n            embedding_function=self.embedding_function,\\n            metadata=collection_metadata,\\n        )\\n        self.ddl_collection = self.chroma_client.get_or_create_collection(\\n            name=\"ddl\",\\n            embedding_function=self.embedding_function,\\n            metadata=collection_metadata,\\n        )')]"

In [24]:
import json
import ast

In [25]:
ast.literal_eval(a)

ValueError: malformed node or string on line 1: <ast.Call object at 0x169f05f10>