In [14]:
import getpass
import os
from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    TextLoader,
    NotebookLoader,
    PyPDFLoader
)
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [15]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings


In [16]:
# Never store the API key in the notebook: it ends up in version control and in
# every shared copy. Use the key already present in the environment, and only
# prompt for it (without echoing) when it is missing.
# NOTE: the key that used to be hardcoded in this cell must be treated as
# leaked and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Google API key: ")


In [17]:
# --- Ingestion settings ---
# Folder that is scanned for the files to index
SOURCE_CODE_DIRECTORY = "all_my_code"
# On-disk location of the local vector database
PERSIST_DIRECTORY = "chroma_db"
# Chunking parameters handed to the text splitter
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 100

In [18]:

def load_documents(directory):
    """Load every supported file found under ``directory``.

    The tree is walked recursively and each regular file is dispatched on its
    suffix (compared case-insensitively, so ``.PDF`` is treated like ``.pdf``):

    * ``.pdf``   -> ``PyPDFLoader``
    * ``.py``    -> ``TextLoader`` (decoded as UTF-8)
    * ``.ipynb`` -> ``NotebookLoader`` (cell outputs are not included)

    Files with any other suffix are skipped. Paths are visited in sorted
    order so that the same set of files always yields the same document order.

    Args:
        directory: Root folder to scan (``str`` or ``Path``).

    Returns:
        list: The documents produced by the loaders, in visit order.
    """
    documents = []
    for item_path in sorted(Path(directory).rglob('*')):
        if not item_path.is_file():
            continue

        suffix = item_path.suffix.lower()
        if suffix == '.pdf':
            print(f"Loading PDF: {item_path.name}")
            loader = PyPDFLoader(str(item_path))
        elif suffix == '.py':
            loader = TextLoader(str(item_path), encoding="utf-8")
        elif suffix == '.ipynb':
            # NotebookLoader handles the JSON structure of .ipynb files
            loader = NotebookLoader(
                str(item_path),
                include_outputs=False,  # Don't include cell outputs
                max_output_length=20,
                remove_newline=True,
            )
        else:
            # Unsupported file type: nothing to load.
            continue

        documents.extend(loader.load())
    return documents


def chunk_documents(documents):
    """
    Split ``documents`` into overlapping chunks with a Python-aware splitter.

    The splitter comes from ``RecursiveCharacterTextSplitter.from_language``
    for ``"python"`` and is sized by the module-level ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` settings. Returns whatever ``split_documents`` produces
    for the given documents.
    """
    splitter_settings = {"chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP}
    code_splitter = RecursiveCharacterTextSplitter.from_language(
        language="python", **splitter_settings
    )
    return code_splitter.split_documents(documents)

In [6]:
# Load every supported file found under the configured source directory
documents = load_documents(directory=SOURCE_CODE_DIRECTORY)

In [7]:
# Split the loaded documents into chunks for embedding
chunks = chunk_documents(documents=documents)

In [8]:
# Google embedding model used to vectorize the chunks
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
)

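Gemini API key: not stored in this notebook. Provide it through the `GOOGLE_API_KEY` environment variable. (The key previously pasted here was exposed and should be revoked.)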

In [9]:
def _stable_chunk_ids(chunks):
    """Return one deterministic id per chunk: ``"<source>::<n>"``.

    ``<source>`` is the chunk's ``metadata["source"]`` (empty string when
    absent) and ``<n>`` is the chunk's position among the chunks that share
    that source, so ids are unique within one call and identical across runs
    as long as the files are chunked the same way.
    """
    seen_per_source = {}
    ids = []
    for chunk in chunks:
        source = str(chunk.metadata.get("source", ""))
        position = seen_per_source.get(source, 0)
        seen_per_source[source] = position + 1
        ids.append(f"{source}::{position}")
    return ids


# Build (or refresh) the persisted Chroma index.
# Without explicit ids every run of this cell appends a fresh copy of all
# chunks to the collection in PERSIST_DIRECTORY, so retrieval starts returning
# the same chunk several times. With stable ids a re-run overwrites the
# existing entries instead.
# NOTE: entries written by earlier runs (random ids) and chunks of files that
# have since shrunk or been removed are not cleaned up; delete
# PERSIST_DIRECTORY once to start from a clean index.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=_stable_chunk_ids(chunks),
    persist_directory=PERSIST_DIRECTORY,
)

In [10]:
# ## CHANGED ##: Import Google's chat and embedding classes
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# --- 1. DEFINE CONSTANTS & INITIALIZE ---
# Gemini chat model used for answer generation
LLM_MODEL = "gemini-1.5-flash-latest"
# Directory of the persisted Chroma database that is queried below
PERSIST_DIRECTORY = "chroma_db"

# --- 2. SETUP THE RAG CHAIN ---

# Reopen the persisted vector store with Google's embedding function. The
# embedding model must be the one the index was built with, otherwise query
# vectors and stored vectors are not comparable.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
vectorstore = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)

# Initialize the Gemini LLM.
# `convert_system_message_to_human` is deliberately not set: it is deprecated
# in langchain-google-genai, and it only matters for prompts that contain a
# system message. The prompt used in this notebook is built with
# `ChatPromptTemplate.from_template`, which is expected to produce no system
# message.
llm = ChatGoogleGenerativeAI(model=LLM_MODEL, temperature=0.1)

# Create a retriever that fetches the 3 most relevant chunks per question
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Prompt for the answer-generation step. It has two placeholders:
#   {context} - presumably filled with the retrieved documents by the
#               documents chain built from this prompt (TODO confirm)
#   {input}   - the question; matches the "input" key passed to the chain
# The instructions ask the model to answer only from the context and to admit
# when the context does not contain the answer.
# NOTE(review): the template text contains a typo ("researcg") and a double
# space ("lab  mentor"). It is runtime text sent to the model, so it is left
# unchanged here; fix it deliberately if desired.
prompt_template = """
You are an expert researcg assistant and lab  mentor.
Your task is to answer questions about the lab's database which is either code or other paper related information.
Use the following retrieved context to answer the question.
If you don't know the answer from the context, just say that you don't know.
Be concise and provide code snippets from the context if they are relevant.

CONTEXT:
{context}

QUESTION:
{input}

ANSWER:
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# Answer-generation step: receives the question plus the retrieved documents
# and produces the answer with the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Complete RAG pipeline: the retriever runs first, then its results are handed
# to the answer-generation step.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)


  vectorstore = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)


In [12]:
# Run the RAG chain on a sample question; the question goes in under "input"
response = rag_chain.invoke(
    {"input": "i want to make an optimizer, please send me the code"}
)



In [13]:
# Display the whole response dict; as the output shows, it echoes the 'input'
# and carries the retrieved 'context' documents.
response

{'input': 'i want to make an optimizer, please send me the code',
 'context': [Document(metadata={'source': 'all_my_code\\main_integrated.py'}, page_content='from sklearn.metrics import r2_score\nfrom skopt.learning import GaussianProcessRegressor\nfrom skopt.learning.gaussian_process.kernels import ConstantKernel, Matern\nfrom sklearn.neighbors import KNeighborsRegressor\nimport random\nimport math\nimport numpy as np\nnp.random.seed(237)\nimport matplotlib.pyplot as plt\nfrom skopt.plots import plot_gaussian_process\nfrom skopt.optimizer import Optimizer \nimport pandas as pd'),
  Document(metadata={'source': 'all_my_code\\main_integrated.py'}, page_content='from sklearn.metrics import r2_score\nfrom skopt.learning import GaussianProcessRegressor\nfrom skopt.learning.gaussian_process.kernels import ConstantKernel, Matern\nfrom sklearn.neighbors import KNeighborsRegressor\nimport random\nimport math\nimport numpy as np\nnp.random.seed(237)\nimport matplotlib.pyplot as plt\nfrom skopt.