In [None]:
print("OK")

# Q&A over the Code Base to Understand How it Works

In [1]:
from git import Repo
import os

from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

In [5]:
%pwd

'/home/cefalo/Documents/PERSONAL/End-to-end-Source-Code-Analysis-Generative-AI-main/research'

In [2]:
!mkdir test_repo

In [3]:
repo_path = "test_repo/"
repo = Repo.clone_from("https://github.com/entbappy/End-to-end-Medical-Chatbot-Generative-AI", to_path=repo_path)

In [9]:
loader = GenericLoader.from_filesystem(repo_path,
                                       glob = "**/*",
                                       suffixes=[".py"],
                                       parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)
)

In [10]:
documents = loader.load()

In [11]:
documents

[Document(metadata={'source': 'test_repo/setup.py', 'language': <Language.PYTHON: 'python'>}, page_content="from setuptools import find_packages, setup\n\nsetup(\n    name = 'Generative AI Project',\n    version= '0.0.0',\n    author= 'Bappy Ahmed',\n    author_email= 'entbappy73@gmail.com',\n    packages= find_packages(),\n    install_requires = []\n\n)"),
 Document(metadata={'source': 'test_repo/template.py', 'language': <Language.PYTHON: 'python'>}, page_content='import os\nfrom pathlib import Path\nimport logging\n\nlogging.basicConfig(level=logging.INFO, format=\'[%(asctime)s]: %(message)s:\')\n\n\nlist_of_files = [\n    "src/__init__.py",\n    "src/helper.py",\n    "src/prompt.py",\n    ".env",\n    "setup.py",\n    "app.py",\n    "research/trials.ipynb",\n   " test.py"\n]\n\n\nfor filepath in list_of_files:\n    filepath = Path(filepath)\n    filedir, filename = os.path.split(filepath)\n\n\n    if filedir !="":\n        os.makedirs(filedir, exist_ok=True)\n        logging.info(f

In [12]:
len(documents[0].page_content)

238

In [13]:
documents[0]

Document(metadata={'source': 'test_repo/setup.py', 'language': <Language.PYTHON: 'python'>}, page_content="from setuptools import find_packages, setup\n\nsetup(\n    name = 'Generative AI Project',\n    version= '0.0.0',\n    author= 'Bappy Ahmed',\n    author_email= 'entbappy73@gmail.com',\n    packages= find_packages(),\n    install_requires = []\n\n)")

In [14]:
documents_splitter = RecursiveCharacterTextSplitter.from_language(language = Language.PYTHON,
                                                             chunk_size = 500,
                                                             chunk_overlap = 20)

In [15]:
texts = documents_splitter.split_documents(documents)

In [16]:
len(texts[0].page_content)

238

In [17]:
len(texts)

13

In [18]:
from dotenv import load_dotenv
load_dotenv() 

# Embedding models: https://python.langchain.com/v0.1/docs/integrations/text_embedding/
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")

In [19]:
os.environ["GEMINI_API_KEY"] = GOOGLE_API_KEY

In [20]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# embeddings=OpenAIEmbeddings(disallowed_special=())
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)

In [22]:
from langchain_chroma import Chroma

vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory='./db')

In [20]:
# vectordb.persist()

In [23]:
from langchain_google_genai import ChatGoogleGenerativeAI

# llm = ChatOpenAI(model_name="gpt-4")
# llm = ChatOpenAI()
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",temperature=0.3, max_tokens=500, google_api_key=GOOGLE_API_KEY)

In [24]:
memory = ConversationSummaryMemory(llm=llm, memory_key = "chat_history", return_messages=True)

In [25]:
qa = ConversationalRetrievalChain.from_llm(llm, retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k":8}), memory=memory)

In [26]:
question = "give me a method signature which is type annotated, if not found then pick any random one and add type annotation"

In [27]:
result = qa(question)
print(result['answer'])

  result = qa(question)
Number of requested results 20 is greater than number of elements in index 13, updating n_results = 13


```python
from typing import List

def text_split(extracted_data: str) -> List[str]:
    """Splits a text into chunks.

    Args:
        extracted_data: The text to split.

    Returns:
        A list of text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks
``` 



In [28]:
question = "add proper type annotation with small reasoning description about why and which type hints is added"
result = qa(question)
print(result['answer'])

Number of requested results 20 is greater than number of elements in index 13, updating n_results = 13


```python
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from typing import List, Union

def text_split(extracted_data: Union[List[dict], List[str]]) -> List[dict]:
    """Splits text into chunks using RecursiveCharacterTextSplitter.

    Args:
        extracted_data: A list of dictionaries or strings representing the extracted text data.

    Returns:
        A list of dictionaries representing the text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks
```

**Explanation of Type Annotations:**

* **`extracted_data: Union[List[dict], List[str]]`**: This annotation specifies that the `extracted_data` argument can be either a list of dictionaries or a list of strings. This is because the `load_pdf_file` function