In [None]:
# Use %pip (not !pip) so the packages are installed into the running kernel's environment.
# pypdf is required by PyPDFLoader; python-dotenv provides the `dotenv` module imported later.
# TODO: pin exact versions once a known-good combination is established, for reproducible runs.
%pip install -q langchain-community langchain_google_genai langchain_chroma pypdf python-dotenv

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Folder (relative path) that is scanned for the source PDF files.
FOLDER_PATH = "content"

def load_pdf(path):
    """Load a single PDF file.

    Args:
        path: Location of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` produces for that file.
    """
    return PyPDFLoader(path).load()

def load_all_pdfs_in_folder(folder_path):
    """Load every PDF file found directly inside ``folder_path``.

    Files are matched by a case-insensitive ``.pdf`` extension and processed in
    sorted order so repeated runs yield the documents in the same sequence.
    Sub-directories are not traversed, and directory entries whose name happens
    to end in ``.pdf`` are skipped.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        A flat list with the documents returned by ``load_pdf`` for each file,
        concatenated in processing order (empty if no PDF is found).

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from ``os.listdir``).
    """
    # Local import: keeps the function usable regardless of which notebook cell
    # (if any) has already imported ``os``.
    import os

    pdf_paths = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
    )

    all_documents = []
    for pdf_path in pdf_paths:
        if not os.path.isfile(pdf_path):
            # A directory named like "something.pdf" is not a loadable document.
            continue
        print(f"Loading PDF: {pdf_path}")
        all_documents.extend(load_pdf(pdf_path))

    return all_documents

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Shared splitter used by split_text: chunk_size=2000 with chunk_overlap=200 between
# consecutive chunks (sizes are presumably measured in characters, the splitter's
# default length function — confirm against the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

def split_text(data):
    """Split loaded documents into smaller chunks.

    Args:
        data: Documents to split, passed straight to the module-level
            ``text_splitter``.

    Returns:
        The result of ``text_splitter.split_documents(data)``.
    """
    return text_splitter.split_documents(data)

In [5]:
import os

# Secrets must never be stored in the notebook source. Use the key already present in
# the environment, or prompt for it interactively (getpass does not echo the input).
# NOTE(review): any key that was previously committed in this cell should be treated
# as compromised and revoked.
from getpass import getpass

api_key = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API key: ")

# Export it so later cells can read it via os.getenv("GEMINI_API_KEY").
os.environ["GEMINI_API_KEY"] = api_key

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

# Read the Gemini API key from the environment and fail fast when it is missing.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    raise ValueError("GEMINI_API_KEY is not set. Please set it as an environment variable.")

# Embedding model used when building and when querying the vector store.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key,
)

In [None]:
# Use the dedicated langchain_chroma package (installed at the top of the notebook)
# rather than the deprecated langchain.vectorstores re-export.
from langchain_chroma import Chroma
import os

PERSISTENT_DIRECTORY = "chroma"

# The persistent directory doubles as a cache marker: if it already exists the vector
# store is assumed to have been built by a previous run and is simply reopened.
if not os.path.exists(PERSISTENT_DIRECTORY):
    # First run: load every PDF, split it into chunks, embed them and persist the result.
    data = load_all_pdfs_in_folder(FOLDER_PATH)
    docs = split_text(data)
    if not docs:
        # Fail with a clear message before anything is embedded or persisted.
        raise ValueError(f"No PDF content found in '{FOLDER_PATH}'; nothing to index.")
    # Report sizes only; printing every chunk floods the cell output.
    print(f"Loaded {len(data)} pages, split into {len(docs)} chunks")
    vectorstoredb = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=PERSISTENT_DIRECTORY,
    )
else:
    # Subsequent runs: reopen the persisted store with the same embedding function.
    vectorstoredb = Chroma(persist_directory=PERSISTENT_DIRECTORY, embedding_function=embeddings)

# Set up the retriever for similarity search (retrieving the top 5 most similar documents)
retriever = vectorstoredb.as_retriever(search_type="similarity", search_kwargs={"k": 5})


In [None]:
# Smoke-test the retriever with a sample query.
retrieved_docs = retriever.invoke("array bidimensional")
print(len(retrieved_docs))
if retrieved_docs:
    print(retrieved_docs[0].page_content)  # Print the first retrieved document
else:
    # Indexing [0] on an empty result would raise IndexError; report it instead.
    print("No documents were retrieved for this query.")

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that generates the final answers; temperature=0 is the least random setting.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    api_key=api_key,
)

In [10]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Define a system prompt. {context} is a template placeholder for the retrieved passages.
# The string opens with exactly three quotes: a fourth one would become a stray
# apostrophe at the very start of the prompt text.
system_prompt = (
   '''
You are a knowledgeable AI tutor, dedicated to answering questions in a clear and thorough manner.
Your goal is to break down complex concepts into simple, easy-to-understand terms, making them suitable for a non-technical audience.
Maintain a warm, conversational tone, guiding the student step by step.

Your responses must be based exclusively on the content from the passage and the examples included in it.
If the passage does not address the question, kindly explain that the answer is not available in the provided material.

Respond in Spanish, ensuring that the explanation is simple and easy to follow.
The topic is C# programming, so focus on simplifying and clarifying relevant concepts.

At the end of your answer, include a reference to the source (document name) and the pages that the passage was taken from.
Do not add any additional information.

---
    {context}'''
)

# Chat prompt for the QA chain: the system message carries the instructions and the
# {context} placeholder, the human message carries the user's question as {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Create the RAG chain
from langchain_core.prompts import PromptTemplate

# The system prompt asks the model to cite the source document and pages, but the default
# document prompt forwards only page_content. Render each retrieved chunk together with
# its metadata so that information actually reaches the model.
# NOTE(review): assumes every stored chunk has `source` and `page` metadata (as set by
# PyPDFLoader, where `page` is presumably 0-based) — confirm for pre-existing stores.
document_prompt = PromptTemplate.from_template(
    "Source: {source} | Page: {page}\n{page_content}"
)

chain = create_stuff_documents_chain(llm, prompt, document_prompt=document_prompt)
rag_chain = create_retrieval_chain(retriever, chain)

In [None]:
# Run one question through the RAG chain and show only the generated answer.
question = "qué es hola mundo"
response = rag_chain.invoke({"input": question})
print(response["answer"])