In [1]:
!pip install langchain pypdf faiss-cpu sentence-transformers transformers torch
!pip install -U langchain-community
!pip install langchain_groq
!pip install gradio langchain pypdf faiss-cpu langchain_groq transformers torch

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1 pypdf-5.1.0
Collecting langchain-community
  Downloading langchain_community-0.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0

In [6]:
from transformers import AutoTokenizer, AutoModel
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
import torch
import faiss
import numpy as np
import os

class JinaEmbeddings:
    def __init__(self):
        """Initialize the Jina embeddings model"""
        self.model_name = "jinaai/jina-embeddings-v2-base-en"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def encode(self, texts):
        """
        Encode texts to embeddings

        Args:
            texts (str or list): Text or list of texts to encode

        Returns:
            numpy.ndarray: Embeddings
        """
        # Asegurar que el texto existe
        if isinstance(texts, str):
            texts = [texts]

        # Tokenizar y obtener las salidas del modelo
        encoded_input = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Mean pooling
        attention_mask = encoded_input['attention_mask']
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        return embeddings.cpu().numpy()

class MultiDocumentQA:
    def __init__(self, pdf_paths, groq_api_key):
        """
        Initialize the Multi-Document QA system

        Args:
            pdf_paths (list): List of paths to PDF documents
            groq_api_key (str): Groq API key
        """
        self.pdf_paths = pdf_paths
        self.vector_stores = {}
        self.document_names = []
        self.jina_embeddings = JinaEmbeddings()

        # Inicializar el Groq LLM
        os.environ["GROQ_API_KEY"] = groq_api_key
        self.llm = ChatGroq(
            model_name="mixtral-8x7b-32768",
            temperature=0.3,
            max_tokens=1000
        )

        # Crear un prompt de clasificación de documento
        self.classification_prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert document classifier and relevance evaluator.
            Your task is to determine which document is MOST LIKELY to contain the answer to the given question.

            Evaluation Criteria:
            1. Direct relevance of the document to the specific question
            2. Likelihood of the document containing the requested information
            3. Depth and comprehensiveness of potential answer

            Available Documents: {document_details}

            Question: {question}

            Instructions:
            - Carefully analyze the context of each document
            - Consider the specificity and scope of the question
            - Provide a detailed explanation of your reasoning
            - Finally, output the MOST RELEVANT document name in the format:
              DOCUMENT: [chosen document name]

            Your nuanced reasoning is crucial."""),
            ("human", "{question}")
        ])

        #  Prompt de QA
        self.qa_prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a helpful assistant that answers questions based on the provided context.
            Your answers should be:
            1. Accurate and based solely on the provided context
            2. Comprehensive yet concise
            3. Well-structured and easy to understand
            If the context doesn't contain enough information to answer the question, say so.

            Context: {context}"""),
            ("human", "{question}")
        ])

    def load_and_split_documents(self):
        """Load PDFs and split into chunks for each document"""
        self.document_chunks = {}
        self.document_names = []

        for pdf_path in self.pdf_paths:
            # Obtener el nombre del documento desde el path
            doc_name = os.path.splitext(os.path.basename(pdf_path))[0]
            self.document_names.append(doc_name)

            # Cargar el PDF
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()

            # Dividir el documento en chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                separators=["\n\n", "\n", " ", ""]
            )
            chunks = text_splitter.split_documents(documents)

            self.document_chunks[doc_name] = chunks

        return self.document_chunks

    def create_vector_stores(self):
        """Create FAISS vector stores for each document"""
        self.vector_stores = {}

        for doc_name, chunks in self.document_chunks.items():
            # Extraer el texto desde los chunks
            texts = [doc.page_content for doc in chunks]

            # Generar los embeddings
            embeddings = self.jina_embeddings.encode(texts)

            # Crear el indice FAISS
            dimension = embeddings.shape[1]
            index = faiss.IndexFlatL2(dimension)
            index.add(embeddings)

            # Guardar el vector para este documento
            self.vector_stores[doc_name] = {
                'index': index,
                'documents': chunks,
                'embeddings': embeddings
            }

        return self.vector_stores

    def classify_document(self, question):
        """
        Classify which document is most likely to contain the answer

        Args:
            question (str): Question to classify

        Returns:
            str: Name of the most appropriate document
        """
        # Preparar los detalles del documento con algo de contexto
        document_details = []
        for doc_name, chunks in self.document_chunks.items():
            # Toma algunos chunks para proveer context acerca de cada documento
            sample_text = " ".join([chunk.page_content[:200] for chunk in chunks[:3]])
            document_details.append(f"{doc_name}: {sample_text}")

        # Dar formato a el prompt de clasificación
        formatted_prompt = self.classification_prompt.format_messages(
            document_details="\n".join(document_details),
            question=question
        )

        # Generar la clasificación usando Groq
        response = self.llm.invoke(formatted_prompt)
        response_text = response.content.strip()

        # Extraer el nombre del documento
        import re
        match = re.search(r'DOCUMENT:\s*([^\n]+)', response_text, re.IGNORECASE)

        if match:
            classified_doc = match.group(1).strip()

            # Validar la clasificación
            if classified_doc not in self.document_names:
                # Volver al primer documento si falla la clasificación
                classified_doc = self.document_names[0]
        else:
            # Volver al primer documento si no se encuentra ninguna coincidencia
            classified_doc = self.document_names[0]

        return classified_doc

    def retrieve_similar_chunks(self, query, doc_name, k=3):
        """
        Retrieve similar chunks for a query from a specific document

        Args:
            query (str): Query text
            doc_name (str): Name of the document to search
            k (int): Number of chunks to retrieve

        Returns:
            list: Similar document chunks
        """
        # Generar la query embeddings
        query_embedding = self.jina_embeddings.encode(query)

        # Buscar vectores similares
        vector_store = self.vector_stores[doc_name]
        distances, indices = vector_store['index'].search(query_embedding, k)

        # Devolver los documentos correspondientes
        similar_docs = [vector_store['documents'][i] for i in indices[0]]
        return similar_docs

    def generate_answer(self, question, context_docs):
        """
        Generate answer using Groq LLM

        Args:
            question (str): Question to answer
            context_docs (list): List of relevant document chunks

        Returns:
            str: Generated answer
        """
        # Combina los contextos de los documentos
        context = "\n".join([doc.page_content for doc in context_docs])

        # Dar formato a el prompt con contexto y pregunta
        formatted_prompt = self.qa_prompt.format_messages(
            context=context,
            question=question
        )

        # Generar la respuesta usadno Groq
        response = self.llm.invoke(formatted_prompt)
        return response.content

    def initialize_system(self):
        """Initialize the complete system"""
        print("Loading and splitting documents...")
        self.load_and_split_documents()
        print("Creating vector stores...")
        self.create_vector_stores()
        print("System initialized!")

    def ask_question(self, question):
        """
        Ask a question to the system

        Args:
            question (str): Question to ask

        Returns:
            dict: Contains answer, source documents, and source document name
        """
        if not self.vector_stores:
            raise ValueError("System not initialized. Call initialize_system() first.")

        # Clasificar cual documento usar
        classified_doc = self.classify_document(question)
        print(f"Using document: {classified_doc}")

        # Recuperar los chunks relevantes del documento clasificado
        similar_docs = self.retrieve_similar_chunks(question, classified_doc)

        # Generar la respuesta usando Groq
        answer = self.generate_answer(question, similar_docs)

        return {
            "answer": answer,
            "sources": [doc.page_content for doc in similar_docs],
            "source_document": classified_doc
        }

def main():
    # Inicialiar el sistema con varios PDFs
    pdf_paths = [
        "/content/sample_data/andres_malvestiti_resume_2024.pdf",
        "/content/sample_data/carlos_villalobos_resume_2024.pdf"
    ]
    groq_api_key = "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"

    qa_system = MultiDocumentQA(pdf_paths, groq_api_key)
    qa_system.initialize_system()

    # Pregunta (s)
    questions = [
        "What is Carlos' email?"
    ]

    # Obtener respuestas
    for question in questions:
        print(f"\nQuestion: {question}")
        print("Generating answer...")
        result = qa_system.ask_question(question)
        print(f"Answer: {result['answer']}")
        print(f"Source Document: {result['source_document']}")
        print("\nSources:")
        for i, source in enumerate(result['sources'], 1):
            print(f"Source {i}: {source[:200]}...")

if __name__ == "__main__":
    main()

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.intermedi

Loading and splitting documents...
Creating vector stores...
System initialized!

Question: What is Carlos' email?
Generating answer...
Using document: carlos_villalobos_resume_2024
Answer: Carlos' email address is carvil@gmail.com.
Source Document: carlos_villalobos_resume_2024

Sources:
Source 1: • Collaborated with cross-disciplinary teams for system upgrades and installations.
Frequency Spectrum Monitoring Specialist
Telecom Regulatory Body | [City, Country] | [Dates]
• Utilized advanced fre...
Source 2: Carlos VillalobosSoftware Engineer
Email: carvil@gmail.com | Location: Buenos Aires, Argentina | Linkedin: www.linkedin.com/in/ 
carvil| Github: github.com/orgs/carlos
WORK EXPERIENCE
Technical Soluti...
Source 3: • Collaborated with cross-disciplinary teams for system upgrades and installations.
Frequency Spectrum Monitoring Specialist
Telecom Regulatory Body | [City, Country] | [Dates]
• Utilized advanced fre...
