In [38]:
!pip install torch transformers beautifulsoup4 pdfplumber PyMuPDF langchain_core langchain_experimental langchain_community langchain_huggingface langchain_huggingface faiss-cpu pandas

Collecting spacy
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [65 lines of output]
      Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
      Collecting setuptools
        Using cached setuptools-80.7.1-py3-none-any.whl.metadata (6.6 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.11-cp313-cp313-win_amd64.whl.metadata (8.8 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.9.tar.gz (14 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): fi

In [13]:
import os
import logging
import re
import transformers
import torch
import bs4
import pdfplumber
import pymupdf
import fitz
import pandas as pd

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Empty content on page")

from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores.faiss import FAISS, DistanceStrategy
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.document_loaders import (
    PyMuPDFLoader,
    TextLoader,
    WebBaseLoader,
    CSVLoader,
    PDFPlumberLoader
)

In [None]:
# Hugging Face model identifiers.
LLM_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
EMBEDDINGS_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Data locations, relative to the working directory. Built with os.path.join so
# the separator matches the host OS (a literal backslash is not a separator on
# Linux/macOS).
DOCS_DIR = os.path.join("Data", "Datasets")
BBDD_VECTORES_DIR = os.path.join("Data", "BBDD")


In [20]:
class DataBase():
    """
        Vector store wrapper: builds a FAISS index from a folder of documents,
        or loads a previously saved one, and exposes similarity search.

        Params:
        -------
            docs_path: directory with the source documents. When truthy, the
                index is (re)built from it and saved to db_path; load_path is
                ignored in that case.
            load_path: directory of an existing FAISS index. Only used when
                docs_path is falsy.
            db_path: directory where a newly built index is saved.
            embedding_path: name or path of the embeddings model.
        Attributes:
        -----------
            embeddings: embeddings model used
            vectorstore: FAISS vector store used
        Methods:
        --------
            load_txt():
                Create Documents from a .txt file
            load_csv():
                Create one Document per row of a .csv file
            load_web():
                Create Documents from one or several urls
            load_tables():
                Create one Document per table found in a .pdf file
            load_pdf():
                Create one Document per page of a .pdf file
            load_documents():
                Create a list of Documents from a directory tree
            add_documents():
                Add the documents of a single file to the vector store
            search_documents():
                Search documents given a query
        Raises:
        -------
            ValueError: if docs_path and load_path are both falsy, or if
                docs_path contains no loadable document.
    """
    def __init__(self, docs_path=DOCS_DIR, load_path=False, db_path=BBDD_VECTORES_DIR, embedding_path=EMBEDDINGS_MODEL):
        print("Loading Embeddings")
        # NOTE(review): trust_remote_code allows the model repository to run its
        # own code while loading; only pass trusted values as embedding_path.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_path,
            model_kwargs={'trust_remote_code': True, 'device': 'cpu'},
            encode_kwargs={'batch_size': 2}
            )
        if docs_path:
            print("Loading Docs")
            docs = self.load_documents(docs_path)
            print(f"Cantidad de documentos: {len(docs)}")
            if not docs:
                # Fail early with a readable message instead of an IndexError
                # raised deep inside the FAISS index construction.
                raise ValueError(f"No .txt, .csv or .pdf documents could be loaded from {docs_path!r}")
            print("Splitting Docs")
            # Semantic chunking is currently disabled; documents are indexed whole.
            # splits = SemanticChunker(self.embeddings).split_documents(docs)
            print("Creating Vectors to FAISS DB")
            self.vectorstore = FAISS.from_documents(docs, self.embeddings, normalize_L2=True, distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
            print("Saving Vectors to FAISS DB")
            self.vectorstore.save_local(db_path)
        elif load_path:
            # allow_dangerous_deserialization unpickles the stored docstore:
            # only load indexes that this notebook (or a trusted party) wrote.
            # The search settings are not stored with the index, so they are
            # passed again to keep a loaded store consistent with a built one.
            self.vectorstore = FAISS.load_local(
                load_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
                normalize_L2=True,
                distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
            )
        else:
            raise ValueError("docs_path or load_path must be different of False")

    def load_txt(self, file):
        """Load a UTF-8 text file and return the Documents produced by TextLoader."""
        loader = TextLoader(file, encoding='utf-8')
        docs = loader.load()
        return docs

    def load_csv(self, file):
        """Return one Document per CSV row; the row is serialised as a JSON object."""
        df = pd.read_csv(file, encoding='utf-8')
        docs = []
        for idx, row in df.iterrows():
            # force_ascii=False keeps accented characters readable in the text.
            row_text = row.to_json(force_ascii=False)
            docs.append(Document(page_content=row_text, metadata={"source": file, "row": idx}))
        return docs

    def load_web(self, url):
        """
            Load one url (str) or several urls (iterable of str) and return the
            Documents built from the elements with the post-* CSS classes.
        """
        # `(url)` is just `url`: a bare string would be iterated character by
        # character by the loader, so wrap a single url in a real tuple.
        web_paths = (url,) if isinstance(url, str) else tuple(url)
        loader = WebBaseLoader(
            web_paths=web_paths,
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    class_=("post-content", "post-title", "post-header")
                )
            ),
        )
        docs = loader.load()
        return docs

    def load_tables(self, file):
        """
            Return one Document per table extracted from a PDF. The first row
            of each table is used as header and the rest as JSON records.
        """
        docs = []
        with pdfplumber.open(file) as pdf:
            for page_number, page in enumerate(pdf.pages):
                tables = page.extract_tables()
                for table in tables:
                    if not table:
                        # No header row to build the frame from.
                        continue
                    df = pd.DataFrame(table[1:], columns=table[0])
                    # Keep only the first column of each duplicated header.
                    df_unique = df.loc[:, ~df.columns.duplicated()]
                    json_text = df_unique.to_json(orient='records', lines=False)
                    # Undo the JSON escaping so the text reads naturally:
                    # literal "\n" -> newline, "\/" -> "/", "\uXXXX" -> character.
                    json_text = json_text.replace('\\n', '\n')
                    json_text = re.sub(r'\\/', '/', json_text)
                    json_text = json_text.encode('utf-8').decode('unicode_escape')
                    doc = Document(
                        page_content=json_text,
                        metadata={"source": file, "type": "table", "page": page_number}
                    )
                    docs.append(doc)
        return docs

    def load_pdf(self, file):
        """
            Return one Document per PDF page that has extractable text, in page
            order. A page that fails to extract is reported and skipped.
        """
        docs = []
        # The file is opened once and walked sequentially: results come back in
        # page order, the PDF is not re-parsed for every page, and a PDF with no
        # pages simply yields an empty list.
        with pdfplumber.open(file) as pdf:
            for page_num, page in enumerate(pdf.pages):
                try:
                    text = page.extract_text()
                except Exception as e:
                    print(f"Error processing page {page_num} in {file}: {e}")
                    continue
                if text:
                    docs.append(Document(page_content=text, metadata={"source": file, "page": page_num}))
        return docs

    def _load_file(self, file_path):
        """Load one file by extension (case-insensitive); return None if unsupported."""
        lowered = file_path.lower()
        if lowered.endswith('.txt'):
            return self.load_txt(file_path)
        if lowered.endswith('.csv'):
            return self.load_csv(file_path)
        if lowered.endswith('.pdf'):
            return self.load_pdf(file_path)
        return None

    def load_documents(self, path_directory):
        """Walk path_directory recursively and load every .txt, .csv and .pdf file."""
        docs = []
        for root, _, files in os.walk(path_directory):  # os.walk also visits every subfolder
            for file in files:
                print(file)
                loaded = self._load_file(os.path.join(root, file))
                if loaded:
                    docs.extend(loaded)
        return docs

    def add_documents(self, path_file):
        """
            Load a single .txt, .csv or .pdf file and add it to the vector store.
            The updated index is not saved to disk by this method.

            Raises:
                ValueError: if the file extension is not supported.
        """
        docs = self._load_file(path_file)
        if docs is None:
            raise ValueError(f"Unsupported file type: {path_file!r} (expected .txt, .csv or .pdf)")
        if docs:
            # Nothing to embed when the file produced no documents.
            self.vectorstore.add_documents(docs)

    def search_documents(self, query, num=4, score=False):
        """
            Return up to `num` Documents most similar to `query`.

            `score` is an optional score threshold forwarded to FAISS as
            `score_threshold`. Booleans (including the default False) and None
            mean "no threshold".
        """
        if score is None or isinstance(score, bool):
            return self.vectorstore.similarity_search(query, k=num)
        return self.vectorstore.similarity_search(query, k=num, score_threshold=score)

class RagChain():
    """
        Conversation chain that answers questions with retrieved context.

        Params:
        -------
            query: first question
            db: object exposing search_documents(query) -> documents with a
                page_content attribute (e.g. a DataBase)
            model: object exposing invoke(prompt) -> generated text
        Attributes:
        -----------
            query: list of questions asked so far
            context: list of contexts used, one per question
            prompt: prompt used in the last query
            answer: list of answers, one per question
        Methods:
        --------
            format_docs:
                Return a formatted string with the retrieved documents
            question:
                Ask another question, including up to the last three
                question/answer pairs in the prompt
    """
    def __init__(self, query, db, model):
        self.query = [query]
        self.context = [self.format_docs(db.search_documents(query))]
        self.prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
            Eres un asistente para preguntas y respuestas. Responde a la siguiente pregunta usando solo la información proporcionada en el contexto. Si la respuesta no está en el contexto, di 'No lo sé'.<|eot_id|>
            <|start_header_id|>user<|end_header_id|>
            Contexto: {self.context[0]}

            Pregunta: {self.query[0]}<|eot_id|>
            <|start_header_id|>assistant<|end_header_id|>"""
        self.answer = [self._generate(model)]

    def _generate(self, model):
        """Run self.prompt through the model and return only the generated answer."""
        raw_output = StrOutputParser().parse(model.invoke(self.prompt))
        # Keep the text after the last header marker, i.e. what follows the
        # final "assistant" header of the prompt.
        # NOTE(review): newlines are removed from the answer, which also
        # flattens any Markdown the model produces - confirm this is intended.
        return raw_output.split("<|end_header_id|>")[-1].replace("\n", "")

    def format_docs(self, docs):
        """Join the page_content of the documents with a visual separator."""
        return "\n\n--\n\n".join(doc.page_content for doc in docs)

    def question(self, query, db, model):
        """
            Ask a follow-up question. The last three question/answer pairs are
            replayed in the prompt, and the new question, its context and its
            answer are appended to the chain's history.
        """
        # zip pairs each past question with its own answer by position, so a
        # repeated question is not matched with the first occurrence's answer.
        prompt_answer = "".join(
            f"\n\n <|start_header_id|>user<|end_header_id|>Pregunta: {q}<|eot_id|> \n\n <|start_header_id|>assistant<|end_header_id|>Respuesta: {ans}<|eot_id|>"
            for q, ans in zip(self.query[-3:], self.answer[-3:])
        )
        context = self.format_docs(db.search_documents(query))
        self.prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
            Actúa como un asistente de generación de informes. Crea un resumen ejecutivo del siguiente contenido, usando títulos, subtítulos, y listas con viñetas. El resumen debe ser claro, bien estructurado, y fácil de leer. Formatea el texto utilizando Markdown para que sea fácil de renderizar en un documento final.<|eot_id|>
            {prompt_answer}
            <|start_header_id|>user<|end_header_id|>
            Contexto: {context}

            Pregunta: {query}<|eot_id|>
            <|start_header_id|>assistant<|end_header_id|>
            """
        answer = self._generate(model)
        # Record the turn only once everything succeeded, so query, context and
        # answer always have the same length even if retrieval or generation fails.
        self.query.append(query)
        self.context.append(context)
        self.answer.append(answer)

def read_excel(path_file):
    """
        Read the questions stored in the first column of an Excel sheet.

        Params:
            path_file: path of the Excel file, passed to pandas.read_excel
        Returns:
            list with the values of the first column, one per row (empty list
            when the sheet has no columns)
    """
    df = pd.read_excel(path_file)
    if df.shape[1] == 0:
        return []
    # .iloc selects the first column by position; indexing a row with [0]
    # relies on the deprecated positional fallback for label-indexed Series.
    return df.iloc[:, 0].tolist()

def write_answer(chains, path_file):
    """
        Write the questions, answers and contexts of several chains to Excel.

        Each chain contributes its query/answer/context lists as rows under the
        columns 'Pregunta', 'Respuesta' and 'Contexto'. Exact duplicate rows
        are dropped after every chain is merged in.
    """
    merged = pd.DataFrame()
    for rag_chain in chains:
        chain_rows = pd.DataFrame(
            {
                'Pregunta': rag_chain.query,
                'Respuesta': rag_chain.answer,
                'Contexto': rag_chain.context,
            }
        )
        stacked = pd.concat([merged, chain_rows], ignore_index=True)
        merged = stacked.drop_duplicates()
    merged.to_excel(path_file)


In [29]:
# Build the vector store. Because docs_path is set, DataBase rebuilds the index
# from DOCS_DIR; load_path is only read when docs_path is falsy.
print("Start Database")
db = DataBase(docs_path=DOCS_DIR, load_path=BBDD_VECTORES_DIR)
print("Database loaded")

Start Database
Loading Embeddings
Loading Docs
DataAnalyst.csv
Cantidad de documentos: 2253
Splitting Docs
Creating Vectors to FAISS DB
Saving Vectors to FAISS DB
Database loaded




In [30]:
# Candidate profiles used as search queries (one string per profile).
queries = [
    (
        "I am a data analyst with 3 years of experience looking for a full-time position in a dynamic company. "
        "I have a strong background in SQL, Python, and Excel, and have worked extensively with Power BI and Tableau to create dashboards and business reports. "
        "I’m particularly interested in roles that offer remote work or are based in New York or San Francisco. "
        "I value collaborative environments where I can contribute to data-driven decision making and continue learning. "
        "Ideally, I’m looking for a salary around $100,000 and opportunities for career growth."
    ),
]

In [31]:
# Show the top matches for every query.
for query in queries:
    print(f"\n🔍 Pregunta: {query}")
    # `num` controls how many results are retrieved per query.
    results = db.search_documents(query, num=5)
    for i, doc in enumerate(results, start=1):
        print(f"\n--- Resultado {i} ---", doc.page_content, sep="\n")



🔍 Pregunta: I am a data analyst with 3 years of experience looking for a full-time position in a dynamic company. I have a strong background in SQL, Python, and Excel, and have worked extensively with Power BI and Tableau to create dashboards and business reports. I’m particularly interested in roles that offer remote work or are based in New York or San Francisco. I value collaborative environments where I can contribute to data-driven decision making and continue learning. Ideally, I’m looking for a salary around $100,000 and opportunities for career growth.

--- Resultado 1 ---
{"Unnamed: 0":1396,"Job Title":"Direct Client Requirement - Data Reporting Analyst","Salary Estimate":"$41K-$86K (Glassdoor est.)","Job Description":"Our direct client which is a leading telecommunications and media company is looking for a Data Reporting Analyst in Dallas, TX. Please submit local candidates only. It's long term contract role, please share resume asap with Paulrealsoftinc.com. Local candidat

In [33]:
# Retrieve every document stored in the vector store.
# NOTE(review): `_dict` is a private attribute of the docstore - presumably the
# in-memory mapping of document id -> Document; confirm it exists in the
# installed langchain version before relying on it.
docs = db.vectorstore.docstore._dict.values()