In [1]:
import os
from os import getenv
from pathlib import Path

from tqdm import tqdm
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_path(prev_folders: int = 0) -> str:
    """
    Return the current working directory, or one of its ancestors, as a string.

    The lookup starts from ``Path.cwd()`` (the process working directory), not
    from the location of the file that defines this function.

    Args:
        prev_folders (int): How many levels to climb above the working
            directory. Defaults to 0 (the working directory itself). Values
            of zero or less leave the directory unchanged.

    Returns:
        str: The selected directory as text, always terminated by a '/'.
    """
    directory = Path.cwd()

    # Climb one parent per requested level; range() yields nothing for
    # prev_folders <= 0, so those values keep the working directory.
    for _level in range(prev_folders):
        directory = directory.parent

    # Guarantee a trailing slash so file names can be appended directly.
    directory_str = str(directory)
    return directory_str if directory_str.endswith('/') else directory_str + '/'

In [3]:
# Project root: one level above the directory the notebook is run from.
PATH = get_path(prev_folders=1)
# API key taken from the environment; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Read data

In [4]:
# Read pdf
# The context manager closes the file handle even if parsing or text
# extraction raises; the previous manual open()/close() pair leaked the
# handle on any error raised before close() was reached.
with open(PATH + 'data/constitucion_colombia_1991.pdf', 'rb') as pdf_file_obj:
    pdf_reader = PdfReader(pdf_file_obj)

    # Extraction stays inside the block so pages are read while the handle is
    # still open. Pages are concatenated with no separator between them.
    text = "".join(page.extract_text() for page in pdf_reader.pages)

# Only `text` is needed downstream; drop the reader objects from the namespace.
del pdf_file_obj, pdf_reader

# Preprocessing

## Chunks

In [5]:
# Split the extracted text into overlapping chunks.
# Lengths are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(length_function=len, chunk_size=500, chunk_overlap=30)

chunks = text_splitter.split_text(text=text)
# len(chunks)
# chunks[13]

## Embeddings

In [6]:
# Embeddings model creation
# Candidate multilingual sentence-transformers checkpoints. The trailing sizes
# are the author's notes. MINILM_L12_V2 is not used in the visible cells.
MINILM_L12_V2 = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'  # 471M
MPNET_BASE_V2 = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'  # 1.11G

# LangChain embeddings wrapper; this is the object later passed to FAISS.from_texts.
embeddings = HuggingFaceEmbeddings(model_name=MPNET_BASE_V2)
# NOTE(review): `model` is built from the same checkpoint name as `embeddings`
# and is only referenced by the commented-out encode() example. It presumably
# loads the checkpoint a second time; confirm it is still needed.
model = SentenceTransformer(MPNET_BASE_V2)

In [7]:
# # Embeddings example
# sentence_embeddings = model.encode('Esta es una prueba de embeddings')
# sentence_embeddings

In [8]:
# Build the FAISS vector index over the text chunks using the embeddings wrapper.
knowledge_base = FAISS.from_texts(texts=chunks, embedding=embeddings)

In [None]:
# # Similarity search example
# question = '¿Puedo ser obligado a declarar contra mí mismo?'
# # Search for similar paragraphs
# docs = knowledge_base.similarity_search(question, 5)

## LangChain

In [None]:
# Chat model used to answer questions.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo-0125',  # gpt-3.5-turbo
)
# Question-answering chain created with chain_type='stuff'.
chain = load_qa_chain(llm=llm, chain_type='stuff')

In [10]:
question = '¿Puedo ser obligado a declarar contra mí mismo?'

# Retrieve the 2 chunks most similar to the question
docs = knowledge_base.similarity_search(query=question, k=2)

# Pass the retrieved chunks to ChatGPT as context for the question
answer = chain.run(question=question, input_documents=docs)
print(f'Respuesta ChatGPT:\n\t {answer}')

  warn_deprecated(


Respuesta ChatGPT:
	 No, según la información proporcionada, ninguna persona puede ser obligada a declarar contra sí misma.


In [11]:
# Calculate cost
# NOTE: this runs the chain again with the same question and documents, so the
# usage reported here belongs to this second request, not to the answer
# printed in the previous cell.
with get_openai_callback() as cb:
    response = chain.run(question=question, input_documents=docs)
    # Printing the callback shows token counts, request count and total cost.
    print(cb)

Tokens Used: 342
	Prompt Tokens: 295
	Completion Tokens: 47
Successful Requests: 1
Total Cost (USD): $0.0005365


# Bibliografía

1. [Tutorial base](https://www.youtube.com/watch?v=iDrpdkIHMq8&t=897s&ab_channel=Nechu)
2. [Secondary base](https://www.analyticsvidhya.com/blog/2023/07/creating-a-chatbot-with-falconai-langchain-and-chainlit/)
3. [Notebook base](https://colab.research.google.com/drive/1sLPDZJMBzMFF6s1scFcj5P3Rr2QPBX51?usp=sharing#scrollTo=6b0eb821)
4. [PEP 484 – Type Hints](https://peps.python.org/pep-0484/)
5. [PEP 257 – Docstring Conventions](https://peps.python.org/pep-0257/)