## Code format

In [1]:
# Code format
# For Jupyter Notebook:
%load_ext nb_black
# For Jupyter Lab:
# %load_ext lab_black

<IPython.core.display.Javascript object>

## Libraries

In [2]:
# Converting documents from one format to another
import pypandoc

# General purpose libraries
import os
import time
import sys

# Data management tools
# Serialization and deserialization of Python objects
import pickle
import json

<IPython.core.display.Javascript object>

In [12]:
# Tools for interacting with the OpenAI API
import openai
# Tools for working with various pre-trained language models
# Includes tokenizers and other utilities for text processing
from transformers import GPT2Tokenizer

<IPython.core.display.Javascript object>

In [113]:
# Libraries and classes to load and parse different types of text data
from langchain.document_loaders import (
    UnstructuredPDFLoader,
    OnlinePDFLoader,
    UnstructuredFileLoader,
    TextLoader,
    PyPDFLoader,
)

# Classes for splitting text into characters and recursively splitting text into characters
# Tools for splitting text into smaller chunks for further processing
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Tools for working with OpenAI's GPT-3 language model
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

# Tools for working with vector databases, including authentication with the Pinecone and OpenAI APIs
# from langchain.vectorstores import Chroma, Pinecone
# from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores import (
    Chroma,
    Pinecone,
    ElasticVectorSearch,
    Weaviate,
    FAISS,
)

# Wrapper around OpenAI's API and provides tools for interacting with OpenAI's GPT-3 language model
from langchain.llms import OpenAI

# Tools for building and running natural language processing (NLP) chains
# Class for building a retrieval-based question answering (QA) system and chatbots that interact with vector databases
from langchain.chains import (
    RetrievalQA,
    ChatVectorDBChain,
    ConversationalRetrievalChain,
)

# Generating text prompts
from langchain.prompts.prompt import PromptTemplate

# Load question answering chain
from langchain.chains.question_answering import load_qa_chain

# Class to create indexes in vector databases
from langchain.indexes import VectorstoreIndexCreator

<IPython.core.display.Javascript object>

## Functions

In [76]:
# Function to save an object into a pickle file.
def f_pklsave(arg_obj, arg_path: str):
    """
    Serialize and save the given object to a pickle file at the specified path.

    Parameters:
    arg_obj (object): The object that needs to be saved in the pickle file.
    arg_path (str): The file path of the pickle file where the object will be saved.

    Returns:
    None

    Example:
    >>> my_object = {"name": "John", "age": 25, "address": "123 Main St"}
    >>> file_path = "./my_object.pkl"
    >>> f_pklsave(my_object, file_path)
    """
    # Dump object to pickle file
    pickle.dump(arg_obj, open(arg_path, "wb"))

<IPython.core.display.Javascript object>

In [48]:
# Function: Group the shorter chunks into chunks of around 1000 tokens and decrease the frequency of breaks within the text
def group_chunks(chunks, ntokens, max_len=1000):
    """
    Group very short chunks, to form approximately a page long chunks.

    Args:
        chunks (List[str]): A list of string chunks to group.
        ntokens (List[int]): A list of integer number of tokens in each chunk.
        max_len (int): Maximum number of tokens to group together.

    Returns:
        List[str]: A list of grouped string chunks.
    """

    batches = []  # initialize a list to store the grouped chunks
    cur_batch = (
        ""  # initialize an empty string to keep track of current group of chunks
    )
    cur_tokens = 0  # initialize a counter for number of tokens

    # iterate over the chunks and group them based on token count
    for chunk, ntoken in zip(chunks, ntokens):
        cur_tokens += (
            ntoken + 2
        )  # increment the token count for current chunk (+2 for newlines between chunks)

        # if adding this chunk would exceed the max length, finalize the current batch and start a new one
        if ntoken + cur_tokens > max_len:
            batches.append(cur_batch)  # add the current batch to the list of batches
            cur_batch = chunk  # start a new batch with the current chunk
        else:
            cur_batch += "\n\n" + chunk  # add the current chunk to the current batch

    batches.append(cur_batch)  # add the last batch to the list of batches
    return batches  # return the list of grouped chunks

<IPython.core.display.Javascript object>

In [49]:
# Function: Translate each chunck of text to english
def translate_chunk(chunk, engine="text-davinci-002", dest_language="English"):
    """
    Translate the input chunk to the specified destination language using OpenAI GPT-3 API.

    Args:
        chunk (str): The input text to be translated.
        engine (str): The GPT-3 engine ID to use for the translation.
        dest_language (str): The destination language to translate the input text to.

    Returns:
        str: The translated text.
    """
    # Generate the prompt to be sent to the OpenAI API
    prompt = f'''Translate only the text from the following text document into {dest_language}.
"""{chunk}"""
'''

    # Send a request to the OpenAI API to translate the chunk
    response = openai.Completion.create(
        prompt=prompt,
        engine=engine,
        temperature=0,
        top_p=1,
        max_tokens=1500,
    )

    # Extract the translated text from the API response
    result = response["choices"][0]["text"].strip()
    # Remove the double quotes, as we used them to surround the text in the prompt
    result = result.replace('"""', "")
    # Return the translated text
    return result

<IPython.core.display.Javascript object>

## Variables

In [3]:
# Folder paths
str_folder_credentials = "credentials/"
str_folder_sources = "sources/"
str_folder_outputs = "outputs/"

<IPython.core.display.Javascript object>

In [4]:
# List files
lst_pdfs = [x for x in os.listdir(str_folder_sources) if x.endswith(".docx")]
lst_pdfs

['RSMV00001500034003.docx']

<IPython.core.display.Javascript object>

In [5]:
# Filename, with and without extension
srt_filename = lst_pdfs[0]
srt_filename_next = srt_filename.split(".")[0]

<IPython.core.display.Javascript object>

In [42]:
# Filename and folders
str_path_input = str_folder_sources + srt_filename
str_path_output_pkl = str_folder_outputs + srt_filename_next + ".pkl"
str_path_output_txt = str_folder_outputs + srt_filename_next + ".txt"
str_path_output_txt_en = str_folder_outputs + srt_filename_next + "_en.txt"

<IPython.core.display.Javascript object>

In [43]:
srt_filename, str_path_input, str_path_output_pkl, str_path_output_txt, str_path_output_txt_en

('RSMV00001500034003.docx',
 'sources/RSMV00001500034003.docx',
 'outputs/RSMV00001500034003.pkl',
 'outputs/RSMV00001500034003.txt',
 'outputs/RSMV00001500034003_en.txt')

<IPython.core.display.Javascript object>

### Authentication for APIs

In [8]:
# Load credentials
# Create dictionary to store credentials
lst_cred = {}
# Loop through folder and load json file with credentials
for x in [x for x in os.listdir(str_folder_credentials)]:
    lst_cred[x.split(".")[0]] = json.load(open(str_folder_credentials + x, "r"))

<IPython.core.display.Javascript object>

In [9]:
OPENAI_API_KEY = lst_cred["yahoo"]["OPENAI_API_KEY"]
PINECONE_API_KEY = lst_cred["mail"]["PINECONE_API_KEY"]
PINECONE_API_ENV = lst_cred["yahoo"]["PINECONE_API_ENV"]

<IPython.core.display.Javascript object>

## Method 1: Translating the document to english

### Ingestion of data

In [27]:
# Load credentials for OpenAI
openai.api_key = OPENAI_API_KEY

<IPython.core.display.Javascript object>

#### docx to txt

In [11]:
# Convert the txt file to plain text format and save it as a new file
pypandoc.convert_file(str_path_input, "plain", outputfile=str_path_output_txt)

''

<IPython.core.display.Javascript object>

In [15]:
# Read converted text 
str_text_raw = open(str_path_output_txt, mode="r", encoding="utf8").read()

<IPython.core.display.Javascript object>

In [125]:
# Load spanish file
loader_es = TextLoader(str_path_output_txt, encoding="utf8")

<IPython.core.display.Javascript object>

####  Count and split

In [17]:
# Import the GPT2Tokenizer from the transformers library and create an instance of the tokenizer using the GPT-2 model
# OpenAI GPT-2 tokenizer is the same as GPT-3 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

In [18]:
# split the input text by double line breaks
chunks = str_text_raw.split("\n\n")

# initialize an empty list to store the number of tokens in each chunk
ntokens = []

# iterate over the chunks and calculate the number of tokens in each one using the GPT2 tokenizer
for chunk in chunks:
    ntokens.append(len(tokenizer.encode(chunk)))

# find the maximum number of tokens in any chunk
max(ntokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (1216 > 1024). Running this sequence through the model will result in indexing errors


8263

<IPython.core.display.Javascript object>

In [21]:
chunks = group_chunks(chunks, ntokens)
len(chunks)

1709

<IPython.core.display.Javascript object>

#### Tests

In [25]:
chunks[803]

'El Agente debe contar con un sistema de control que cumpla con las\ndisposiciones contenidas en el presente Reglamento y con las demás\ndisposiciones que aprueba la SMV.'

<IPython.core.display.Javascript object>

In [32]:
print(translate_chunk(chunks[803]))

The Agent must have a control system that complies with the provisions contained in this Regulation and with the other provisions approved by the SMV.


<IPython.core.display.Javascript object>

### Translation

In [33]:
# Create an empty list to hold the translated chunks.
translated_chunks = []
# Iterate over each chunk in the list of chunks
for i, chunk in enumerate(chunks):
    # Print the progress of the loop
    print(str(i + 1) + " / " + str(len(chunks)))
    # Call the translate_chunk function on the current chunk and append the result to the list of translated chunks
    translated_chunks.append(translate_chunk(chunk))

1 / 1709
2 / 1709
3 / 1709
4 / 1709
5 / 1709
6 / 1709
7 / 1709
8 / 1709
9 / 1709
10 / 1709
11 / 1709
12 / 1709
13 / 1709
14 / 1709
15 / 1709
16 / 1709
17 / 1709
18 / 1709
19 / 1709
20 / 1709
21 / 1709
22 / 1709
23 / 1709
24 / 1709
25 / 1709
26 / 1709
27 / 1709
28 / 1709
29 / 1709
30 / 1709
31 / 1709
32 / 1709
33 / 1709
34 / 1709
35 / 1709
36 / 1709
37 / 1709
38 / 1709
39 / 1709
40 / 1709
41 / 1709
42 / 1709
43 / 1709
44 / 1709
45 / 1709
46 / 1709
47 / 1709
48 / 1709
49 / 1709
50 / 1709
51 / 1709
52 / 1709
53 / 1709
54 / 1709
55 / 1709
56 / 1709
57 / 1709
58 / 1709
59 / 1709
60 / 1709
61 / 1709
62 / 1709
63 / 1709
64 / 1709
65 / 1709
66 / 1709
67 / 1709
68 / 1709
69 / 1709
70 / 1709
71 / 1709
72 / 1709
73 / 1709
74 / 1709
75 / 1709
76 / 1709
77 / 1709
78 / 1709
79 / 1709
80 / 1709
81 / 1709
82 / 1709
83 / 1709
84 / 1709
85 / 1709
86 / 1709
87 / 1709
88 / 1709
89 / 1709
90 / 1709
91 / 1709
92 / 1709
93 / 1709
94 / 1709
95 / 1709
96 / 1709
97 / 1709
98 / 1709
99 / 1709
100 / 1709
101 / 17

756 / 1709
757 / 1709
758 / 1709
759 / 1709
760 / 1709
761 / 1709
762 / 1709
763 / 1709
764 / 1709
765 / 1709
766 / 1709
767 / 1709
768 / 1709
769 / 1709
770 / 1709
771 / 1709
772 / 1709
773 / 1709
774 / 1709
775 / 1709
776 / 1709
777 / 1709
778 / 1709
779 / 1709
780 / 1709
781 / 1709
782 / 1709
783 / 1709
784 / 1709
785 / 1709
786 / 1709
787 / 1709
788 / 1709
789 / 1709
790 / 1709
791 / 1709
792 / 1709
793 / 1709
794 / 1709
795 / 1709
796 / 1709
797 / 1709
798 / 1709
799 / 1709
800 / 1709
801 / 1709
802 / 1709
803 / 1709
804 / 1709
805 / 1709
806 / 1709
807 / 1709
808 / 1709
809 / 1709
810 / 1709
811 / 1709
812 / 1709
813 / 1709
814 / 1709
815 / 1709
816 / 1709
817 / 1709
818 / 1709
819 / 1709
820 / 1709
821 / 1709
822 / 1709
823 / 1709
824 / 1709
825 / 1709
826 / 1709
827 / 1709
828 / 1709
829 / 1709
830 / 1709
831 / 1709
832 / 1709
833 / 1709
834 / 1709
835 / 1709
836 / 1709
837 / 1709
838 / 1709
839 / 1709
840 / 1709
841 / 1709
842 / 1709
843 / 1709
844 / 1709
845 / 1709
846 / 1709

1460 / 1709
1461 / 1709
1462 / 1709
1463 / 1709
1464 / 1709
1465 / 1709
1466 / 1709
1467 / 1709
1468 / 1709
1469 / 1709
1470 / 1709
1471 / 1709
1472 / 1709
1473 / 1709
1474 / 1709
1475 / 1709
1476 / 1709
1477 / 1709
1478 / 1709
1479 / 1709
1480 / 1709
1481 / 1709
1482 / 1709
1483 / 1709
1484 / 1709
1485 / 1709
1486 / 1709
1487 / 1709
1488 / 1709
1489 / 1709
1490 / 1709
1491 / 1709
1492 / 1709
1493 / 1709
1494 / 1709
1495 / 1709
1496 / 1709
1497 / 1709
1498 / 1709
1499 / 1709
1500 / 1709
1501 / 1709
1502 / 1709
1503 / 1709
1504 / 1709
1505 / 1709
1506 / 1709
1507 / 1709
1508 / 1709
1509 / 1709
1510 / 1709
1511 / 1709
1512 / 1709
1513 / 1709
1514 / 1709
1515 / 1709
1516 / 1709
1517 / 1709
1518 / 1709
1519 / 1709
1520 / 1709
1521 / 1709
1522 / 1709
1523 / 1709
1524 / 1709
1525 / 1709
1526 / 1709
1527 / 1709
1528 / 1709
1529 / 1709
1530 / 1709
1531 / 1709
1532 / 1709
1533 / 1709
1534 / 1709
1535 / 1709
1536 / 1709
1537 / 1709
1538 / 1709
1539 / 1709
1540 / 1709
1541 / 1709
1542 / 1709
1543

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 4814 tokens (3314 in your prompt; 1500 for the completion). Please reduce your prompt; or completion length.

<IPython.core.display.Javascript object>

In [41]:
# Join the chunks together
result = "\n\n".join(translated_chunks)

<IPython.core.display.Javascript object>

In [45]:
# Saves the translated text into a file
open(str_path_output_txt_en, "w", encoding="utf8").write(result)

270525

<IPython.core.display.Javascript object>

In [116]:
# Load english file
loader_en = TextLoader(str_path_output_txt_en)

<IPython.core.display.Javascript object>

### Text to vector database 

#### Split text

In [55]:
# Using PdfReader
# We need to split the text that we read into smaller chunks so that during information retreival we don't hit the token size limits
text_splitter = RecursiveCharacterTextSplitter(
    separators="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

<IPython.core.display.Javascript object>

In [None]:
texts = text_splitter.split_text(result)

In [67]:
print(f"You have {len(texts)} document(s) in your data")
print(f"There are {sum(len(i) for i in texts)} characters in your document")

You have 350 document(s) in your data
There are 306013 characters in your document


<IPython.core.display.Javascript object>

#### Embeddings

In [73]:
# Create an instance of the OpenAIEmbeddings class
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

<IPython.core.display.Javascript object>

In [74]:
# Use the from_texts() function to convert each document into a vector
db_faiss = FAISS.from_texts(texts, embeddings)

<IPython.core.display.Javascript object>

In [77]:
# Save the FAISS database to pkl object
# Filename
str_path_output = (
    str_path_output_txt_en.split(".")[0]
    + "db_faiss."
    + str_path_output_txt_en.split(".")[1]
)
# Store vector database
f_pklsave(db_faiss, str_path_output_pkl)

<IPython.core.display.Javascript object>

### Query data

In [78]:
# Create new instance of the OpenAI class
# llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

<IPython.core.display.Javascript object>

In [80]:
# Question that the NLP chain will attempt to answer
query = "What is this document about?"

<IPython.core.display.Javascript object>

In [82]:
# Create a retriever from the vector database
retriever = db_faiss.as_retriever(search_type="similarity", search_kwargs={"k": 2})

<IPython.core.display.Javascript object>

#### `load_qa_chain`

In [83]:
# Load a pre-trained question-answering (QA) chain and creates an instance of it
chain = load_qa_chain(llm=llm, chain_type="stuff")

<IPython.core.display.Javascript object>

In [84]:
# Query string as input and searches the vector database for similar documents
# docs variable is a list of documents that are similar to the query string, ordered by their similarity score.
docs = db_faiss.similarity_search(query)

<IPython.core.display.Javascript object>

In [85]:
# Processes the input documents and the question using a pre-trained machine learning model and returns an answer to the question
chain.run(input_documents=docs, question=query)

' This document is about the requirements and obligations of an auditing company and the Agent in regards to foreign trade, derivative instruments, prudential indicators, portfolio management, computer information processing systems, and other related operations and areas.'

<IPython.core.display.Javascript object>

#### `RetrievalQA`

In [86]:
# create a chain to answer questions
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)

<IPython.core.display.Javascript object>

In [87]:
result = qa({"query": query})

<IPython.core.display.Javascript object>

In [88]:
retriever.get_relevant_documents(query)

[Document(page_content="The following text document deals with foreign trade, derivative instruments, prudential indicators, portfolio management, computer information processing systems, as well as the main operations and areas that the auditing company considers most important for the Agent.\n\nArticle 140.- Sustentation\n\nIn accordance with the SMV's requirements, the auditor who has signed the opinions or reports mentioned in this title must support them before it and demonstrate that the corresponding auditing work was carried out in accordance with the Regulations.\n\nThe aforementioned justification and demonstration must be carried out only in the meetings that the SMV convenes or specifically sets for this purpose, and exclusively on the basis of the evaluation of the working papers that the auditor presents during them.", metadata={}),
 Document(page_content='The means used for the delivery or making available of the documents referred to in the previous paragraph must be in

<IPython.core.display.Javascript object>

In [89]:
result

{'query': 'What is this document about?',
 'result': ' This document covers foreign trade, derivative instruments, prudential indicators, portfolio management, computer information processing systems, as well as the main operations and areas that the auditing company considers most important for the Agent. It also outlines the requirements for the auditor to support their opinions and reports, as well as the mechanisms that must be in place to ensure the integrity and security of the contracting process and its subsequent verification. Finally, it includes Article 32, which outlines the customer policy.',
 'source_documents': [Document(page_content="The following text document deals with foreign trade, derivative instruments, prudential indicators, portfolio management, computer information processing systems, as well as the main operations and areas that the auditing company considers most important for the Agent.\n\nArticle 140.- Sustentation\n\nIn accordance with the SMV's requireme

<IPython.core.display.Javascript object>

#### `VectorstoreIndexCreator`

In [128]:
# Create a searchable index of embeddings based on a set of input documents
index_en = VectorstoreIndexCreator(
    # split the documents into chunks
    text_splitter=text_splitter,
    # select which embeddings we want to use
    embedding=embeddings,
    # use Chroma as the vectorestore to index and search embeddings
    vectorstore_cls=Chroma,
).from_loaders([loader_en])



<IPython.core.display.Javascript object>

In [130]:
index_en.query(llm=llm, question=query, chain_type="stuff")

' This document is about the requirements for auditing work, customer policies, and the declaration of data accuracy for an Intermediation Agent.'

<IPython.core.display.Javascript object>

In [131]:
# Question that the NLP chain will attempt to answer
query = "What type of firms this document talks about?"

<IPython.core.display.Javascript object>

In [133]:
index_en.query(llm=llm, question=query, chain_type="stuff")

' This document talks about auditing firms.'

<IPython.core.display.Javascript object>

## Method 2: Asking questions in spanish

### Text to vector database

#### Split text

In [93]:
texts_es = text_splitter.split_text(str_text_raw)

<IPython.core.display.Javascript object>

In [94]:
print(f"You have {len(texts)} document(s) in your data")
print(f"There are {sum(len(i) for i in texts)} characters in your document")

You have 350 document(s) in your data
There are 306013 characters in your document


<IPython.core.display.Javascript object>

#### Embeddings

In [95]:
# Use the from_texts() function to convert each document into a vector
db_faiss_es = FAISS.from_texts(texts, embeddings)

<IPython.core.display.Javascript object>

In [96]:
str_path_output_txt.split(".")

['outputs/RSMV00001500034003', 'txt']

<IPython.core.display.Javascript object>

In [97]:
# Save the FAISS database to pkl object
# Filename
str_path_output = str_path_output_txt.split(".")[0] + "db_faiss_es.pkl"
# Store vector database
f_pklsave(db_faiss_es, str_path_output_pkl)

<IPython.core.display.Javascript object>

### Query data

In [98]:
# Question that the NLP chain will attempt to answer
query_es = "List the type of entities in the document"

<IPython.core.display.Javascript object>

In [99]:
# Create a retriever from the vector database
retriever_es = db_faiss_es.as_retriever(
    search_type="similarity", search_kwargs={"k": 2}
)

<IPython.core.display.Javascript object>

#### `load_qa_chain`

In [100]:
# Query string as input and searches the vector database for similar documents
# docs variable is a list of documents that are similar to the query string, ordered by their similarity score.
docs_es = db_faiss_es.similarity_search(query_es)

<IPython.core.display.Javascript object>

In [101]:
# Processes the input documents and the question using a pre-trained machine learning model and returns an answer to the question
chain.run(input_documents=docs_es, question=query_es)

' Financial Instruments, Agent, Deposit or Custody Institution, Banks of the national financial system or abroad'

<IPython.core.display.Javascript object>

#### `RetrievalQA`

In [106]:
# Question that the NLP chain will attempt to answer
query_es = "Define el tipo de empresas que menciona el documento"

<IPython.core.display.Javascript object>

In [107]:
# create a chain to answer questions
qa_es = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever_es, return_source_documents=True
)

<IPython.core.display.Javascript object>

In [108]:
result_es = qa_es({"query": query_es})

<IPython.core.display.Javascript object>

In [109]:
retriever_es.get_relevant_documents(query_es)

[Document(page_content='The policy may include more than one operation of the same type corresponding to the same value, if these have been carried out on the same day, in which case, for each operation, the corresponding detail must be presented.\n\nThis is a text document.\n\nThis is a text document.\n\n"(*) Paragraph modified by RSUP Nº 024-2017-SMV/01"\n\nArticle 58.- Intermediation Accounts\n\nThe Agent must maintain at least two bank accounts for intermediation, which must be constituted in banks of the national financial system or abroad, and must be destined exclusively for their intermediation activities, one account must be destined for operations of third parties and another for operations on their own account. These accounts are used to concentrate the funds corresponding to charges and payments of customers, the own funds for operations of the Agent, which are centralized for the settlement of positions with other institutions, those coming from dividends or interests rece

<IPython.core.display.Javascript object>

In [110]:
result_es

{'query': 'Define el tipo de empresas que menciona el documento',
 'result': ' El documento se refiere a los agentes de intermediación financiera.',
 'source_documents': [Document(page_content='The policy may include more than one operation of the same type corresponding to the same value, if these have been carried out on the same day, in which case, for each operation, the corresponding detail must be presented.\n\nThis is a text document.\n\nThis is a text document.\n\n"(*) Paragraph modified by RSUP Nº 024-2017-SMV/01"\n\nArticle 58.- Intermediation Accounts\n\nThe Agent must maintain at least two bank accounts for intermediation, which must be constituted in banks of the national financial system or abroad, and must be destined exclusively for their intermediation activities, one account must be destined for operations of third parties and another for operations on their own account. These accounts are used to concentrate the funds corresponding to charges and payments of customer

<IPython.core.display.Javascript object>

#### `VectorstoreIndexCreator`

In [126]:
# Create a searchable index of embeddings based on a set of input documents
index = VectorstoreIndexCreator(
    # split the documents into chunks
    text_splitter=text_splitter,
    # select which embeddings we want to use
    embedding=embeddings,
    # use Chroma as the vectorestore to index and search embeddings
    vectorstore_cls=Chroma,
).from_loaders([loader_es])



<IPython.core.display.Javascript object>

In [127]:
index.query(llm=llm, question=query_es, chain_type="stuff")

' El documento se refiere a Sociedades Agentes de Bolsa, Sociedades Intermediarias de Valores, inversionistas institucionales, y algunas otras entidades financieras.'

<IPython.core.display.Javascript object>

## References

**Open AI Cookbook**
<br />
<br />
[Translate a book writen in LaTeX from Slovenian into English](https://github.com/openai/openai-cookbook/blob/main/examples/book_translation/translate_latex_book.ipynb)