In [4]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from a .env file into the process environment. The Pinecone
# helpers defined below read PINECONE_API_KEY and PINECONE_ENV from os.environ.
# override=True lets values from the file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [5]:
#pip install pypdf -q

In [6]:
#pip install docx2txt -q

In [7]:
#pip install wikipedia -q

## Functions

In [8]:
def load_document(file):
    """Load a PDF or DOCX file through the matching LangChain loader.

    Args:
        file: Path to the document. The loader is selected from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        The result of the selected loader's ``load()`` call, or ``None`` when
        the extension is not supported (a message is printed in that case).
    """
    import os

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        # each document contains the page, content and metadata with a page number
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        # Stop here: there is no loader to call for an unknown format.
        print('Document format is not supported!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

def load_from_wikipedia(query,lang='en', load_max_docs=2):
    """Load Wikipedia content for ``query`` via LangChain's WikipediaLoader.

    Args:
        query: Search text passed to the loader.
        lang: Language code passed to the loader (default 'en').
        load_max_docs: Passed to the loader as ``load_max_docs`` (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()

In [9]:
def chunk_data(data, chunk_size=256):
    """Split documents with a RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split, passed to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter (default 256).
            The splitter is always configured with ``chunk_overlap=0``.

    Returns:
        The result of the splitter's ``split_documents(data)`` call.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=0,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(data)

In [16]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for ``texts``.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to select the tiktoken encoding.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default is the
            value that was hard-coded here before.
            NOTE(review): confirm it against current pricing for ``model``.

    Returns:
        None. The two summary lines are printed to stdout.
    """
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    cost = total_tokens / 1000 * price_per_1k_tokens
    print(f'Total tokens: {total_tokens}')
    print(f'Embeeding cost in USD: {cost:.6f}')

In [38]:
def insert_of_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened with ``Pinecone.from_existing_index``.
    Otherwise the index is created and populated from ``chunks`` with
    ``Pinecone.from_documents``.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the module-level ``chunks`` variable is used, which keeps
            existing ``insert_of_fetch_embeddings(index_name)`` calls working.

    Returns:
        The LangChain ``Pinecone`` vector store.

    Raises:
        ValueError: If the index does not exist and no chunks are available.
    """
    import os

    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
        return vector_store

    # Resolve the documents BEFORE creating the index, so a missing input does
    # not leave an empty index behind that later runs would treat as populated.
    if chunks is None:
        chunks = globals().get('chunks')
    if chunks is None:
        raise ValueError(
            f'Index {index_name} does not exist and no chunks were provided to create it'
        )

    print(f'Creating Index {index_name} and embeddings', end='')
    # dimension=1536 presumably matches the default OpenAIEmbeddings model -- TODO confirm
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Ok')

    return vector_store

In [28]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default 'all' deletes
            every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress messages are printed to stdout.
    """
    import os

    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name == 'all':
        print('Deleting all the indexes...')
        for existing_index in pinecone.list_indexes():
            pinecone.delete_index(existing_index)
        print('ok')
    else:
        print(f'Deleting inde {index_name}...', end='')
        # Delete the requested index by its name.
        pinecone.delete_index(index_name)
        print('ok')
        

## Running code

### Uploading Data

In [29]:
#data = load_document('files/Análisis+de+un+sector+2.0.pdf')
#print(data[1].page_content)
#print(data[1].metadata)
#print(f'You have {len(data)} pages in your data')
#print(f'There are {len(data[2].page_content)} characters in the page')

In [30]:
# Load the source PDF; `data` is reused by the chunking cell further down.
data = load_document('files/00217 - Vivir Sin Jefe - Sergio Fernández.pdf')
#print(data[1].page_content)
# Sanity checks: metadata of the second item, number of items loaded, and the
# character count of the third item (assumes at least three items were loaded).
print(data[1].metadata)
print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[2].page_content)} characters in the page')

Loading files/00217 - Vivir Sin Jefe - Sergio Fernández.pdf
{'source': 'files/00217 - Vivir Sin Jefe - Sergio Fernández.pdf', 'page': 1}
You have 138 pages in your data
There are 82 characters in the page


In [31]:
#data = load_from_wikipedia('GPT-4', 'es')
#print(data[0].page_content)

### Chunking Data

In [32]:
# Split the loaded documents using chunk_data's default chunk_size (256).
# `chunks` is also read as a global by insert_of_fetch_embeddings.
chunks = chunk_data(data)
print(f'We have a total of chunks of {len(chunks)}')
# Spot-check a single chunk; index 1000 is hard-coded, so this line fails when
# fewer than 1001 chunks were produced.
print(chunks[1000].page_content)

We have a total of chunks of 1367
Jugar a ser usted de mayor 
 
Error 37. No dar la apariencia de estar siempre ocupado
Creyendo apasionadamente en algo que aún no existe, lo creamos. Lo que no existe
es aquello que no hemos deseado lo suficiente. (Pintada encontrada en Leicester


### Calculating Cost

In [33]:
# Print the total token count and the estimated embedding cost for all chunks.
print_embedding_cost(chunks)

Total tokens: 85574
Embeeding cost in USD: 0.034230


### Embedding and Uploading to a Vector DB

In [40]:
# WARNING: called without an argument, index_name defaults to 'all', so this
# deletes every index returned by pinecone.list_indexes().
delete_pinecone_index()

Deleting all the indexes...
ok


In [41]:
# Load the index if it already exists, otherwise create it and upload embeddings.
# When the index has to be created, the helper reads the notebook-level `chunks`
# variable, so the chunking cell must have been run first.
index_name = 'askadocument'
vector_store = insert_of_fetch_embeddings(index_name)

Creating Index askadocument and embeddingsOk
