In [3]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from a .env file into the process environment. The functions
# below read PINECONE_API_KEY and PINECONE_ENV through os.environ.
# override=True is passed so values from the file take precedence over
# variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [4]:
#pip install pypdf -q

In [5]:
#pip install docx2txt -q

In [6]:
#pip install wikipedia -q

## Functions

In [7]:
def load_document(file):
    """Load a PDF or DOCX file through the matching LangChain loader.

    Args:
        file: Path of the document. The loader is chosen from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        The value returned by the loader's ``load()`` call, or ``None`` when
        the extension is not supported (a message is printed in that case).
    """
    import os

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        # Original author's note: each document contains the page, content
        # and metadata with a page number.
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        # Previously execution fell through to `loader.load()` with `loader`
        # unbound, raising UnboundLocalError right after this message.
        print('Document format is not supported!')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

def load_from_wikipedia(query,lang='en', load_max_docs=2):
    """Fetch Wikipedia content for ``query`` via LangChain's WikipediaLoader.

    Args:
        query: Search text handed to the loader.
        lang: Language code handed to the loader (defaults to 'en').
        load_max_docs: Upper bound on documents, handed to the loader.

    Returns:
        Whatever the loader's ``load()`` call returns.
    """
    from langchain.document_loaders import WikipediaLoader

    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()

In [8]:
def chunk_data(data, chunk_size=256):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split, passed to ``split_documents``.
        chunk_size: ``chunk_size`` setting for the splitter (default 256).

    Returns:
        The result of ``split_documents``. Chunks are built with
        ``chunk_overlap=0``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': 0}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

In [9]:
def print_embedding_cost(texts):
    """Print the token count of ``texts`` and an estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.

    Tokens are counted with the tiktoken encoding for
    'text-embedding-ada-002'. The cost is computed at 0.0004 USD per
    1000 tokens. Nothing is returned; both figures are printed.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-ada-002')

    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    cost_usd = total_tokens / 1000 * 0.0004
    print(f'Total tokens: {total_tokens}')
    print(f'Embeeding cost in USD: {cost_usd:.6f}')

In [10]:
def insert_of_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store, creating and filling the index if needed.

    Args:
        index_name: Name of the Pinecone index to reuse or create.
        chunks: Documents to embed when the index has to be created. When
            omitted, the module-level ``chunks`` variable is used, which
            keeps the original one-argument call working.

    Returns:
        A LangChain ``Pinecone`` vector store bound to ``index_name``.

    Raises:
        ValueError: The index does not exist and no chunks are available.
            Raised before the index is created, so no empty index is left
            behind.
    """
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
        return vector_store

    if chunks is None:
        # Backward compatibility: the original body read the notebook-level
        # `chunks` variable directly.
        chunks = globals().get('chunks')
    if chunks is None:
        raise ValueError(
            f'Index {index_name} does not exist and no chunks were provided to build it.'
        )

    print(f'Creating Index {index_name} and embeddings', end='')
    # NOTE(review): dimension=1536 presumably matches the vector size of the
    # default OpenAIEmbeddings model; confirm if the embedding model changes.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Ok')

    return vector_store

In [11]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default value 'all'
            deletes every index returned by ``pinecone.list_indexes()``.

    Credentials are read from the PINECONE_API_KEY and PINECONE_ENV
    environment variables. Nothing is returned.
    """
    import pinecone
    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name=='all':
        indexes = pinecone.list_indexes()
        print('Deleting all the indexes...')
        for index in indexes:
            pinecone.delete_index(index)
        print('ok')
    else:
        print(f'Deleting inde {index_name}...', end='')
        # Fixed: this used to pass `index`, which is only bound inside the
        # loop of the other branch, so deleting a single index always raised
        # UnboundLocalError.
        pinecone.delete_index(index_name)
        print('ok')
        

In [38]:
def ask_and_get_answer(vector_store, q):
    """Answer ``q`` with a RetrievalQA chain backed by ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever``; used as the
            retrieval source with a similarity search limited to k=3.
        q: The question text handed to the chain.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    # Arguments are evaluated in the same order as before: model first,
    # then the retriever, then the chain is assembled.
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo',temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3}),
    )
    return qa_chain.run(q)
    

In [58]:
def ask_with_memory(vector_store, q, chat_history=None):
    """Ask ``q`` through a ConversationalRetrievalChain, keeping chat history.

    Args:
        vector_store: Object exposing ``as_retriever``; used as the
            retrieval source with a similarity search limited to k=3.
        q: The question for this turn.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. It is extended in place with this turn. When omitted, a
            new empty list is used for this call.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result (its
        'answer' entry holds the reply) and the updated history list.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A `[]` default would be created once and shared by every call that
    # omits the argument, leaking history between unrelated conversations.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

    crc = ConversationalRetrievalChain.from_llm(llm,retriever)
    result = crc({'question': q, 'chat_history': chat_history})

    # Fixed: this used to append `question`, which is not a parameter here.
    # It only worked when the caller happened to define a global with that
    # name, and would record the wrong text otherwise.
    chat_history.append((q, result['answer']))

    return result, chat_history


## Running code

### Uploading Data

In [40]:
#data = load_document('files/Análisis+de+un+sector+2.0.pdf')
#print(data[1].page_content)
#print(data[1].metadata)
#print(f'You have {len(data)} pages in your data')
#print(f'There are {len(data[2].page_content)} characters in the page')

Loading files/Análisis+de+un+sector+2.0.pdf


In [41]:
#data = load_document('files/00217 - Vivir Sin Jefe - Sergio Fernández.pdf')
#print(data[1].page_content)
#print(data[1].metadata)
#print(f'You have {len(data)} pages in your data')
#print(f'There are {len(data[2].page_content)} characters in the page')

In [49]:
# Source documents for the rest of the notebook: Wikipedia content for the
# query 'GPT-4', requested with language code 'es'.
data = load_from_wikipedia('GPT-4', 'es')
#print(data[0].page_content)



  lis = BeautifulSoup(html).find_all('li')


### Chunking Data

In [50]:
# Split the loaded documents using chunk_data's default chunk_size.
# `chunks` is reused by the cost-estimate and embedding cells below.
chunks = chunk_data(data)
print(f'We have a total of chunks of {len(chunks)}')
#print(chunks[1000].page_content)

We have a total of chunks of 22


### Calculating Cost

In [51]:
# Print the token count and estimated cost before the chunks are embedded.
print_embedding_cost(chunks)

Total tokens: 1152
Embeeding cost in USD: 0.000461


### Embedding and Uploading to a Vector DB

In [52]:
# WARNING: called without arguments, index_name defaults to 'all', so this
# deletes every index returned by pinecone.list_indexes() for the configured
# account, not just this notebook's index.
delete_pinecone_index()

Deleting all the indexes...
ok


In [53]:
# Name of the Pinecone index to reuse or create.
index_name = 'chatgpt'
# If the index has to be created, it is filled from the notebook-level
# `chunks` variable, so the chunking cell must have been run first.
vector_store = insert_of_fetch_embeddings(index_name)


Creating Index chatgpt and embeddingsOk


### Asking Questions

In [54]:
# One-off question against the vector store built above.
q = 'What is chatpGPT'
answer = ask_and_get_answer(vector_store,q)
#result = vector_store.similarity_search(q)
print(answer)

ChatGPT is a product developed by OpenAI. It is powered by the GPT-3.5 model, which is an advanced version of GPT-3, a large language model. ChatGPT is designed to generate responses and hold conversations in a chat-like format. It is capable of understanding and generating human-like text based on the input it receives. ChatGPT has been trained on a vast amount of data and can be accessed by users through the OpenAI API or the ChatGPT Plus subscription.


In [55]:
import time

# Interactive question loop: keeps asking until the user types quit or exit.
i = 1
# Fixed: the prompt used to say "Quite", a word the loop does not accept.
print('Write Quit or Exit to quit.')
while True:
    # Strip surrounding whitespace so input such as "quit " still ends the loop.
    q = input(f'Question #{i}: ').strip()
    if q.lower() in ['quit', 'exit']:
        print('Quitting... bye!')
        time.sleep(2)
        break
    if not q:
        # Blank input: ask again instead of sending an empty question to the
        # LLM chain.
        continue
    # Count only questions that are actually asked.
    i = i + 1
    answer = ask_and_get_answer(vector_store,q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-"*50} \n')

Write Quite or Exit to quit.
Question #1: When was chat GPT launched?

Answer: ChatGPT was launched on March 14, 2023.

 -------------------------------------------------- 

Question #2: Until what info can we ask it?

Answer: You can ask GPT-4 for information such as summarizing text, analyzing screenshots, and answering questions related to exams with diagrams. However, the extent of its capabilities to provide information may vary depending on the specific instructions given to it.

 -------------------------------------------------- 

Question #3: Can I ask about information happened in 2022?

Answer: No, I can only provide information up until the context given, which is up until the year 2020. I do not have any information about events or developments that occurred in 2022.

 -------------------------------------------------- 

Question #4: que es github copilot?

Answer: GitHub Copilot es una herramienta de programación desarrollada por GitHub y OpenAI. Es un asistente de codifi

KeyboardInterrupt: Interrupted by user

### Asking with memory

In [60]:
# First turn of a conversation: start from an empty history and keep the
# updated history returned by ask_with_memory for the follow-up cell.
chat_history = []
question = 'Cuandos se creó GPT-4 '
result, chat_history = ask_with_memory(vector_store,question,chat_history)
print(result['answer'])
print(chat_history)

GPT-4 se creó el 14 de marzo de 2023.
[('Cuandos se creó GPT-4 ', 'GPT-4 se creó el 14 de marzo de 2023.')]


In [None]:
# Follow-up turn: passes the history from the previous cell, so that cell
# must be run first. The question refers back to the earlier answer.
question = 'Sumale a esa fecha 43 días, qué fecha es? '
result, chat_history = ask_with_memory(vector_store,question,chat_history)
print(result['answer'])
print(chat_history)