In [2]:
import configparser
import os

config = configparser.RawConfigParser()
config.read('../keys.config')
os.environ["ACTIVELOOP_TOKEN"] = config.get('keys', 'active_loop_key')
os.environ["OPENAI_API_KEY"] = config.get('keys', 'open_ai_key')


In [None]:
# %pip install --upgrade langchain==0.0.208 deeplake openai typing_extensions==4.5.0

In [24]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# Before executing the following code, make sure to have your
# Activeloop key saved in the “ACTIVELOOP_TOKEN” environment variable.

# instantiate the LLM and embeddings models
llm = OpenAI(model="text-davinci-003", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

f = open('../source/energy_week2.txt')
row = f.readlines()

texts = []
metadatas = []
for r in row:
    if '•	' in r:
        if r.replace('•	', '') in texts:
            continue
        texts.append(r.replace('•	', ''))
        metadatas.append({"query": ""})

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separators="\n")
# docs = text_splitter.create_documents(texts, metadatas=metadatas)

docs = [Document(page_content=t) for t in texts]

# create Deep Lake dataset
# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = "braduck" 
my_activeloop_dataset_name = "espresso"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

# add documents to our Deep Lake dataset
db.delete(delete_all=True)
db.add_documents(docs)

Deep Lake Dataset in hub://braduck/espresso already exists, loading from the storage
Your Deep Lake dataset has been successfully created!


|

Dataset(path='hub://braduck/espresso', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
 embedding  embedding  (1633, 1536)  float32   None   
    id        text      (1633, 1)      str     None   
 metadata     json      (1633, 1)      str     None   
   text       text      (1633, 1)      str     None   


 

['7f4622f0-2972-11ee-a47d-be20711fe3ae',
 '7f462368-2972-11ee-a47d-be20711fe3ae',
 '7f46237c-2972-11ee-a47d-be20711fe3ae',
 '7f462390-2972-11ee-a47d-be20711fe3ae',
 '7f46239a-2972-11ee-a47d-be20711fe3ae',
 '7f4623ae-2972-11ee-a47d-be20711fe3ae',
 '7f4623c2-2972-11ee-a47d-be20711fe3ae',
 '7f4623cc-2972-11ee-a47d-be20711fe3ae',
 '7f4623e0-2972-11ee-a47d-be20711fe3ae',
 '7f4623ea-2972-11ee-a47d-be20711fe3ae',
 '7f4623fe-2972-11ee-a47d-be20711fe3ae',
 '7f462408-2972-11ee-a47d-be20711fe3ae',
 '7f462412-2972-11ee-a47d-be20711fe3ae',
 '7f462426-2972-11ee-a47d-be20711fe3ae',
 '7f462430-2972-11ee-a47d-be20711fe3ae',
 '7f462444-2972-11ee-a47d-be20711fe3ae',
 '7f46244e-2972-11ee-a47d-be20711fe3ae',
 '7f462462-2972-11ee-a47d-be20711fe3ae',
 '7f46246c-2972-11ee-a47d-be20711fe3ae',
 '7f462476-2972-11ee-a47d-be20711fe3ae',
 '7f46248a-2972-11ee-a47d-be20711fe3ae',
 '7f462494-2972-11ee-a47d-be20711fe3ae',
 '7f4624a8-2972-11ee-a47d-be20711fe3ae',
 '7f4624b2-2972-11ee-a47d-be20711fe3ae',
 '7f4624bc-2972-

In [27]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":3})
retrieval_qa = RetrievalQA.from_chain_type(
	llm=llm,
	chain_type="stuff",
	retriever=retriever
)

In [28]:
query = query = f"""
Look for the 3 to 5 relevant page_content related to Energy and Geopolitics, according to rules below.\
By Geopolitics, I mean:
- Information related to most important consumers and producers countries of Crude Oil
- Information that impacts China, USA, Europe, or Middle East
Rules:\
- Bring the most relevant paragraphs, based on:
    - Words related to Energy (Crude Oil, Natural Gas, Electricity)
    - Lenght of the paragraph
    - Don't repeat
"""

matching_docs = db.similarity_search(query=query)
print(matching_docs)

[Document(page_content='"the importance of energy in the PPI decline in recent months and how, together with the drop in..." via @eToro $OIL $NATGAS\n', metadata={}), Document(page_content='- #Indias Crude Oil Imports from Russia Fuel Exports to Europe - Turkish Elections Global Energy Markets - European Gas Prices Amid Muted Gas Demand - Global Energy Investment Rising Costs Link: https://t.co/gOLTsp7k11\n', metadata={}), Document(page_content='Briefs on Natural Gas: Gold Strategy US Clean Energy:\n', metadata={}), Document(page_content='Feeling pressured to compete against Russia for market share in China, Irans total crude oil and gas condensate exports are now at 1.9 million barrels per day over the past 30 days. Russia, Iran and Venezuela are keeping us very busy this year. #OOTT\n', metadata={})]


In [19]:
query = f"""
Look for the 3 to 5 relevant page_content related to Energy and Geopolitics, according to rules below.\
By Geopolitics, I mean:
- Information related to most important consumers and producers countries of Crude Oil
- Information that impacts China, USA, Europe, or Middle East
Rules:\
- Bring the most relevant paragraphs, based on:
    - Words related to Energy (Crude Oil, Natural Gas, Electricity)
    - Lenght of the paragraph
"""

result = retrieval_qa({"query": query})
print(result)
# retrieval_qa.run("What are the 4 most relevants paragraphs related to Enery and Geopolitics?")


{'query': '\nLook for the 3 to 5 relevant page_content related to Energy and Geopolitics, according to rules below.By Geopolitics, I mean:\n- Information related to most important consumers and producers countries of Crude Oil\n- Information that impacts China, USA, Europe, or Middle East\nRules:- Bring the most relevant paragraphs, based on:\n    - Words related to Energy (Crude Oil, Natural Gas, Electricity)\n    - Lenght of the paragraph\n', 'result': " The page content provided in the link includes information related to India's crude oil imports from Russia, Turkish elections and their impact on global energy markets, European gas prices amid muted gas demand, and global energy investment rising costs. These topics are all related to energy and geopolitics, as they discuss the most important consumers and producers countries of crude oil, and how these countries and events impact China, the US, Europe, and the Middle East."}
