In [16]:
import configparser
import os

config = configparser.RawConfigParser()
config.read('keys.config')
os.environ["ACTIVELOOP_TOKEN"] = config.get('keys', 'active_loop_key')
os.environ["OPENAI_API_KEY"] = config.get('keys', 'open_ai_key')


In [17]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake

In [18]:
loader = TextLoader("energy_week2.txt", encoding="utf-8")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
for i, text in enumerate(texts):
    text.metadata["source"] = f"{i}-pl"
embeddings = OpenAIEmbeddings()
docsearch = DeepLake.from_documents(texts, embeddings)

# # create Deep Lake dataset
# # TODO: use your organization id here. (by default, org id is your username)
# my_activeloop_org_id = "braduck" 
# my_activeloop_dataset_name = "espresso"
# dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
# db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

# # add documents to our Deep Lake dataset
# db.delete(delete_all=True)
# db.add_documents(docsearch)


Deep Lake Dataset in ./deeplake/ already exists, loading from the storage
Dataset(path='./deeplake/', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (2, 1536)  float32   None   
    id        text      (2, 1)      str     None   
 metadata     json      (2, 1)      str     None   
   text       text      (2, 1)      str     None   




8

In [19]:
from langchain import PromptTemplate
from langchain import FewShotPromptTemplate

# Load Sample
f = open('sample_espresso.txt', 'r')
rows = f.readlines()

sample = []
for r  in rows:
    if "Category:" in r: # New Category
        sample.append({"query": r})
    else:
        if sample[-1].get("answer", None) is None:
            sample[-1]["answer"] = ""
        
        sample[-1]["answer"] += r

# create an example template
example_template = """
User: {query}
AI: {answer}
"""

# create a prompt example from above template
example_prompt = PromptTemplate(
    input_variables=["query", "answer"],
    template=example_template
)

# now break our previous prompt into a prefix and suffix
# the prefix is our instructions
prefix = """The following is a template for Energy-related News, grouped by category.\
    Use this template as a sample to generate new Energy-related News by category.
"""
# and the suffix our user input and output indicator
suffix = """
User: {query}
AI: """

# now create the few-shot prompt template
few_shot_prompt_template = FewShotPromptTemplate(
    examples=sample,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["query"],
    example_separator="\n\n"
)

In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains import create_qa_with_sources_chain

In [20]:
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
qa_chain = create_qa_with_sources_chain(llm)

In [21]:
doc_prompt = PromptTemplate(
    template="Content: {page_content}\nSource: {source}",
    input_variables=["page_content", "source"],
)

In [12]:
# final_qa_chain = StuffDocumentsChain(
#     llm_chain=qa_chain,
#     document_variable_name="context",
#     document_prompt=doc_prompt,
# )
final_qa_chain = StuffDocumentsChain(
    llm_chain=qa_chain,
    document_variable_name="context",
    document_prompt=few_shot_prompt_template,
)

In [13]:
retrieval_qa = RetrievalQA(
    retriever=docsearch.as_retriever(), combine_documents_chain=final_qa_chain
)

In [22]:
query = "What the most commom word?"

query = """
Generate a summary of the tweets (each tweet is a line in the document retrieved) for each category on few show template.\
Rules:\
- The same tweet cannot appear in more than one category
- Minimum of 2 and Max of 5 tweets per category
- Bring the most relevant tweets, based on:
    - Words related to Energy (Crude Oil, Natural Gas, Electricity)
    - Words related to the Category Name
    - Lenght of the tweet
"""


In [23]:
retrieval_qa.run(query)

'{\n  "answer": "Category: Geopolitics\\n- OPEC meeting on June 4th.\\n- International inspectors report Iran has re-installed monitoring equipment even as its highly-enriched uranium inventory surged 30% last quarter\\n\\nCategory: Global Macro\\n- China Manufacturing PMI fell to 48.8 (lower/weaker than expected). US PMI reported at 48.5 (weaker/lower than expected)\\n- Germany is confirmed to be in recession (two successive quarters of negative Growth). GDP growth Q1 2023 -0.5% Q4 2022 -0.3%\\n\\nCategory: Trade/Reg\\n- #Nigeria Owes Its Energy Company $6 Billion in Fuel-Subsidy Debt NNPC’s CEO welcomed president’s decision to scrap subsidy It cost the state $10 billion to keep fuel cheap in 2022\\n\\nCategory: Merchant\\n- China Russia are discussing potential #coal supply contracts w Russia seeking to strengthen cooperation w China. This comes amid global demand uncertainties increasing competition in the Chinese market. In Oct 2021, Russia signed a memorandum w India re coal suppl