In [1]:
import os
import pprint

from dotenv import load_dotenv

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

In [2]:
load_dotenv()
open_api_key = os.getenv("OPENAI_API_KEY")
if not open_api_key:
    raise ValueError("OPENAI_API_KEY is not set")

In [3]:
loader = TextLoader("data/data.txt", encoding="utf-8")
docs = loader.load()

In [4]:
generative_llm = ChatOpenAI(model="gpt-4o", temperature=0)

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, 
    chunk_overlap=20,
    separators=["\n"]
)   

splits = text_splitter.split_documents(docs)
vectorstore = FAISS.from_documents(
    documents=splits, 
    embedding=OpenAIEmbeddings(model="text-embedding-3-large")
)
retriever = vectorstore.as_retriever()
# retriever=vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={
#                               'score_threshold': 0.5})

model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-2-v2")
compressor = CrossEncoderReranker(model=model, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
splits

[Document(metadata={'source': 'data/data.txt'}, page_content='Azerbaijan Artificial Intelligence Lab (AILAB) was established under the Ministry of Digital Development and Transportation to set a roadmap for the development of artificial intelligence in Azerbaijan. Committed to establishing strong AI alliances within the country and the region. '),
 Document(metadata={'source': 'data/data.txt'}, page_content='\nSpace exploration has advanced significantly over the years, with missions to the Moon, Mars, and beyond. Technologies like reusable rockets and space telescopes are paving the way for future discoveries, deepening our understanding of the universe.'),
 Document(metadata={'source': 'data/data.txt'}, page_content='\nRenewable energy sources, such as solar, wind, and hydroelectric power, are crucial for a sustainable future. By reducing reliance on fossil fuels, renewable energy helps mitigate climate change and provides clean, sustainable power.'),
 Document(metadata={'source': 'd

In [7]:
# Some useful db operations

# vectorstore.index.reset()
# vectorstore.index.ntotal
# vectorstore.index.reconstruct(0)

In [8]:
compression_retriever

ContextualCompressionRetriever(base_compressor=CrossEncoderReranker(model=HuggingFaceCrossEncoder(client=<sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x0000025AB03C5010>, model_name='cross-encoder/ms-marco-MiniLM-L-2-v2', model_kwargs={}), top_n=3), base_retriever=VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000025AD4DD5E80>, search_kwargs={}))

In [9]:
### Contextualize question ###
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [10]:
history_aware_retriever = create_history_aware_retriever(
    generative_llm, compression_retriever, contextualize_q_prompt
)

In [11]:
### Answer question ###
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use ONLY the provided retrieved context to answer the question. \
If the context does not contain relevant information, simply respond with: \
"I don’t know based on the given information." \

Retrieved context: 
{context}"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(generative_llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [12]:
### Statefully manage chat history ###
store = {}
session_id = "chatbot_first_session"

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

In [13]:
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [14]:
store

{}

In [15]:
conversational_rag_chain.invoke(
    {"input": "What causes climate change?"},
    config={
        "configurable": {"session_id": session_id}
    }, 
)["answer"]

'Climate change is caused by long-term shifts in weather patterns and global temperatures. Human activities, particularly the burning of fossil fuels, have contributed to rising temperatures, leading to extreme weather events and environmental degradation.'

In [19]:
conversational_rag_chain.invoke(
    {"input": "What was my last question?"},
    config={
        "configurable": {"session_id": session_id}
        },
        
)["answer"] 

'Your last question was: "what are renewable energy sources?"'

In [138]:
pprint.pprint(store)

{'chatbot_first_session': InMemoryChatMessageHistory(messages=[HumanMessage(content='Under what ministry was AILAB established', additional_kwargs={}, response_metadata={}), AIMessage(content='AILAB was established under the Ministry of Digital Development and Transportation.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What was my last question?', additional_kwargs={}, response_metadata={}), AIMessage(content='Your last question was: "Under what ministry was AILAB established?"', additional_kwargs={}, response_metadata={})])}


In [139]:
for msg in store[session_id].messages:
    print(f"{msg.type.upper()}: {msg.content}")

response = conversational_rag_chain.invoke(
    {"input": "Can you list all questions that I've addressed to you so far?"},
    config={"configurable": {"session_id": session_id}},
)["answer"]


HUMAN: Under what ministry was AILAB established
AI: AILAB was established under the Ministry of Digital Development and Transportation.
HUMAN: What was my last question?
AI: Your last question was: "Under what ministry was AILAB established?"


In [None]:
retriever.invoke(
    input="Do you know anything about AILAB", 
    config={}
)

[Document(id='d533ffb0-1edb-475a-af60-68e9265e4a7b', metadata={'source': 'data/data.txt'}, page_content='Azerbaijan Artificial Intelligence Lab (AILAB) was established under the Ministry of Digital Development and Transportation to set a roadmap for the development of artificial intelligence in Azerbaijan. Committed to establishing strong AI alliances within the country and the region. '),
 Document(id='aca09dbf-584c-4309-9131-f23cab36dbc5', metadata={'source': 'data/data.txt'}, page_content='\nBlockchain technology is a decentralized ledger system that ensures secure, transparent transactions. It is the backbone of cryptocurrencies like Bitcoin and has applications in industries ranging from finance to healthcare.'),
 Document(id='7690c3b7-76b5-4acf-9e68-b315a730d221', metadata={'source': 'data/data.txt'}, page_content='\nQuantum computing harnesses the principles of quantum mechanics to solve problems beyond the capabilities of classical computers. It has the potential to revolutioni

In [24]:
query = "Do you know anything about AILAB"
docs_and_scores = vectorstore.similarity_search_with_score(query, k=4)

for doc, score in docs_and_scores:
    print("----")
    print("Document:\n", doc.page_content)
    print("Metadata:", doc.metadata)
    print("Score:", score)

----
Document:
 Azerbaijan Artificial Intelligence Lab (AILAB) was established under the Ministry of Digital Development and Transportation to set a roadmap for the development of artificial intelligence in Azerbaijan. Committed to establishing strong AI alliances within the country and the region. 
Metadata: {'source': 'data/data.txt'}
Score: 0.88919747
----
Document:
 
Blockchain technology is a decentralized ledger system that ensures secure, transparent transactions. It is the backbone of cryptocurrencies like Bitcoin and has applications in industries ranging from finance to healthcare.
Metadata: {'source': 'data/data.txt'}
Score: 1.7115123
----
Document:
 
Quantum computing harnesses the principles of quantum mechanics to solve problems beyond the capabilities of classical computers. It has the potential to revolutionize fields like cryptography, materials science, and artificial intelligence.
Metadata: {'source': 'data/data.txt'}
Score: 1.7125733
----
Document:
 
Space explorati