In [4]:

'''
This code defines a process for web scraping, text chunking, vectorizing, and creating a 
retrieval-augmented generation (RAG) pipeline using Langchain and FAISS. It scrapes a webpage, 
splits the content into smaller chunks, converts these chunks into vector embeddings, stores 
them in a FAISS vector store, and then creates a chain to retrieve relevant information and 
generate responses to questions based on the retrieved context.
'''




# pip install faiss-cpu
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI


load_dotenv()

def crawl_data(url, chunk_size=200, chunk_overlap=20):
    """Load a web page and split its content into overlapping chunks.

    Args:
        url: Address of the page handed to ``WebBaseLoader``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 200, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20.

    Returns:
        The chunked documents returned by ``splitter.split_documents``.
    """
    loader = WebBaseLoader(url)
    pages = loader.load()

    # The splitter settings are parameters so callers can tune chunking
    # without redefining this function.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)

def vectorization(docs):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        docs: Documents to index (here, the chunks returned by ``crawl_data``).

    Returns:
        The vector store created by ``FAISS.from_documents`` using
        ``OpenAIEmbeddings`` as the embedding model.
    """
    return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())

def create_chain(vectorStore):
    """Assemble a retrieval-augmented generation (RAG) chain.

    Args:
        vectorStore: Vector store whose ``as_retriever()`` supplies the
            documents used as context.

    Returns:
        The runnable built by ``create_retrieval_chain``; it is invoked with
        an ``input`` key and its result is read via the ``answer`` key.
    """
    # Prompt with two slots: the retrieved documents and the user's question.
    rag_prompt = ChatPromptTemplate.from_template('''
            Answer the questions: 
            Context: {context}
            Question: {input}
            ''')

    # "Stuff" chain: places the documents into {context} and calls the LLM.
    answer_chain = create_stuff_documents_chain(
        llm=ChatOpenAI(model='gpt-3.5-turbo'),
        prompt=rag_prompt,
    )

    # No search_kwargs are given, so the retriever returns its default number
    # of results (presumably 4 in LangChain - confirm for your version).
    # Pass as_retriever(search_kwargs={"k": N}) to request N results.
    return create_retrieval_chain(vectorStore.as_retriever(), answer_chain)
    


# Build the pipeline once: scrape and chunk the article, index the chunks,
# then wire the retriever and the LLM together.
docs = crawl_data('https://www.cicnews.com/2024/07/express-entry-1390-candidates-invited-in-latest-pnp-only-draw-0745309.html#gs.bs9dnf')
vectorStore = vectorization(docs)
chain = create_chain(vectorStore)

# Only `input` is supplied. The chain built by create_retrieval_chain fills
# `context` itself with the documents returned by the retriever, so sending
# the full `docs` list under a 'context' key had no effect on the answer.
response = chain.invoke({
    'input': 'What is category-based selection?',
})
print(response)


In [6]:
# Show only the 'answer' entry of `response` (assigned in the cell above).
print(response['answer'])

# Improve the chat using chat history (Conversational ChatBot)

### Showing that our current LangChain approach does not retain the chat history

In [40]:
# pip install faiss-cpu
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI


load_dotenv()

def crawl_data(url, chunk_size=400, chunk_overlap=20):
    """Load a web page and split its content into overlapping chunks.

    Args:
        url: Address of the page handed to ``WebBaseLoader``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 400, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20.

    Returns:
        The chunked documents returned by ``splitter.split_documents``.
    """
    loader = WebBaseLoader(url)
    pages = loader.load()

    # The splitter settings are parameters so callers can tune chunking
    # without redefining this function.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)

def vectorization(docs):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        docs: Documents to index (here, the chunks returned by ``crawl_data``).

    Returns:
        The vector store created by ``FAISS.from_documents`` using
        ``OpenAIEmbeddings`` as the embedding model.
    """
    return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())

def create_chain(vectorStore):
    """Assemble a retrieval-augmented generation (RAG) chain.

    Args:
        vectorStore: Vector store whose ``as_retriever()`` supplies the
            documents used as context.

    Returns:
        The runnable built by ``create_retrieval_chain``; it is invoked with
        an ``input`` key and its result is read via the ``answer`` key.
    """
    # Prompt with two slots: the retrieved documents and the user's question.
    rag_prompt = ChatPromptTemplate.from_template('''
            Answer the questions: 
            Context: {context}
            Question: {input}
            ''')

    # "Stuff" chain: places the documents into {context} and calls the LLM.
    answer_chain = create_stuff_documents_chain(
        llm=ChatOpenAI(model='gpt-3.5-turbo'),
        prompt=rag_prompt,
    )

    # k=3: the retriever is asked for the 3 best-matching chunks per query.
    top_k_retriever = vectorStore.as_retriever(search_kwargs={"k": 3})
    return create_retrieval_chain(top_k_retriever, answer_chain)
    

def chat_pipe(chain, question):
    """Send one question through the chain and return the generated answer.

    Only ``input`` is sent. ``context`` is deliberately not supplied: the
    chain built by ``create_retrieval_chain`` fills it from the retriever,
    and omitting it keeps this function independent of module-level state.

    Args:
        chain: Object exposing ``invoke(dict)`` that returns a mapping with
            an ``'answer'`` entry.
        question: The user's question.

    Returns:
        The value stored under ``'answer'`` in the chain's response.
    """
    result = chain.invoke({'input': question})
    return result['answer']

if __name__ == '__main__':
    # One-time setup: scrape the article, embed its chunks, build the chain.
    docs = crawl_data('https://www.cicnews.com/2024/07/express-entry-1390-candidates-invited-in-latest-pnp-only-draw-0745309.html#gs.bs9dnf')
    vectorStore = vectorization(docs)
    chain = create_chain(vectorStore)

    # Minimal chat loop: typing "exit" (any letter case) ends the session.
    # Each turn is answered independently; no history is passed to the chain.
    while (user_input := input('You:')).lower() != 'exit':
        response = chat_pipe(chain, user_input)
        print('Assistant:', response)




### Adding Chat history 

In [18]:
# pip install faiss-cpu
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI

from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder 

load_dotenv()

def crawl_data(url, chunk_size=400, chunk_overlap=20):
    """Load a web page and split its content into overlapping chunks.

    Args:
        url: Address of the page handed to ``WebBaseLoader``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 400, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20.

    Returns:
        The chunked documents returned by ``splitter.split_documents``.
    """
    loader = WebBaseLoader(url)
    pages = loader.load()

    # The splitter settings are parameters so callers can tune chunking
    # without redefining this function.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)

def vectorization(docs):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        docs: Documents to index (here, the chunks returned by ``crawl_data``).

    Returns:
        The vector store created by ``FAISS.from_documents`` using
        ``OpenAIEmbeddings`` as the embedding model.
    """
    return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())

def create_chain(vectorStore):
    """Assemble a RAG chain whose prompt also carries the chat history.

    Args:
        vectorStore: Vector store whose ``as_retriever()`` supplies the
            documents used as context.

    Returns:
        The runnable built by ``create_retrieval_chain``; it is invoked with
        ``input`` and ``chat_history`` keys and its result is read via the
        ``answer`` key.
    """
    # Message layout: system instruction with the retrieved context, then the
    # previous turns (chat_history), then the new user question.
    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", "Answer the the user question based on the context: {context}"),
        MessagesPlaceholder(variable_name='chat_history'),
        ("human", "{input}"),
    ])

    # "Stuff" chain: places the documents into {context} and calls the LLM.
    answer_chain = create_stuff_documents_chain(
        llm=ChatOpenAI(model='gpt-3.5-turbo'),
        prompt=chat_prompt,
    )

    # k=3: the retriever is asked for the 3 best-matching chunks per query.
    top_k_retriever = vectorStore.as_retriever(search_kwargs={"k": 3})
    return create_retrieval_chain(top_k_retriever, answer_chain)
    

def chat_pipe(chain, question, chat_history):
    """Send one question plus the prior turns through the chain.

    ``context`` is deliberately not supplied: the chain built by
    ``create_retrieval_chain`` fills it from the retriever, and omitting it
    keeps this function independent of module-level state.

    Args:
        chain: Object exposing ``invoke(dict)`` that returns a mapping with
            an ``'answer'`` entry.
        question: The user's latest question.
        chat_history: Previous messages, passed through unchanged under the
            ``'chat_history'`` key.

    Returns:
        The value stored under ``'answer'`` in the chain's response.
    """
    payload = {'input': question, 'chat_history': chat_history}
    return chain.invoke(payload)['answer']

if __name__ == '__main__':
    # One-time setup: scrape the article, embed its chunks, build the chain.
    docs = crawl_data('https://www.cicnews.com/2024/07/express-entry-1390-candidates-invited-in-latest-pnp-only-draw-0745309.html#gs.bs9dnf')
    vectorStore = vectorization(docs)
    chain = create_chain(vectorStore)

    # Running transcript, handed to the chain on every turn.
    chat_history = []

    # Chat loop: typing "exit" (any letter case) ends the session.
    while (user_input := input('You:')).lower() != 'exit':
        response = chat_pipe(chain, user_input, chat_history)

        # Record both sides of the exchange so the next turn can refer to it.
        chat_history.append(HumanMessage(content=user_input))
        chat_history.append(AIMessage(content=response))

        print('Assistant:', response)


# Context Aware Retriever 

In a context-aware retriever, we:

- **Incorporate Context**: Use the broader context, including previous queries and responses, to understand the user's intent better.
- **Enhance Accuracy**: Improve the precision of search results by disambiguating queries and prioritizing relevant information.
- **Leverage LLMs**: Integrate with large language models to interpret context and guide the retrieval process.
- **Maintain Interaction History**: Keep a history of interactions to maintain continuity and coherence across multiple turns.

Unlike basic retrievers that only match queries to documents, context-aware retrievers leverage the surrounding information, such as conversation history or user intent, to fetch more relevant results.


In [42]:
# pip install faiss-cpu
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import create_retrieval_chain
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder 

from langchain.chains.history_aware_retriever import create_history_aware_retriever

load_dotenv()

def crawl_data(url, chunk_size=400, chunk_overlap=20):
    """Load a web page and split its content into overlapping chunks.

    Args:
        url: Address of the page handed to ``WebBaseLoader``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 400, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20.

    Returns:
        The chunked documents returned by ``splitter.split_documents``.
    """
    loader = WebBaseLoader(url)
    pages = loader.load()

    # The splitter settings are parameters so callers can tune chunking
    # without redefining this function.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)

def vectorization(docs):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        docs: Documents to index (here, the chunks returned by ``crawl_data``).

    Returns:
        The vector store created by ``FAISS.from_documents`` using
        ``OpenAIEmbeddings`` as the embedding model.
    """
    return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())

def create_chain(vectorStore):
    """Assemble a RAG chain that uses a history-aware retriever.

    The same LLM is used twice: once to produce a search query from the
    conversation, and once to answer from the retrieved documents.

    Args:
        vectorStore: Vector store whose ``as_retriever()`` supplies the
            documents used as context.

    Returns:
        The runnable built by ``create_retrieval_chain``; it is invoked with
        ``input`` and ``chat_history`` keys and its result is read via the
        ``answer`` key.
    """
    llm_model = ChatOpenAI(model='gpt-3.5-turbo')

    # Answering prompt: system instruction with the retrieved context, then
    # the previous turns, then the new user question.
    answer_prompt = ChatPromptTemplate.from_messages([
        ("system", "Answer the the user question based on the context: {context}"),
        MessagesPlaceholder(variable_name='chat_history'),
        ("human", "{input}"),
    ])
    answer_chain = create_stuff_documents_chain(llm=llm_model, prompt=answer_prompt)

    # Query-rewriting prompt: shows the LLM the conversation and asks it for
    # a search query to run against the vector store.
    query_prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        ("human", "Given above conversation, generate a search query to look up in order to get information relevant to the conversation."),
    ])

    history_aware_retriever = create_history_aware_retriever(
        llm=llm_model,
        # k=3: the retriever is asked for the 3 best-matching chunks per query.
        retriever=vectorStore.as_retriever(search_kwargs={"k": 3}),
        prompt=query_prompt,
    )

    return create_retrieval_chain(history_aware_retriever, answer_chain)
    

def chat_pipe(chain, question, chat_history):
    """Send one question plus the prior turns through the chain.

    ``context`` is deliberately not supplied: the chain built by
    ``create_retrieval_chain`` fills it from the retriever, and omitting it
    keeps this function independent of module-level state.

    Args:
        chain: Object exposing ``invoke(dict)`` that returns a mapping with
            an ``'answer'`` entry.
        question: The user's latest question.
        chat_history: Previous messages, passed through unchanged under the
            ``'chat_history'`` key.

    Returns:
        The value stored under ``'answer'`` in the chain's response.
    """
    payload = {'input': question, 'chat_history': chat_history}
    return chain.invoke(payload)['answer']

if __name__ == '__main__':
    # One-time setup: scrape the article, embed its chunks, build the chain.
    docs = crawl_data('https://www.cicnews.com/2024/07/express-entry-1390-candidates-invited-in-latest-pnp-only-draw-0745309.html#gs.bs9dnf')
    vectorStore = vectorization(docs)
    chain = create_chain(vectorStore)

    # Running transcript, handed to the chain on every turn.
    chat_history = []

    # Chat loop: typing "exit" (any letter case) ends the session.
    while (user_input := input('You:')).lower() != 'exit':
        response = chat_pipe(chain, user_input, chat_history)

        # Record both sides of the exchange so the next turn can refer to it.
        chat_history.append(HumanMessage(content=user_input))
        chat_history.append(AIMessage(content=response))

        print('Assistant:', response)


## Create an AI agent using LangChain

In [None]:
### Agents: we can give them an instruction and they will determine which actions to take and in which sequence

In [45]:

'''
This code creates an AI agent using the GPT-3.5-turbo model, which integrates a web search tool (TavilySearchResults) 
to answer user questions based on the most current data available on the web. 
'''


from dotenv import load_dotenv
load_dotenv()

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain.agents import create_openai_functions_agent, AgentExecutor

#create_openai_functions_agent: create the definition of an agent
#AgentExecutor: To invoke our agent
#Different types of Agents in LangChain: https://python.langchain.com/v0.1/docs/modules/agents/agent_types/

from langchain_community.tools.tavily_search import TavilySearchResults

'''
TavilySearchResults is a tool within the LangChain ecosystem designed to integrate with 
search functionalities. It typically allows an agent to perform web searches and retrieve 
results that can be used to answer questions or provide information based on 
the most current data available on the web.
'''

# `MessagesPlaceholder` is used in the prompt below but was not imported in
# this cell, so the cell only ran when an earlier cell had already imported it.
from langchain_core.prompts import MessagesPlaceholder

model = ChatOpenAI(model='gpt-3.5-turbo')

# Prompt layout: system instruction, the user's question, then the
# `agent_scratchpad` slot that create_openai_functions_agent expects in its
# prompt (presumably for the agent's intermediate tool calls and results -
# confirm in the LangChain agent docs).
prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the the user question."),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name='agent_scratchpad'),
])

# The only tool offered to the agent is Tavily web search.
search = TavilySearchResults()
tools = [search]

# The agent decides which tool to call; the executor runs it.
agent = create_openai_functions_agent(
    llm=model,
    prompt=prompt,
    tools=tools,
)

agentExecutor = AgentExecutor(
    agent=agent,
    tools=tools,
)

response = agentExecutor.invoke({
    "input": "What is current weather in London, Ontario, Canada"})

print(response)


# Agent with History (Context-aware)

In [None]:

from dotenv import load_dotenv
load_dotenv()

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain.agents import create_openai_functions_agent, AgentExecutor
#create_openai_functions_agent: create the definition of an agent
#AgentExecutor: To invoke our agent
#Different types of Agents in LangChain: https://python.langchain.com/v0.1/docs/modules/agents/agent_types/

from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder


model = ChatOpenAI(model='gpt-4o-mini')

# Prompt layout, in order:
#   1. system instruction
#   2. `chat_history`     - earlier turns of the conversation, inserted dynamically
#   3. the user's new question
#   4. `agent_scratchpad` - slot expected by create_openai_functions_agent
#      (presumably the agent's intermediate tool calls/results, not the
#      conversation history - confirm in the LangChain agent docs)
prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the the user question."),
    MessagesPlaceholder(variable_name='chat_history'),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name='agent_scratchpad'),
])

# The only tool offered to the agent is Tavily web search.
search = TavilySearchResults()
tools = [search]

# The agent decides which tool to call; the executor runs it.
agent = create_openai_functions_agent(llm=model, prompt=prompt, tools=tools)
agentExecutor = AgentExecutor(agent=agent, tools=tools)

def chat_pipe(agentExecutor, user_input, chat_history):
    """Run one conversational turn through the agent executor.

    Args:
        agentExecutor: Object exposing ``invoke(dict)`` that returns a
            mapping with an ``'output'`` entry.
        user_input: The user's latest message.
        chat_history: Previous messages, passed through unchanged under the
            ``"chat_history"`` key.

    Returns:
        The value stored under ``'output'`` in the executor's response.
    """
    payload = {"input": user_input, "chat_history": chat_history}
    result = agentExecutor.invoke(payload)
    return result['output']


if __name__ == '__main__':
    # Running transcript, handed to the agent on every turn.
    chat_history = []

    # Chat loop: typing "exit" (any letter case) ends the session.
    while (user_input := input('You:')).lower() != 'exit':
        response = chat_pipe(agentExecutor, user_input, chat_history)

        # Record both sides of the exchange so the next turn can refer to it.
        chat_history.append(HumanMessage(content=user_input))
        chat_history.append(AIMessage(content=response))

        print('Assistant:', response)