In [None]:
!pip install requests
!pip install html5lib
!pip install bs4
# !pip install langchain_community

In [None]:
!pip install --upgrade langchain

In [None]:
!pip install unstructured

In [None]:
!pip install tiktoken

## Library imports

In [None]:
import requests
import sys
import os
from bs4 import BeautifulSoup
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from urllib.parse import urljoin, urlparse
from langchain.document_loaders import ReadTheDocsLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.agents.mrkl.base import ZeroShotAgent
from langchain.agents import Tool
from langchain.agents.agent import AgentExecutor

In [None]:
import sys
sys.path.insert(0, "../..")
from src.aws_utils import get_secrets
from langchain.llms import OpenAI
from src.prompts.prompt import REACT_PROMPT
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain import PromptTemplate, LLMChain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Fetch credentials through the project helper imported from src.aws_utils.
# NOTE(review): presumably returns a dict-like object keyed by secret name — confirm.
secrets = get_secrets()
# OPENAI_MODEL = "gpt-3.5-turbo-instruct"
# NOTE(review): `OpenAI` here is the LangChain LLM wrapper imported above, not the
# `openai` package; confirm this class-attribute assignment is actually needed.
OpenAI.api_key = secrets["OPENAI_API_KEY"]
# Export the key as an environment variable for the rest of the notebook.
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]

In [None]:
base_url = "https://procogia.com/"

# Quick connectivity check against the home page. The timeout keeps the cell
# from hanging forever on a stalled connection, and raise_for_status() makes an
# HTTP error visible here instead of silently parsing an error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html5lib")

## Crawl all pages

In [None]:
def get_links(url, timeout=30):
    """Download `url` and return the `href` of every `<a>` tag on the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A list of href strings exactly as written in the page, so entries may
        be relative paths, fragments, or non-HTTP schemes such as `mailto:`.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

def crawl_website(base_url, max_depth=3):
    """Depth-first crawl starting at `base_url`, printing each visited URL.

    This version has no same-domain restriction: every http/https link found
    on a page is followed, whichever host it points to.

    Args:
        base_url: URL where the crawl starts.
        max_depth: Maximum link distance from `base_url` that is still visited.

    Returns:
        None. Visited URLs are printed as the crawl progresses.
    """
    visited_urls = set()

    def recursive_crawl(url, depth):
        """Visit `url` (if new and within depth) and recurse into its links."""
        if depth > max_depth or url in visited_urls:
            return

        print(f"Crawling: {url}")
        visited_urls.add(url)

        try:
            links = get_links(url)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"Skipping {url}: {exc}")
            return

        for link in links:
            # Relative hrefs are relative to the page they appear on, so they
            # must be resolved against `url`, not against the crawl's start URL.
            # The fragment is dropped because "#section" variants are the same page.
            absolute_url = urlparse(urljoin(url, link))._replace(fragment='').geturl()
            if urlparse(absolute_url).scheme in ('http', 'https'):
                recursive_crawl(absolute_url, depth + 1)

    recursive_crawl(base_url, 0)


In [None]:
# Crawl up to three links deep from the ProCogia home page. The crawl_website
# defined in this section has no same-domain check, so links to other sites are followed too.
crawl_website('https://procogia.com/', max_depth=3)

## Crawl only ProCogia Pages

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def get_links(url, timeout=30):
    """Download `url` and return the `href` of every `<a>` tag on the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A list of href strings exactly as written in the page, so entries may
        be relative paths, fragments, or non-HTTP schemes such as `mailto:`.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

def crawl_website(base_url, max_depth=3, output_folder='crawled_pages', timeout=30):
    """Crawl pages on `base_url`'s host and save each page's HTML to disk.

    Starting from `base_url`, follows `<a href>` links depth-first, staying on
    URLs whose network location equals that of `base_url`. Each fetched page is
    written to `output_folder` as ``<URL path with '/' replaced by '_'>.html``.

    Args:
        base_url: URL where the crawl starts; also defines the allowed host.
        max_depth: Maximum link distance from `base_url` that is still fetched.
        output_folder: Directory for the saved pages; created if missing.
        timeout: Seconds to wait for each HTTP response before skipping the page.

    Returns:
        None. Progress is printed and pages are written to `output_folder`.
    """
    os.makedirs(output_folder, exist_ok=True)

    visited_urls = set()
    base_netloc = urlparse(base_url).netloc

    def is_same_domain(url):
        """Return True when `url` has the same network location as `base_url`."""
        return urlparse(url).netloc == base_netloc

    def save_page(url, content):
        """Write `content` to a file named after the path component of `url`."""
        # A URL with an empty path (e.g. "https://host") would otherwise be saved
        # as ".html", a hidden dot-file.
        # NOTE(review): DirectoryLoader's default glob appears to skip dot-files,
        # which would drop the home page from the index — confirm.
        page_name = urlparse(url).path.replace('/', '_') or 'index'
        filename = os.path.join(output_folder, f"{page_name}.html")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(content)

    def recursive_crawl(url, depth):
        """Fetch, save and recurse into `url` if it is new, in scope and within depth."""
        if depth > max_depth or not is_same_domain(url):
            return

        if url in visited_urls:
            return

        print(f"Crawling: {url}")
        visited_urls.add(url)

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Broken links and timeouts are skipped so one bad page cannot
            # abort the crawl or end up stored as if it were real content.
            print(f"Skipping {url}: {exc}")
            return

        content_type = response.headers.get('Content-Type', '')
        if content_type and 'html' not in content_type.lower():
            # PDFs, images, etc. would be written out as garbled ".html" text.
            print(f"Skipping non-HTML content at {url} ({content_type})")
            return

        content = response.text
        save_page(url, content)

        # Extract links from the HTML that was just downloaded instead of
        # requesting the same page a second time.
        soup = BeautifulSoup(content, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            # Relative hrefs are relative to the current page, not to the start
            # URL; the fragment is dropped because it names the same document.
            absolute_url = urlparse(urljoin(url, anchor['href']))._replace(fragment='').geturl()
            if is_same_domain(absolute_url):
                recursive_crawl(absolute_url, depth + 1)

    recursive_crawl(base_url, 0)
    
# Crawl procogia.com up to three links deep; pages are written to the
# function's default output folder, 'crawled_pages'.
crawl_website('https://procogia.com', max_depth=3)


## Retrieval and Embedding

In [None]:
from langchain_community.document_loaders import DirectoryLoader
# Point the loader at the folder the same-domain crawler writes to by default.
loader = DirectoryLoader("crawled_pages/")

In [None]:
# Load the crawled pages and report how many documents came back.
raw_documents = loader.load()
print(f"loaded {len(raw_documents) } documents")

In [None]:
# Break the loaded pages into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)
documents = text_splitter.split_documents(documents=raw_documents)
print(f"Splitted into {len(documents)} chunks")

In [None]:
# Model identifier handed to SentenceTransformerEmbeddings below.
EMBEDDING_MODEL_PATH = "intfloat/e5-base-v2"

# Get embedding model
# NOTE(review): the e5 model family is documented as expecting "query: " /
# "passage: " prefixes on its inputs; none are added here — confirm whether
# that affects retrieval quality.
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_PATH)

# Use FAISS to create a database from the embeddings, with the original text
docsearch = FAISS.from_documents(documents, embeddings)

## LLM Chain

In [None]:
# Answer prompt for the retrieval chain. {context} is filled with the retrieved
# chunks and {question} with the user's question; the document-combining step
# requires the {context} variable to be present in the prompt.
template = """You are Chatbot, an assistant that offers support for answering questions related to ProCogia. 
Chatbot should assume that any query from the User will be regarding ProCogia.com, and so this information does not need to be added.
If Chatbot thinks that the Question is completely irrelevant, then state that you are unable to answer that User query in a reasonable and friendly manner. 
You are a support bot for ProCogia first and foremost. 
Context: {context}
Input: {question}
Output: 
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")
# Buffer memory stores the running conversation under the 'chat_history' key.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# The prompt has to be passed through combine_docs_chain_kwargs; previously it
# was built but never given to the chain, so the library's default answer
# prompt was used and the ProCogia instructions above had no effect.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=docsearch.as_retriever(),
    verbose=True,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt},
)


In [None]:
# Module-level retriever over the FAISS index; qa_chain_latest reads this global.
retriever=docsearch.as_retriever()

In [None]:
def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line."""
    page_texts = []
    for document in docs:
        page_texts.append(document.page_content)
    blank_line = "\n\n"
    return blank_line.join(page_texts)

## Chain with Guardrail

In [None]:

def qa_chain_latest(question):
    """Answer `question` with the guard-railed retrieval chain and return the reply text.

    The chain is rebuilt on every call from the module-level `llm`, `retriever`
    and `format_docs`. The conversation so far is read from the module-level
    `chat_history` list; this function never appends to it.

    Args:
        question: The user's question as a string.

    Returns:
        The `content` attribute of the message produced by `llm`.
    """
    qa_system_prompt = """You are Chatbot, an assistant that offers support for answering questions related to ProCogia. \
                        Chatbot should assume that any query from the User will be regarding ProCogia.com, and so this information does not need to be added.\
                        If Chatbot thinks that the Question is completely irrelevant, then state that you are unable to answer that User query in a reasonable and friendly manner. \
                        You are a support bot for ProCogia first and foremost. \
                        Use the following pieces of retrieved context to answer the question. \
                        Do not answer any question, mathematical equation or anything irrelevant to ProCogia.\
                        If you don't know the answer, just say that you don't know as you are a support bot for ProCogia. \
                        Use three sentences maximum and keep the answer concise.\
                        Input: {context}
                        Output: \
                        """

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{question}"),
        ]
    )

    # Rewriting a follow-up into a standalone question needs its own prompt.
    # Reusing qa_prompt here fails as soon as chat_history is non-empty, because
    # qa_prompt requires {context} and no context has been retrieved yet.
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question which might "
        "reference context in the chat history, formulate a standalone "
        "question which can be understood without the chat history. "
        "Do NOT answer the question, just reformulate it if needed and "
        "otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{question}"),
        ]
    )
    contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

    def contextualized_question(inputs: dict):
        """Pick what is sent to the retriever.

        With prior history, return the rewriting chain; otherwise return the
        question unchanged.
        """
        if inputs.get("chat_history"):
            return contextualize_q_chain
        return inputs["question"]

    rag_chain = (
        RunnablePassthrough.assign(
            context=contextualized_question | retriever | format_docs
        )
        | qa_prompt
        | llm
    )

    ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
    return ai_msg.content

In [None]:
chat_history = []

question = "What is Task Decomposition?"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
# chat_history.extend([HumanMessage(content=question), ai_msg])

# second_question = "What are common ways of doing it?"
# rag_chain.invoke({"question": second_question, "chat_history": chat_history})

In [None]:
ai_msg.content

## Front End Gradio

In [None]:
import gradio as gr

In [None]:
def q_a_func(query):
    """Answer `query` with a freshly built ConversationalRetrievalChain over `docsearch`.

    A new LLM client, memory and chain are created on every call, so no
    conversation history carries over from one call to the next.

    Args:
        query: The user's question as a string.

    Returns:
        The chain's answer text, which is also printed.
    """
    # {context} is filled with the retrieved chunks, {question} with the query.
    template = (
        "You are Chatbot, an assistant that offers support for answering questions related to ProCogia. \n"
        "Chatbot should assume that any query from the User will be regarding ProCogia.com, and so this information does not need to be added.\n"
        "If Chatbot thinks that the Question is completely irrelevant, then state that you are unable to answer that User query in a reasonable and friendly manner. \n"
        "You are a support bot for ProCogia first and foremost. \n"
        "Context: {context}\n"
        "Input: {question}\n"
        "Output: \n"
    )
    # Previously the template was defined but never handed to the chain, so the
    # ProCogia instructions were dead code.
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=docsearch.as_retriever(),
        verbose=True,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": prompt},
    )
    # .invoke replaces the deprecated direct call on the chain object.
    result = conversation_chain.invoke({"question": query})
    answer = result["answer"]
    print(answer)
    return answer

In [None]:
import gradio as gr
import random
import time

# Chat UI backed by q_a_func: a chat window, a question box and a clear button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="ProCogia Chatbot", height=800)
    msg = gr.Textbox(label="Question")
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Handle a submitted question.

        Appends the (question, answer) pair to the chat window's history and
        returns an empty string to clear the textbox together with the
        updated history.
        """
        # Do not spend an LLM call on an empty or whitespace-only submission.
        if not message or not message.strip():
            return "", chat_history
        bot_message = q_a_func(message)
        chat_history.append((message, bot_message))
        # The former time.sleep(2) only simulated latency and has been removed;
        # the LLM call already takes real time.
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    demo.launch()


## Front End With Guardrail

In [None]:
import gradio as gr
import random
import time
# Chat UI backed by the guard-railed chain (qa_chain_latest).
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="ProCogia Chatbot", height=800)
    msg = gr.Textbox(label="Question")
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Handle a submitted question.

        Appends the (question, answer) pair to the chat window's history and
        returns an empty string to clear the textbox together with the
        updated history.
        """
        # Do not spend an LLM call on an empty or whitespace-only submission.
        if not message or not message.strip():
            return "", chat_history
        bot_message = qa_chain_latest(message)
        chat_history.append((message, bot_message))
        # The former time.sleep(2) only simulated latency and has been removed;
        # the LLM call already takes real time.
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    demo.launch()


In [None]:
# Close the Gradio app currently bound to `demo` (whichever UI cell ran last).
demo.close()

In [None]:
# inputs = gr.components.Textbox(label="Enter question:")
# answer_box = gr.components.Textbox(label="Answer")

# # Create the Gradio interface
# demo = gr.Interface(
#     fn=q_a_func,
#     inputs="text",
#     outputs=[answer_box],
#     title="ProCogia Chatbot",
#     description="Enter question",
# ).launch(share=True)

## Example runs

In [None]:
query = "Who is Giselle Bagatini?"
result = conversation_chain({"question": query})
answer = result["answer"]
answer

In [None]:
query = "What does ProCogia do?"
result = conversation_chain({"question": query})
answer = result["answer"]
answer

In [None]:
query = "Who is the Founder of ProCogia?"
result = conversation_chain({"question": query})
answer = result["answer"]
answer

In [None]:
query = "What are the different solutions provided by ProCogia?"
result = conversation_chain({"question": query})
answer = result["answer"]
answer