# Web Page Summarizer

## Load the document

In [1]:
# !pip install langchain\
#     langchain_community==0.0.20\
        

In [2]:
from langchain_community.document_loaders import WebBaseLoader

In [3]:
# URL of the web page handed to the loader below.
# The commented-out `dataset_url` values are alternative sources that are currently unused.
# dataset_url = "https://raw.githubusercontent.com/iamnaofil/E-commerce-Sales-Analysis/main/Sales%20Data%20Analysis.csv"
# dataset_url = "https://domo-support.domo.com/s/article/360043931814?language=en_US"
url = "https://www.chittorgarh.com/report/ipo-performance-report-listing-current-gain/125/all/"


In [4]:
# Create the loader for the target page.
web_loader = WebBaseLoader(url)

In [5]:
# To bypass SSL verification errors during fetching, you can set the "verify" option:

## Uncomment to bypass SSL verification
# web_loader.requests_kwargs = {'verify':False}

In [6]:
# Load the page content; the result is passed to the text splitter below.
doc = web_loader.load()

## Chunking the document

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Splitter configured for chunks of size 2000 that overlap by 200, so text
# cut at a chunk boundary still appears in both neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

In [9]:
# Break the loaded document(s) into chunks using the splitter configured above.
split_docs = splitter.split_documents(doc)

In [10]:
# Echo the chunks as the cell output for a quick visual check.
split_docs

[Document(page_content='IPO Performance Analysis 2024 - Listing Day and Post-IPO\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nKnow More \n\n\n\n\n\n\n\n\nZerodha (Trading & Demat Account)\n\n\n\n\nFREE Equity Delivery and MF\nFlat ₹20/trade Intra-day/F&O\nOpen Instant Account\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nUnlimited Trading @ ₹899 per month\n\n\nPS www.ProStocks.com\n\n\nFREE Account Opening + No Clearing Fees\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIPO \nSME IPO \nBROKER REVIEWS \nSTOCK MARKET \nNRI \nCITY INFO \n\n\n\n\n\n\n\n\n\n\n\n×\n\n\nCurrent IPO\nIPO Dashboard\nCurrent Mainline IPO\nMainboard IPO Timetable\nIPO Calendar\nPerformance Tracker\nIPO Grey Market\n\n\nLive IPO Information\nIPO Subscription\nIPO Ratings\nIPO Reviews\nIPO Listing Date\nIPO Allotment\nBasis of Allotment\n\n\nLearn about IPO\nIPO Reports (Historic)\neBook - IPO Guide \nIPO FAQs\nIPO Articles\nIPO Message Board\n\n\nNCD Issues\nCurrent NCD Issues\nNCD Reviews\nNCD Subscription (L

## Embeddings

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_openai import AzureOpenAIEmbeddings
from dotenv import load_dotenv
import os

# Read settings from a local .env file so the os.getenv() lookups below can see them.
load_dotenv()

In [None]:
# Settings for the embeddings client; each value is None when the variable is unset.
OPENAI_EMBEDDING_MODEL = os.environ.get("OPENAI_EMBEDDING_MODEL")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_ENDPOINT = os.environ.get("OPENAI_ENDPOINT")

### Embedding Model

In [None]:
# Azure OpenAI embeddings client built from the settings read from the environment.
embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=OPENAI_ENDPOINT,
    api_key=OPENAI_API_KEY,
    model=OPENAI_EMBEDDING_MODEL,
    disallowed_special=(),
)
embedding_model  # echo the configured client as the cell output

In [None]:
# Route outbound HTTP(S) traffic of this process through the corporate proxy.
# NOTE(review): this cell runs after `embedding_model` was constructed above;
# confirm that the already-built client still honours these variables.
os.environ.update(
    {
        "HTTPS_PROXY": "blrproxy.ad.infosys.com:443",
        "HTTP_PROXY": "blrproxy.ad.infosys.com:443",
    }
)

### Retriever

In [None]:
## create db
# vector_store = FAISS.from_documents(split_docs, embedding_model)
# vector_store.save_local("./vector_store")

In [None]:
# Reuse the persisted FAISS index when it exists; otherwise build it from the
# chunked documents and persist it. This lets a fresh checkout run top to
# bottom instead of failing on a missing "./vector_store" directory.
# NOTE(review): a previously saved index is reused as-is, so delete the
# directory after changing `url` or the splitter settings to force a rebuild.
# NOTE(review): newer langchain_community releases may require an explicit
# opt-in flag on load_local() for pickle-backed stores; check the installed version.
vector_store_dir = "./vector_store"
if os.path.isdir(vector_store_dir):
    vector_store = FAISS.load_local(vector_store_dir, embedding_model)
else:
    vector_store = FAISS.from_documents(split_docs, embedding_model)
    vector_store.save_local(vector_store_dir)

In [None]:
# Retriever over the vector store: similarity search with k set to 10.
qa_retriver = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})

## Chain

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI

In [None]:
# Settings for the chat model; each value is None when the variable is unset.
# The key and endpoint are re-read here so this cell can run on its own.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_ENDPOINT = os.environ.get("OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")

In [None]:
# Azure OpenAI chat model used by the QA chain, configured with temperature 0.
# (A `model_kwargs={"top_p": 1}` override was tried here and is left disabled.)
model = AzureChatOpenAI(
    azure_endpoint=OPENAI_ENDPOINT,
    azure_deployment=OPENAI_DEPLOYMENT,
    api_version=OPENAI_API_VERSION,
    api_key=OPENAI_API_KEY,
    model=OPENAI_MODEL,
    temperature=0,
)
model  # echo the configured model as the cell output

## Prompt Engineering

In [None]:
from langchain_core.prompts.prompt import PromptTemplate

### Chain Type

In [None]:
## Prompt for chain type "stuff": one prompt that takes the context and the question together.

stuff_prompt_template = """
Return the your answer of the following question using the given context.

context : {context}
question : {question}
answer : 
"""

# Both placeholders used in the template are declared as inputs.
stuff_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=stuff_prompt_template,
)

In [None]:
## Prompts for chain type "map_reduce" (Map Reduce Documents)

### 1. Map prompt: applied to each batch of documents independently.
question_prompt_template = """
Return the answer to the question using the context of information provided below.:

text: {context}
question : {question}
Answer :
"""
# The template references both {context} and {question}, so both are declared.
# Declaring only "context" is rejected by PromptTemplate versions that validate
# the template against input_variables.
question_prompt = PromptTemplate(
    template=question_prompt_template,
    input_variables=["context", "question"],
)

### 2. Combine prompt: applied to the mapped results to produce the final output.
combine_prompt_template = """
Generate a summary of the following context.

context: {context}
"""
# Only {context} appears in this template, so it is the sole declared input.
combine_prompt = PromptTemplate(
    template=combine_prompt_template,
    input_variables=["context"],
)

In [None]:
# Choose how retrieved documents are combined and assemble the keyword
# arguments passed to the chain as `chain_type_kwargs`.
my_chain_type = "map_reduce"

if my_chain_type == "stuff":
    my_chain_type_kwargs = {
        "prompt": stuff_prompt,
    }
elif my_chain_type == "map_reduce":
    my_chain_type_kwargs = {
        "question_prompt": question_prompt,
        "combine_prompt": combine_prompt,
        # combine_prompt's template names its documents placeholder "context".
        "combine_document_variable_name": "context",
        "verbose": True,
    }
else:
    # Fail here with a clear message rather than with a NameError on
    # `my_chain_type_kwargs` when the chain is built.
    raise ValueError(
        f"Unsupported chain type {my_chain_type!r}; expected 'stuff' or 'map_reduce'."
    )

In [None]:
# Wire the chat model, the retriever and the selected chain type into one
# RetrievalQA chain. Source documents are requested alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type=my_chain_type,
    chain_type_kwargs=my_chain_type_kwargs,
    retriever=qa_retriver,
    return_source_documents=True,
    verbose=True,
)

In [None]:
# Question put to the chain about the loaded page.
query = "Which IPO is performing better in the market?"

In [None]:
# Run the chain on the query; the answer is read from res["result"] below.
res = chain.invoke({'query':query})

In [None]:
# Show only the generated answer text.
print(res["result"])