## Load the Data

In [2]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

page_url = "https://www.amazon.com/"

loader = WebBaseLoader(
    web_paths=[page_url],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer("body"),  # Parse the body instead of a specific class
    },
    bs_get_text_kwargs={"separator": " | ", "strip": True},
)

docs = []

async for doc in loader.alazy_load():
    docs.append(doc)

if docs:
    print(f"Metadata: {docs[0].metadata}\n")
    print(f"First 500 chars:\n{docs[0].page_content[:500]}")
else:
    print("No content extracted.")

USER_AGENT environment variable not set, consider setting it to identify your requests.


Metadata: {'source': 'https://www.amazon.com/'}

First 500 chars:
Skip to | Main content | Keyboard shortcuts | Search | alt | + | / | Cart | shift | + | alt | + | C | Home | shift | + | alt | + | H | Orders | shift | + | alt | + | O | Show/Hide shortcuts | shift | + | alt | + | Z | To move between items, use your keyboard's up or down arrows. | .us | Deliver to | Pakistan | All | Select the department you want to search in | All Departments | Arts & Crafts | Automotive | Baby | Beauty & Personal Care | Books | Boys' Fashion | Computers | Deals | Digital Music


## Splitting Data

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk the loaded documents: chunk_size=1000 with chunk_overlap=200 between
# neighbouring chunks; add_start_index=True records each chunk's start offset
# in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
splits = text_splitter.split_documents(docs)

print(f"Split blog post into {len(splits)} sub-documents.")

Split blog post into 7 sub-documents.


## Embedding and Storing

In [6]:
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model served by the local Ollama instance.
embedding = OllamaEmbeddings(model="nomic-embed-text:v1.5")

# Index the chunked `splits` produced by the text splitter, not the raw `docs`.
# Indexing `docs` stores each whole page as a single entry, which makes the
# splitting step a no-op and leaves the retriever nothing fine-grained to rank.
vector = FAISS.from_documents(splits, embedding)

## LLM

In [8]:
from langchain_ollama import ChatOllama

# Chat model served by Ollama. The keyword must be spelled `temperature`;
# the previous misspelling meant the intended setting of 0 was never applied.
llm = ChatOllama(model="deepseek-r1:1.5b", temperature=0)

## Prompt

In [10]:
from langchain.prompts import PromptTemplate

# Answer-generation prompt template. It exposes three placeholders:
# {context}, {chat_history} and {question}. It is turned into a
# PromptTemplate and handed to the conversational retrieval chain through
# `combine_docs_chain_kwargs` when `qa_chain` is built.
advanced_prompt = """
You are a helpful AI assistant answering questions based on the provided context and previous conversation.

Context:
{context}

Chat History:
{chat_history}

Question:
{question}

Helpful Answer:
"""

## Retriever

In [12]:
from langchain.chains import retrieval

# Expose the FAISS store as a retriever; "k": 4 is passed through as the
# search keyword arguments.
retriever = vector.as_retriever(
    search_kwargs=dict(k=4),
)

In [13]:
from langchain.memory import ConversationBufferMemory

# Conversation memory stored under the "chat_history" key, which is the
# placeholder name used by the prompt template; return_messages=True is
# forwarded unchanged.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


## Retrieval QA Chain

In [15]:
from langchain.chains import ConversationalRetrievalChain

# Wire the LLM, retriever and memory into one conversational retrieval chain.
# The custom template is wrapped in a PromptTemplate and passed to the
# document-combining step via `combine_docs_chain_kwargs`.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs=dict(
        prompt=PromptTemplate.from_template(advanced_prompt),
    ),
)

## Run the App

In [17]:
# Shell escape: start the Streamlit app defined in app.py (that file is not
# part of this notebook).
!streamlit run app.py

^C


In [95]:
import bs4
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from langchain.document_loaders import WebBaseLoader

# ---- Load or Build FAISS ----
def build_or_load_vectorstore(url, index_path="faiss_index_new"):
    """Return a FAISS vector store, reusing an index saved on disk when possible.

    The function first tries to load an existing index from ``index_path``.
    If that fails for any ordinary reason (missing directory, unreadable
    files, ...), the page at ``url`` is scraped, split into chunks, embedded
    and indexed, and the new index is saved to ``index_path``.

    Args:
        url: Web page to scrape when the index has to be (re)built.
        index_path: Directory the index is loaded from and saved to.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        ValueError: If the page yields no text chunks to index.

    Note:
        A saved index is returned as-is; it is not checked against ``url``.
        Use a different ``index_path`` (or remove the saved index) to index
        another page.
    """
    embedding = OllamaEmbeddings(model="all-minilm:22m")  # Faster embeddings

    try:
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # pickle-based loading; only load index directories you created.
        vector_store = FAISS.load_local(index_path, embedding, allow_dangerous_deserialization=True)
        print("Loaded FAISS index from disk.")
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate, and report why the
        # cached index could not be used before rebuilding.
        print(f"No usable FAISS index at {index_path!r} ({exc!r}).")
        print("Building FAISS index...")
        loader = WebBaseLoader(web_paths=[url], bs_kwargs={"parse_only": bs4.SoupStrainer("body")})
        docs = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        splits = splitter.split_documents(docs)
        if not splits:
            # Fail with a clear message instead of an opaque error from FAISS.
            raise ValueError(f"No text could be extracted from {url!r}; nothing to index.")
        vector_store = FAISS.from_documents(splits, embedding)
        vector_store.save_local(index_path)
        print("Saved FAISS index to disk.")

    return vector_store

# ---- LLM ----
def create_llm():
    """Create the Ollama chat model used to answer questions.

    Returns:
        A ChatOllama instance for the "tinyllama:1.1b" model with
        temperature 0 and output capped at 300 predicted tokens.
    """
    # ChatOllama's output-length option is `num_predict`; `max_tokens` is not
    # one of its documented parameters, so the 300-token cap was not applied.
    # NOTE(review): `streaming=True` is kept unchanged - confirm that the
    # installed ChatOllama version honours it.
    return ChatOllama(model="tinyllama:1.1b", temperature=0, num_predict=300, streaming=True)

# ---- Prompt ----
def create_advanced_prompt():
    """Build the answer-generation prompt for the conversational retrieval chain.

    Returns:
        A PromptTemplate with the input variables ``context``,
        ``chat_history`` and ``question``.
    """
    # The placeholder is {question}, not {query}: `question` is the key the
    # chain is invoked with and the one its answer step supplies.
    template = """

    You are a helpful AI assistant answering questions based on the provided context and previous conversation.
    Only return the final answer.

    Context:
    {context}

    Chat History:
    {chat_history}

    Question:
    {question}

    Helpful Answer:
"""
    prompt = PromptTemplate(template=template, input_variables=["context", "chat_history", "question"])
    # Return the prompt; without this the function returned None and the
    # custom template never reached the chain.
    return prompt

# ---- QA Chain ----
def create_qa_chain(llm, retriever, prompt):
    """Assemble a ConversationalRetrievalChain from its parts.

    Args:
        llm: Chat model passed to the chain.
        retriever: Retriever passed to the chain.
        prompt: Prompt handed to the document-combining step under the
            "prompt" key of ``combine_docs_chain_kwargs``.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    combine_docs_options = {"prompt": prompt}
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        combine_docs_chain_kwargs=combine_docs_options,
    )
    return chain

# Usage
# Page to index (only scraped when no saved index can be loaded).
url = "https://www.amazon.com"

# Vector store -> retriever; "k": 2 is passed as the search keyword arguments.
vector_store = build_or_load_vectorstore(url=url)
retriever = vector_store.as_retriever(search_kwargs=dict(k=2))

# Model and prompt, then the chain that ties everything together.
llm = create_llm()
prompt = create_advanced_prompt()
qa_chain = create_qa_chain(llm=llm, retriever=retriever, prompt=prompt)


Loaded FAISS index from disk.


In [97]:
# Ask a single question. This chain was built without a memory object, so an
# empty chat history is supplied explicitly.
query = "What is Amazon?"
answer = qa_chain.invoke(dict(question=query, chat_history=[]))
print("Answer:", answer["answer"])

Answer: Amazon is an online retail giant that offers a wide range of products for purchase, including books, electronics, clothing, household goods, and more. It was founded in 1994 by Jeff Bezos and has since grown into one of the largest e-commerce companies in the world with over 200 million active customers globally. Amazon's mission is to be Earth's most customer-centric company, helping people find what they need, getting it right, every time.


In [None]:
# Shell escape: start the Streamlit app defined in app2.py (that file is not
# part of this notebook).
!streamlit run app2.py

In [101]:
# Shell escape: start the Streamlit app defined in app3.py (that file is not
# part of this notebook).
!streamlit run app3.py

^C
