### Loading the docs from the website


In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import logging
import warnings
from scrapy.utils.deprecate import ScrapyDeprecationWarning

# Configure root logging at WARNING level and ignore Scrapy's deprecation warnings.
logging.basicConfig(level=logging.WARNING)
warnings.filterwarnings("ignore", category=ScrapyDeprecationWarning)

# Module-level accumulator: the spider's parse() appends one dict per page here.
scraped_data = list()
class ImmersifySpider(scrapy.Spider):
    """Spider that records the title, headings and paragraph text of immersify.com.

    Only the URLs listed in ``start_urls`` are parsed; ``parse`` does not
    yield any follow-up requests.
    """

    name = "immersify"
    allowed_domains = ["immersify.com"]
    start_urls = ["https://immersify.com"]

    def parse(self, response):
        """Extract the text fields of *response* and append them to ``scraped_data``."""
        title = response.css('title::text').get()
        headings = response.css('h1::text, h2::text, h3::text').getall()
        paragraphs = response.css('p::text').getall()

        record = {
            "url": response.url,
            "title": title,
            "headings": headings,
            "paragraphs": paragraphs,
        }
        scraped_data.append(record)

        # Echo progress and the extracted record for inspection.
        print(f"Scraped data from {response.url}")
        print(record)


# Run the spider in this process; start() returns once the crawl has finished.
process = CrawlerProcess()
process.crawl(ImmersifySpider)
process.start()

# Dump everything that was collected, followed by a one-line summary.
print("Final Scraped Data:", scraped_data, sep="\n")
print(f"Scraped {len(scraped_data)} pages.")

### Convert to LangChain Documents


In [None]:
from langchain.schema import Document


def convert_to_documents(scraped_data):
    """Convert scraped page dicts into LangChain ``Document`` objects.

    Args:
        scraped_data: Iterable of dicts with ``url``, ``title``, ``headings``
            (list of str) and ``paragraphs`` (list of str) keys, as built by
            the spider's ``parse`` method.

    Returns:
        A list with one ``Document`` per page. The page content is the
        non-blank heading and paragraph texts, stripped and joined with
        newlines; the metadata carries the page ``url`` and ``title``.
    """
    documents = []
    for page in scraped_data:
        # ``::text`` extraction can return whitespace-only fragments; drop
        # them so they do not dilute the text that gets embedded.
        fragments = [
            text.strip()
            for text in [*page['headings'], *page['paragraphs']]
            if text.strip()
        ]
        metadata = {
            "url": page['url'],
            # The title is None when the page had no <title>; vector stores
            # such as Chroma may reject None metadata values, so use "".
            "title": page['title'] or "",
        }
        documents.append(
            Document(page_content="\n".join(fragments), metadata=metadata)
        )
    return documents


# Build one Document per scraped page and report how many were produced.
documents = convert_to_documents(scraped_data=scraped_data)
print(f"Converted {len(documents)} pages into documents.")

### Convert to vectors (embed and store in Chroma)

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.ollama import OllamaEmbeddings

def store_in_chroma(documents, model="llama3.2", persist_directory="./chroma_db"):
    """Embed *documents* with an Ollama model and persist them in a Chroma store.

    Args:
        documents: LangChain documents to embed and index.
        model: Name of the Ollama model used to compute the embeddings.
            NOTE: a store reopened later should use the same embedding
            model, otherwise query vectors will not match the stored ones.
        persist_directory: Directory in which Chroma keeps its data.

    Returns:
        The Chroma vector store built from the documents.
    """
    embeddings = OllamaEmbeddings(model=model, show_progress=True)
    return Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )


# Rebuild the documents from the scraped pages, then embed and persist them.
documents = convert_to_documents(scraped_data)
vectorstore = store_in_chroma(documents=documents)
print("Documents stored in Chroma.")

In [None]:
# Smoke-test retrieval: fetch the stored documents closest to a sample question.
question = "What is Immersify?"
docs = vectorstore.similarity_search(query=question)
print(docs)

In [None]:
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Retriever over the Chroma store; it supplies the context documents.
retriever = vectorstore.as_retriever()

# Ollama model that generates the final answer.
llm = Ollama(model="llama3.2")


def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Prompt template pulled from the LangChain hub.
rag_prompt = hub.pull("rlm/rag-prompt")

# The incoming question is passed through unchanged and is also sent to the
# retriever, whose documents are flattened into the prompt's context text.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
qa_chain = chain_inputs | rag_prompt | llm | StrOutputParser()

### Query

In [None]:
# Ask the chain a sample question; the call is left as a bare expression so a
# notebook cell shows the returned answer.
question = "What is Immersify?"
qa_chain.invoke(input=question)

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Reopen the persisted Chroma store with the same embedding model name that
# was used when the documents were indexed.
embeddings = OllamaEmbeddings(model="llama3.2")
vectorstore = Chroma(embedding_function=embeddings, persist_directory="./chroma_db")

# Retriever over the reopened store, and the Ollama model that writes answers.
retriever = vectorstore.as_retriever()
llm = Ollama(model="llama3.2")

def format_docs(docs):
    """Concatenate the text of the retrieved documents, one blank line apart."""
    separator = "\n\n"
    return separator.join([item.page_content for item in docs])

# Prompt template pulled from the LangChain hub.
rag_prompt = hub.pull("rlm/rag-prompt")

# First stage: retrieve and flatten context for the question, and forward the
# question itself untouched. The prompt, model and string parser follow.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
qa_chain = prompt_inputs | rag_prompt | llm | StrOutputParser()

# Interactive question loop. Type "exit" to stop the loop.
while True:
    try:
        # Strip so that input such as "exit " or " Exit" is still recognised.
        question = input("Question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt ends the session like "exit" would,
        # instead of terminating with a traceback.
        print()
        break
    if not question:
        # Nothing was typed; do not spend a retrieval + LLM call on it.
        continue
    if question.lower() == "exit":
        break
    answer = qa_chain.invoke(question)

    print(f"\nAnswer: {answer}\n")



