### Loading the docs from Website


In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import logging
import warnings
from scrapy.utils.deprecate import ScrapyDeprecationWarning

warnings.filterwarnings("ignore", category=ScrapyDeprecationWarning)
logging.basicConfig(level=logging.WARNING)

scraped_data = []
class ImmersifySpider(scrapy.Spider):
    name = "immersify"
    allowed_domains = ["immersify.com"]
    start_urls = [
        "https://immersify.com"    
    ]

    def parse(self, response):
        
        page_data = {
            "url": response.url,
            "title": response.css('title::text').get(),
            "headings": response.css('h1::text, h2::text, h3::text').getall(),
            "paragraphs": response.css('p::text').getall(),
        }

        
        scraped_data.append(page_data)

        
        print(f"Scraped data from {response.url}")
        print(page_data)


process = CrawlerProcess()
process.crawl(ImmersifySpider)
process.start()


print("Final Scraped Data:")
print(scraped_data)


print(f"Scraped {len(scraped_data)} pages.")

INFO:scrapy.utils.log:Scrapy 2.11.1 started (bot: scrapybot)
2025-07-08 12:51:18 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 5.2.1.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.8.16 | packaged by conda-forge | (default, Feb  1 2023, 16:01:13) - [Clang 14.0.6 ], pyOpenSSL 23.2.0 (OpenSSL 3.3.1 4 Jun 2024), cryptography 41.0.3, Platform macOS-15.5-arm64-arm-64bit
2025-07-08 12:51:18 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.8.16 | packaged by conda-forge | (default, Feb  1 2023, 16:01:13) - [Clang 14.0.6 ], pyOpenSSL 23.2.0 (OpenSSL 3.3.1 4 Jun 2024), cryptography 41.0.3, Platform macOS-15.5-arm64-arm-64bit
INFO:scrapy.addons:Enabled addons:
[]
2025-07-08 12:51:18 [scrapy.addons] INFO: Enabled addons:
[]
DEBUG:scrapy.utils.log:Using reactor: twisted.internet.selectreactor.SelectReactor
2025-07-08 1

Scraped data from https://immersify.com
{'url': 'https://immersify.com', 'title': '\n        Immersify\n    ', 'headings': ['A selection of Institutions using Immersify', 'Want to find out more?', 'Cookies Policy'], 'paragraphs': ['Immersify provides lecturers and academics with interactive resources spanning from beginner to advanced content, for a comprehensive learning experience.', '3D visuals, voiceover, simulations, games and more to make even the most difficult subjects fun!', 'A huge catalog of flexible, peer-reviewed content ready to integrate seamlessly with any curriculum.', 'Discover the proven impact immersify can have on student outcomes through our rich analytics.', 'Study conducted by Waveform, independent market research company, 2023.', 'of users agree that the platform has high quality visuals and resources.', 'say that Immersify makes studying more enjoyable.', 'of Immersify users think that it’s relevant to them and their studies.', 'find the platform useful to pre

### Convert to LangChain Documents


In [2]:
from langchain.schema import Document


def convert_to_documents(scraped_data):
    documents = []
    for page in scraped_data:
        content = "\n".join(page['headings']) + "\n" + "\n".join(page['paragraphs'])
        documents.append(Document(page_content=content, metadata={"url": page['url'], "title": page['title']}))
    return documents


documents = convert_to_documents(scraped_data)
print(f"Converted {len(documents)} pages into documents.")

Converted 1 pages into documents.


### Convert to vector

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings.ollama import OllamaEmbeddings

# Initialize Chroma vectorstore
def store_in_chroma(documents):

    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=OllamaEmbeddings(model="llama3.2", show_progress=True),
        persist_directory="./chroma_db",
    )

    return vectorstore


documents = convert_to_documents(scraped_data)
vectorstore = store_in_chroma(documents)
print("Documents stored in Chroma.")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-07-08 12:51:19 [chromadb.telemetry.product.posthog] INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
DEBUG:chromadb.config:Starting component System
2025-07-08 12:51:20 [chromadb.config] DEBUG: Starting component System
DEBUG:chromadb.config:Starting component Posthog
2025-07-08 12:51:20 [chromadb.config] DEBUG: Starting component Posthog
DEBUG:chromadb.config:Starting component OpenTelemetryClient
2025-07-08 12:51:20 [chromadb.config] DEBUG: Starting component OpenTelemetryClient
DEBUG:chromadb.config:Starting component SqliteDB
2025-07-08 12:51:20 [chromadb.config] DEBUG: Starting component SqliteDB
DEBUG:chromadb.config:Starting component QuotaEnforcer
2025-07-08 12:51:20 [chromadb.config] DEBUG: Starting component QuotaEnforcer
DEBUG:chromadb.config:Starting

Documents stored in Chroma.


In [4]:
question = "What is Immersify?"
docs = vectorstore.similarity_search(question)

print(docs)

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s]DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:11434
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): localhost:11434
DEBUG:urllib3.connectionpool:https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15
DEBUG:urllib3.connectionpool:http://localhost:11434 "POST /api/embeddings HTTP/11" 200 None
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: http://localhost:11434 "POST /api/embeddings HTTP/11" 200 None
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.24it/s]

[Document(metadata={'title': '\n        Immersify\n    ', 'url': 'https://immersify.com'}, page_content='A selection of Institutions using Immersify\nWant to find out more?\nCookies Policy\nImmersify provides lecturers and academics with interactive resources spanning from beginner to advanced content, for a comprehensive learning experience.\n3D visuals, voiceover, simulations, games and more to make even the most difficult subjects fun!\nA huge catalog of flexible, peer-reviewed content ready to integrate seamlessly with any curriculum.\nDiscover the proven impact immersify can have on student outcomes through our rich analytics.\nStudy conducted by Waveform, independent market research company, 2023.\nof users agree that the platform has high quality visuals and resources.\nsay that Immersify makes studying more enjoyable.\nof Immersify users think that it’s relevant to them and their studies.\nfind the platform useful to prepare for their exams.\nOur dentistry content is created th




In [5]:
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

llm = Ollama(model="llama3.2")

retriever = vectorstore.as_retriever()


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_prompt = hub.pull("rlm/rag-prompt")
qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.smith.langchain.com:443
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): api.smith.langchain.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (2): api.smith.langchain.com:443
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (2): api.smith.langchain.com:443
DEBUG:urllib3.connectionpool:https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
DEBUG:urllib3.connectionpool:https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
DEBUG:urllib3.connectionpool:https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: https://us.i.posthog.com:4

### Query

In [6]:
# question = "What is Immersify ?"

question = "What is Immersify?"

qa_chain.invoke(question)

OllamaEmbeddings:   0%|          | 0/1 [00:00<?, ?it/s]DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:11434
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): localhost:11434
DEBUG:urllib3.connectionpool:http://localhost:11434 "POST /api/embeddings HTTP/11" 200 None
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: http://localhost:11434 "POST /api/embeddings HTTP/11" 200 None
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  9.30it/s]
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:11434
2025-07-08 12:51:21 [urllib3.connectionpool] DEBUG: Starting new HTTP connection (1): localhost:11434
DEBUG:urllib3.connectionpool:http://localhost:11434 "POST /api/generate HTTP/11" 200 None
2025-07-08 12:51:24 [urllib3.connectionpool] DEBUG: http://localhost:11434 "POST /api/generate HTTP/11" 200 None


"Immersify is an educational platform providing interactive resources for lecturers and academics to enhance learning experiences with 3D visuals, simulations, games, and other engaging content. It offers a comprehensive catalog of flexible, peer-reviewed content that can be integrated into various curricula. The platform aims to make studying more enjoyable and relevant to students' needs."

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=OllamaEmbeddings(model="llama3.2"))
llm = Ollama(model="llama3.2")
retriever = vectorstore.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_prompt = hub.pull("rlm/rag-prompt")

qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Type "exit" to stop the loop
while True:
    question = input("Question: ")
    if question.lower() == "exit":
        break
    answer = qa_chain.invoke(question)

    print(f"\nAnswer: {answer}\n")





DEBUG:chromadb.api.segment:Collection langchain already exists, returning existing collection.
2025-07-08 12:51:26 [chromadb.api.segment] DEBUG: Collection langchain already exists, returning existing collection.


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.smith.langchain.com:443
2025-07-08 12:51:26 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): api.smith.langchain.com:443
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (2): api.smith.langchain.com:443
2025-07-08 12:51:26 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (2): api.smith.langchain.com:443
DEBUG:urllib3.connectionpool:https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
2025-07-08 12:51:26 [urllib3.connectionpool] DEBUG: https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
DEBUG:urllib3.connectionpool:https://api.smith.langchain.com:443 "GET /info HTTP/11" 200 748
2025-07-08 12:51:26 [urllib3.connectionpool] DEBUG: https://api.smith