In [37]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# from langchain_community.memory import ConversationBufferMemory
from langchain_community.vectorstores import FAISS
# from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables import RunnableWithMessageHistory
from configs import API_KEY, DEFAULT_MODEL

In [28]:
url = "https://365datascience.com/upcoming-courses"

In [29]:
loader = WebBaseLoader(url)

In [30]:
raw_document = loader.load()

In [31]:
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(raw_document)

In [33]:
embeddings = OpenAIEmbeddings(openai_api_key = API_KEY)

In [34]:
vectorstore = FAISS.from_documents(documents, embeddings)

In [38]:
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(openai_api_key = API_KEY, model="gpt-4o-mini")
store = {}

def get_history(session_id: str):
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]

chain = RunnableWithMessageHistory(llm, get_history)

In [43]:
import os
from typing import List

import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# 1. Config
# OPENAI_API_KEY = "YOUR_OPENAI_API_KEY"
os.environ["OPENAI_API_KEY"] = API_KEY

SOURCE_URL = "https://365datascience.com/upcoming-courses"  # example: replace with your URL
PERSIST_DIR = None #"./chroma_db"  # or None if you don't want persistence

# 2. Load docs from the web
def load_docs_from_url(url: str) -> List[Document]:
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer()  # customize this for more precise HTML selection
        ),
    )
    docs = loader.load()
    return docs

# 3. Split docs into chunks
def split_docs(docs: List[Document]) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )
    return text_splitter.split_documents(docs)

# 4. Build or load vector store
def build_vectorstore(chunks: List[Document]) -> Chroma:
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=PERSIST_DIR,
    )
    return vectorstore

# 5. Create RAG chain
def create_rag_chain(vectorstore: Chroma):
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    template = """You are a helpful assistant that answers questions based only on the provided context.
If you don't know the answer from the context, say you don't know.

Context:
{context}

Question:
{question}

Answer in a concise, technical style when appropriate.
"""
    prompt = ChatPromptTemplate.from_template(template)

    llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)

    # Map: get context docs + original question
    rag_inputs = RunnableParallel(
        context=retriever,
        question=RunnablePassthrough()
    )

    # Chain: (question) -> {context, question} -> prompt -> llm -> str
    rag_chain = rag_inputs | prompt | llm
    return rag_chain

# 6. Simple chat loop
def chat_loop(rag_chain):
    print("RAG chatbot ready. Type 'exit' to quit.\n")
    while True:
        question = input("You: ").strip()
        if question.lower() in {"exit", "quit"}:
            break
        response = rag_chain.invoke(question)
        # response is a ChatMessage; get content
        print(f"Bot: {response.content}\n")


def main():
    # If you want to reuse a persisted DB, check if it exists first
    if PERSIST_DIR and os.path.exists(PERSIST_DIR):
        print("Loading existing vector store...")
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        vectorstore = Chroma(
            embedding_function=embeddings,
            persist_directory=PERSIST_DIR,
        )
    else:
        print("Loading documents from URL...")
        docs = load_docs_from_url(SOURCE_URL)

        print("Splitting documents...")
        chunks = split_docs(docs)

        print("Building vector store...")
        vectorstore = build_vectorstore(chunks)

    print("Creating RAG chain...")
    rag_chain = create_rag_chain(vectorstore)

    chat_loop(rag_chain)


if __name__ == "__main__":
    main()


Loading documents from URL...
Splitting documents...
Building vector store...
Creating RAG chain...
RAG chatbot ready. Type 'exit' to quit.



You:  what is the next course to be uploaded on 365DataScience platform ?


Bot: The provided context does not specify the next course to be uploaded on the 365 Data Science platform.



You:  what is 365dataSciencePLatform?


Bot: 365 Data Science is an online platform offering structured data science courses and certification programs. It provides training on in-demand tools like Python, SQL, and R, covering topics such as data analysis, machine learning, AI, data visualization, and more. The platform supports various career tracks including Data Scientist, Data Analyst, AI Engineer, and others, with courses designed for different skill levels from beginner to advanced.



KeyboardInterrupt: Interrupted by user