<h3><strong style="color:purple;">Author Name: Anirban Bose</strong></h3>
<h3><strong style="color:purple;">App Name: Personalized AI Tutor for Advanced NLP</strong></h3>

## 1: Install Required Libraries

In [203]:
!pip install -q gradio transformers datasets sentence-transformers langchain faiss-cpu langchain-community trafilatura nltk langchain-openai openai nltk --quiet

In [204]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [205]:
import os
import ast
from collections import defaultdict
import gradio as gr
import random
from datasets import load_dataset
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import trafilatura
from openai import OpenAI
import json

In [206]:
from google.colab import userdata  # only works if `userdata` is available
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get("HF_TOKEN")
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [207]:
CHATGPT_MODEL = "gpt-4"

## 2: Create Dataset based on Huggingface LLM course

### 2.1: Fetch the valid urls

In [208]:
import requests

def get_valid_llm_course_urls(base_url="https://huggingface.co/learn/llm-course", max_chapters=30, max_pages=50):
    valid_urls = []

    for chapter in range(1, max_chapters + 1):
        for page in range(1, max_pages + 1):
            url = f"{base_url}/chapter{chapter}/{page}"
            response = requests.head(url)
            if response.status_code == 200:
                valid_urls.append(url)
            else:
                # stop checking more pages for this chapter once a 404 is hit
                break

    return valid_urls

# Run this to get the list of valid URLs
valid_urls = get_valid_llm_course_urls()
print(f"Found {len(valid_urls)} valid URLs")


Found 97 valid URLs


### 2.2: Create section chunks and convert them into Langchain document objects

In [209]:
section_chunks = []

for url in tqdm(valid_urls, desc="Scraping and chunking entire sections"):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # Page title (as section group identifier)
        page_title = soup.title.string if soup.title else "Untitled"

        # Prepare for text accumulation
        content = []
        section_title = page_title

        for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'li']):
            tag_text = tag.get_text(strip=True)
            if tag.name in ['h1', 'h2', 'h3'] and content:
                full_text = " ".join(sent_tokenize(" ".join(content)))
                section_chunks.append({
                    "title": section_title,
                    "content": full_text,
                    "url": url
                })
                section_title = tag_text
                content = []
            elif tag.name in ['p', 'li']:
                content.append(tag_text)

        # Capture last chunk
        if content:
            full_text = " ".join(sent_tokenize(" ".join(content)))
            section_chunks.append({
                "title": section_title,
                "content": full_text,
                "url": url
            })

# 🧾 Preview example
print(f"✅ Total chunks (sections): {len(section_chunks)}")
print("📌 Sample Section:")
print("Title:", section_chunks[0]['title'])
print("Content:", section_chunks[0]['content'][:300])

Scraping and chunking entire sections: 100%|██████████| 97/97 [00:31<00:00,  3.08it/s]

✅ Total chunks (sections): 751
📌 Sample Section:
Title: Introduction - Hugging Face LLM Course
Content: Models Datasets Spaces Community Docs Enterprise Pricing   Log In Sign Up LLM Course documentation Introduction





In [210]:
section_chunks[0]

{'title': 'Introduction - Hugging Face LLM Course',
 'content': 'Models Datasets Spaces Community Docs Enterprise Pricing   Log In Sign Up LLM Course documentation Introduction',
 'url': 'https://huggingface.co/learn/llm-course/chapter1/1'}

In [211]:
# Convert your section_chunks into LangChain Document objects
documents = [
    Document(
        page_content=chunk["content"],
        metadata={"title": chunk["title"], "source": chunk["url"]}
    )
    for chunk in section_chunks
]

In [212]:
print(f"✅ Total number of document objects: {len(documents)}")

✅ Total number of document objects: 751


## 3: Setup Retrieval-Augmented QA with FAISS

### 3.1: Create FAISS vector store

In [213]:
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
vectordb = FAISS.from_documents(documents, embedding)

### 3.2: Setup Retriever and RAG Chain

In [214]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain.llms import HuggingFacePipeline

In [215]:
# Setup Custom Prompt template
prompt_template = PromptTemplate.from_template(
    """You are an expert NLP tutor. Use the context below to answer the question accurately.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question: {question}
Answer:"""
)

In [216]:
# Use ChatGPT
llm = ChatOpenAI(model_name=CHATGPT_MODEL, temperature=0.3)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

### 3.3: RAG-style QA function

In [217]:
import traceback

In [218]:
def ask_question(user_query):
    try:
        result = qa_chain(user_query)
        answer = result["result"]

        if answer.strip().lower() == "i don't know.":
            sources = "NA"
        else:
            sources = "\n".join(
                f"- {doc.metadata.get('title', 'Unknown')} ({doc.metadata.get('source', 'No URL')})"
                for doc in result["source_documents"]
            )

        return f"📌 **Answer:**\n{answer}\n\n🔗 **Sources:**\n{sources}"
    except Exception as e:
        return f"❌ **Error:**\n```\n{traceback.format_exc()}\n```"

## 4: Generate and Score MCQ Quizzes

### 4.1: Create Topics

In [219]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [220]:
# Get all unique titles and build title-to-doc mapping
title_to_docs = defaultdict(list)
unique_titles = set()

In [221]:
for doc in documents:
    title = doc.metadata.get("title")
    if title:
        title_to_docs[title].append(doc)
        unique_titles.add(title)

In [222]:
# Get the list of topics from the documents
topics = sorted(list(unique_titles))
topic_chunks = [topics[i:i+50] for i in range(0, len(topics), 50)]


In [223]:
#Group similar titles into broader topics using GPT
dropdown_topics_set = set()
topic_to_titles = defaultdict(set)

for chunk in tqdm(topic_chunks):
    prompt = f"""
You are given a list of topic strings from an NLP course. Group them under broader educational umbrella topics. This is for Personalized AI Tutor for Advanced NLP App.

Ignore navigation items, UI labels, quiz strings, etc.

Return a Python dictionary in the format:
{{"Transformers": ["Transformer Architectures", "Understanding"], "Tokenization": ["Text Tokenization", "Tokenizer"]}}

Topics:
{chunk}

Output:
"""

    try:
        response = client.chat.completions.create(
            model=CHATGPT_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that categorizes topics."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )

        parsed = ast.literal_eval(response.choices[0].message.content)

        if isinstance(parsed, dict):
            for broad_topic, titles in parsed.items():
                dropdown_topics_set.add(broad_topic)
                topic_to_titles[broad_topic].update(titles)

    except Exception as e:
        print("⚠️ GPT Error:", e)

100%|██████████| 12/12 [04:06<00:00, 20.58s/it]


In [224]:
# Map broad topics to documents using title_to_docs
topics_docs = defaultdict(list)

for broad_topic, title_set in topic_to_titles.items():
    for title in title_set:
        if title in title_to_docs:
            topics_docs[broad_topic].extend(title_to_docs[title])

In [225]:
# Final list of dropdown topics
dropdown_topics = ["All"] + sorted(dropdown_topics_set)
print("✅ Topics found:", dropdown_topics)
print("✅ Number of Topics found:", len(dropdown_topics))

✅ Topics found: ['All', 'APIs and Libraries', 'Additional Resources', 'Advanced Features', 'Advanced Topics', 'Algorithms', 'Alternative Evaluation', 'Annotation', 'Applications', 'Architecture', 'Attention Mechanisms', 'Audio Processing', 'Best Practices and Limitations', 'Chatbot Development', 'Code Formatting', 'Course Completion', 'Course Information', 'Course Introduction', 'Course Navigation', 'Course Navigation and Assistance', 'Course Navigation and Interaction', 'Course Progress', 'Custom Implementation', 'Data Acquisition', 'Data Handling', 'Data Management', 'Data Preparation', 'Dataset Handling', 'Dataset Management', 'Dataset Review', 'Debugging', 'Demo Creation', 'Deployment', 'Domain Adaptation', 'Encoding', 'Entity Handling', 'Evaluation', 'File Management', 'Framework Selection', 'Git Operations', 'Hosting and Integration', 'Image Processing', 'Importance of NLP', 'Input Management', 'Key Concepts', 'Language Model Learning', 'Language Models', 'Language Understanding 

### 4.2: Functions for the MCQ quizzes

In [226]:
# Quiz state variables
quiz_state = []
current_index = 0
score = 0

In [227]:
# Sample question generator (replace with LLM-generated content)
def get_mcq_questions_for_topic(topic, n):
    # Select relevant documents for the topic
    if topic == "All":
        relevant_docs = sum(topics_docs.values(), [])
    else:
        relevant_docs = topics_docs.get(topic, [])

    if not relevant_docs:
        return [{
            "question": "No content available for this topic.",
            "options": ["N/A", "N/A", "N/A", "N/A"],
            "correct": 0
        }]

    # Concatenate top N documents for context (limit token size)
    context = "\n\n".join(doc.page_content for doc in relevant_docs)

    prompt = f"""
You are a helpful assistant that generates multiple-choice questions (MCQs) for NLP topics.

Using the context below, generate {n} MCQs. Each question must:
- Have 4 options (as a list of strings).
- Indicate the correct option index (0 for A, 1 for B, etc.).
- Be returned in valid Python list format.

Context:
\"\"\"
{context}
\"\"\"

Return a Python list of dictionaries in the following format (and nothing else):

[
  {{
    "question": "Your question?",
    "options": ["Option A", "Option B", "Option C", "Option D"],
    "correct": 2
  }},
  ...
]
    """

    try:
        response = client.chat.completions.create(
            model=CHATGPT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        raw_output = response.choices[0].message.content.strip()
        # Safely parse GPT output
        mcqs = ast.literal_eval(raw_output)
        print(raw_output)
        if isinstance(mcqs, list) and all("question" in q and "options" in q and "correct" in q for q in mcqs):
            return mcqs[:n]

    except Exception as e:
        print("❌ LLM Generation Failed:", e)

    return [{
        "question": "Failed to generate questions. Please try again.",
        "options": ["Retry", "Retry", "Retry", "Retry"],
        "correct": 0
    }]



In [228]:
# Format a question with labeled options
def format_question(question_obj, index):
    qtext = f"**Q{index+1}. {question_obj['question']}**"
    options = [f"{chr(65+i)}. {opt}" for i, opt in enumerate(question_obj["options"])]
    return qtext, options

In [229]:
# Start the quiz
def start_quiz(topic, n):
    global quiz_state, current_index, score
    quiz_state = get_mcq_questions_for_topic(topic, int(n))
    current_index = 0
    score = 0
    qtext, options = format_question(quiz_state[0], current_index)
    return qtext, gr.Radio(choices=options, visible=True), "", gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)

In [230]:
# Submit answer only
def submit_answer(selected):
    global score
    if not selected:
        return gr.update(), gr.update(), "⚠️ Please select an answer.", gr.update(), gr.update(), gr.update()

    correct_index = quiz_state[current_index]["correct"]
    correct_ans = f"{chr(65+correct_index)}. {quiz_state[current_index]['options'][correct_index]}"

    if selected == correct_ans:
        score += 1
        result = "✅ Correct!"
    else:
        result = f"❌ Incorrect. Correct answer: **{correct_ans}**"

    score_status = f"**Score:** {score}/{current_index+1}"
    feedback_text = f"{result}<br>{score_status}"

    return gr.update(visible=False), gr.update(visible=False), feedback_text, gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)


In [231]:
# Go to next question
def next_question():
    global current_index
    current_index += 1

    if current_index < len(quiz_state):
        qtext, options = format_question(quiz_state[current_index], current_index)
        return  gr.update(value=qtext, visible=True), gr.Radio(choices=options, value=None, visible=True), "", gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
    else:
        final_score = f"### 🏁 Quiz Complete! Your Score: **{score}/{len(quiz_state)}**"
        return "", gr.update(visible=False), final_score, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)


In [232]:
# Restart quiz
def restart_quiz():
    return "", gr.update(visible=False), "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)


## 5: Build Conversational Tutor with LangChain Memory

In [233]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

### 5.1: Setup retriever, memory, llm

In [234]:
# Setup retriever
conversational_retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [235]:
# Setup memory
conversational_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [236]:
# Setup llm
conversational_llm = ChatOpenAI(model_name=CHATGPT_MODEL, temperature=0.6)

### 5.2: Set up LangChain QA agent with memory

In [237]:
conversational_qa_chain = ConversationalRetrievalChain.from_llm(
    llm=conversational_llm,
    retriever=conversational_retriever,
    memory=conversational_memory,
    verbose=False
)

In [238]:
def chat(user_message, history):
    try:
        # Try RAG-based QA first
        result = conversational_qa_chain({"question": user_message})
        answer = result["answer"].strip()

        # Fallback if RAG fails to answer
        if answer.lower() in ["i don't know.", "i don't know", "i cannot answer that."]:
            fallback_answer = conversational_llm.predict(user_message)
            answer = f"(Fallback Answer)\n{fallback_answer}"

        history.append((user_message, answer))
        return history, history

    except Exception as e:
        history.append((user_message, f"❌ Error: {str(e)}"))
        return history, history


In [239]:
def clear_chat():
    conversational_memory.clear()
    return [], []

## 6: Add DNLI-Style Reasoning Check

### 6.1: Functions for reasoning check

In [240]:
def get_premise_from_docs(topic):
    docs = topics_docs.get(topic, [])
    if not docs:
        return "No content found."

    # Filter out boilerplate or low-content text
    meaningful_docs = [
        doc.page_content.strip() for doc in docs
        if len(doc.page_content.strip()) > 100 and
        "augmented documentation experience" not in doc.page_content.lower()
    ]

    if not meaningful_docs:
        return "No meaningful content available after filtering."

    # Join filtered documents into a single large context
    full_context = "\n\n".join(meaningful_docs)

    # Truncate if context is too long
    truncated_context = full_context[:6000]  # approx ~2,000 tokens

    # Prepare the summarization prompt
    prompt = f"""
You are an expert NLP assistant. Summarize the following technical content clearly and concisely within 30 sentences.

Content:
\"\"\"
{truncated_context}
\"\"\"
Summary:
"""

    try:
        response = client.chat.completions.create(
            model=CHATGPT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        summary = response.choices[0].message.content.strip()
        return summary

    except Exception as e:
        return f"❌ Error while summarizing: {str(e)}"


In [241]:
def populate_premise(topic):
        return get_premise_from_docs(topic)

In [242]:
def dnli_reasoning_check(premise, hypothesis):
        prompt = f"""
You're given a premise from a technical document and a user-generated hypothesis.

Classify their relationship as one of the following:
- entailment (hypothesis logically follows),
- neutral (uncertain),
- contradiction (hypothesis contradicts the premise).

Respond with one word: "entailment", "neutral", or "contradiction".

Premise: "{premise}"
Hypothesis: "{hypothesis}"
"""
        try:
            response = client.chat.completions.create(
                model=CHATGPT_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3
            )
            result = response.choices[0].message.content.strip().lower()
            return f"### 🔎 Result: **{result.capitalize()}**"
        except Exception as e:
            return f"❌ Error: {str(e)}"

## 7: Build Gradio App

In [243]:


with gr.Blocks() as app:
    gr.Markdown("# 🧠 Personalized AI Tutor for Advanced NLP")

    with gr.Tab("📖 Ask a Question (RAG)"):
        q = gr.Textbox(label="Your Question")
        a = gr.Textbox(label="AI Answer + Context")
        gr.Button("Submit").click(ask_question, inputs=q, outputs=a)

    with gr.Tab("📚 Take a Quiz"):
        gr.Markdown("## 🧠 Personalized Quiz from Course Topics")

        topic = gr.Dropdown(choices=dropdown_topics, label="Select Topic")
        num_qs = gr.Slider(label="Number of Questions", minimum=1, maximum=10, step=1, value=5)

        start = gr.Button("Start Quiz")
        qtext = gr.Markdown()
        options = gr.Radio(choices=[], label="Options", visible=False)
        feedback = gr.Markdown()
        submit = gr.Button("Submit", visible=False)
        next_btn = gr.Button("Next", visible=False)
        restart = gr.Button("Restart", visible=False)
        start.click(start_quiz, inputs=[topic, num_qs], outputs=[qtext, options, feedback, submit, restart, next_btn])
        submit.click(submit_answer, inputs=options, outputs=[qtext, options, feedback, submit, restart, next_btn])
        next_btn.click(next_question, outputs=[qtext, options, feedback, submit, restart, next_btn])
        restart.click(restart_quiz, outputs=[qtext, options, feedback, submit, restart, next_btn])

    with gr.Tab("🗣️ Conversational Tutor"):
        chatbot = gr.Chatbot(label="Your Tutor")
        msg = gr.Textbox(label="Ask something", placeholder="Start your conversation...")
        submit_btn = gr.Button("Submit")
        clear_btn = gr.Button("End Chat")

        state = gr.State([])

        submit_btn.click(chat, inputs=[msg, state], outputs=[chatbot, state])
        clear_btn.click(clear_chat, inputs=None, outputs=[chatbot, state])

    with gr.Tab("🔍 Reasoning Check"):
        gr.Markdown("## 🔍 DNLI-Style Reasoning Check")

        topic_dropdown = gr.Dropdown(choices=dropdown_topics[1:], label="Select Topic")  # Exclude "All"
        premise_input = gr.Textbox(label="Premise", lines=3, interactive=False)
        hypothesis_input = gr.Textbox(label="Hypothesis", lines=3, placeholder="Enter your conclusion...")
        result_output = gr.Markdown()

        get_premise_btn = gr.Button("Load Premise")
        check_btn = gr.Button("Check Reasoning")

        get_premise_btn.click(populate_premise, inputs=topic_dropdown, outputs=premise_input)
        check_btn.click(dnli_reasoning_check, inputs=[premise_input, hypothesis_input], outputs=result_output)

app.launch(share=True)

  chatbot = gr.Chatbot(label="Your Tutor")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://04b0a403e1364bf584.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


