In [1]:
!pip install -U pyarrow>=21.0.0 pydantic==2.11.1 libraft-cu12==25.6.0 pylibraft-cu12==25.6.0 rmm-cu12==25.6.0


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
raft-dask-cu12 25.2.0 requires libraft-cu12==25.2.*, but you have libraft-cu12 25.6.0 which is incompatible.
raft-dask-cu12 25.2.0 requires pylibraft-cu12==25.2.*, but you have pylibraft-cu12 25.6.0 which is incompatible.
libcuml-cu12 25.2.1 requires libraft-cu12==25.2.*, but you have libraft-cu12 25.6.0 which is incompatible.
libcuvs-cu12 25.2.1 requires libraft-cu12==25.2.*, but you have libraft-cu12 25.6.0 which is incompatible.
cuml-cu12 25.2.1 requires pylibraft-cu12==25.2.*, but you have pylibraft-cu12 25.6.0 which is incompatible.
cuml-cu12 25.2.1 requires rmm-cu12==25.2.*, but you have rmm-cu12 25.6.0 which is incompatible.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine =

In [2]:
!pip install torch transformers accelerate bitsandbytes
!pip install langchain langchain-community
!pip install pypdf sentence-transformers faiss-cpu 

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_

In [3]:
import torch
import json
import os
import faiss 
import numpy as np
import re 
import shutil
from typing import List, Dict, TypedDict, Optional
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

class QuizItem(TypedDict):
    """One generated quiz entry, as parsed from the LLM's JSON reply and stored in the quiz JSON file."""
    # The question shown to the user.
    question_text: str
    # The answer the model produced for that question.
    model_generated_answer: str
    # The quote the model was asked to cite from the context as justification.
    justification_quote: str

class QuizBot:
    def __init__(self, 
                 pdf_path: str, 
                 model_id: str = "/kaggle/input/qwen2.5/transformers/7b-instruct/1", 
                 embedding_model_id: str = "sentence-transformers/all-MiniLM-L6-v2"):
        
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        self.pdf_path = pdf_path
        self.quiz_data_path = f"quiz_data_{base_name}.json"
        self.faiss_db_path = f"faiss_db_{base_name}"
        self.quiz_data = [] 
        
        print(f"Khởi tạo QuizBot cho: {os.path.basename(pdf_path)}")
        try:
            self.llm = self._initialize_llm(model_id)
            self.embeddings = self._initialize_embeddings(embedding_model_id)
            self._setup_database_and_quiz()
            
            if self.quiz_data:
                 print(f"Chatbot sẵn sàng với {len(self.quiz_data)} câu hỏi")
            else:
                print("Khởi tạo hoàn tất nhưng không có câu hỏi nào được tạo ra")

        except Exception as e:
            print(f"Lỗi khởi tạo: {e}")
            self.llm = None
            self.embeddings = None

    def _initialize_llm(self, model_id: str):
        """Load the causal language model in 4-bit and wrap it as a LangChain LLM.

        Args:
            model_id: Path or hub identifier passed to ``from_pretrained``.

        Returns:
            A ``HuggingFacePipeline`` around a text-generation pipeline that
            emits at most 256 new tokens and returns only the generated text.
        """
        # Local import keeps this fix self-contained; `transformers` is already
        # a dependency of this file.
        from transformers import BitsAndBytesConfig

        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            # Passing `load_in_4bit=True` directly is deprecated; the supported
            # way to request 4-bit loading is a BitsAndBytesConfig object.
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
        )
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256, return_full_text=False)
        return HuggingFacePipeline(pipeline=pipe)

    def _initialize_embeddings(self, model_id: str):
        """Create the sentence-embedding model, on CUDA when torch reports it is available, else on CPU."""
        if torch.cuda.is_available():
            target_device = 'cuda'
        else:
            target_device = 'cpu'
        return HuggingFaceEmbeddings(
            model_name=model_id,
            model_kwargs={'device': target_device},
        )

    def _setup_database_and_quiz(self):
        """Populate ``self.db`` and ``self.quiz_data``, from cache when possible.

        When both the FAISS directory and the quiz JSON file exist they are
        loaded as-is. Otherwise the PDF is loaded, split into chunks, indexed,
        the index is saved, and a quiz is generated from the selected chunks.
        Returns early (leaving ``self.db`` untouched) when the PDF yields no
        documents or no chunks.
        """
        cache_is_complete = os.path.exists(self.faiss_db_path) and os.path.exists(self.quiz_data_path)
        if cache_is_complete:
            print("Tải dữ liệu có sẵn")
            # NOTE(review): allow_dangerous_deserialization opts in to loading
            # pickled data — only safe because this directory is written by
            # this class itself; confirm it can never come from elsewhere.
            self.db = FAISS.load_local(
                self.faiss_db_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            self.quiz_data = self._load_quiz_data()
            return

        print("Bắt đầu xử lý file PDF...")
        documents = PyPDFLoader(self.pdf_path).load()
        if not documents:
            print(f"Không thể tải nội dung từ PDF: {self.pdf_path}")
            return

        splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
        all_chunks = splitter.split_documents(documents)
        if not all_chunks:
            print("Không có chunk nào được tạo từ tài liệu")
            return

        # The index is built from every chunk; only quiz generation uses the
        # filtered / selected subset.
        print(f"Tạo FAISS DB từ {len(all_chunks)} chunks...")
        self.db = FAISS.from_documents(all_chunks, self.embeddings)
        self.db.save_local(self.faiss_db_path)

        topic_chunks = self._adaptive_chunk_selection(
            self._filter_quality_chunks(all_chunks)
        )
        self.quiz_data = self._generate_and_save_quiz_data_from_context(topic_chunks)

    def _filter_quality_chunks(self, chunks: List[Document]) -> List[Document]:
        quality_chunks = []
        for chunk in chunks:
            text = chunk.page_content.strip()
            if (len(text) > 50 and  
                not re.match(r'^[\d\s\W]+$', text) and
                len(set(text.lower().split())) > 3): 
                quality_chunks.append(chunk)
        print(f"Lọc được {len(quality_chunks)}/{len(chunks)} chunks chất lượng")
        return quality_chunks

    def _adaptive_chunk_selection(self, all_chunks: List[Document]) -> List[Document]:
        tfidf_chunks = self._rank_chunks_by_tfidf(all_chunks, top_k=min(5, len(all_chunks)))
        diverse_chunks = self._select_diverse_chunks(all_chunks, top_k=min(5, len(all_chunks)))
        
        combined_dict = {}
        for chunk in tfidf_chunks + diverse_chunks:
            combined_dict[chunk.page_content[:200]] = chunk
        
        combined = list(combined_dict.values())
        return combined[:8]  

    def _rank_chunks_by_tfidf(self, all_chunks: List[Document], top_k: int = 20) -> List[Document]:
        corpus = [chunk.page_content for chunk in all_chunks]
        
        vectorizer = TfidfVectorizer(stop_words=None, max_features=500) 
        tfidf_matrix = vectorizer.fit_transform(corpus)
        
        chunk_scores = np.array(tfidf_matrix.mean(axis=1)).flatten()
        
        num_to_select = min(top_k, len(all_chunks))
        top_k_indices = chunk_scores.argsort()[-num_to_select:][::-1]

        top_chunks = [all_chunks[i] for i in top_k_indices]
        return top_chunks

    def _select_diverse_chunks(self, all_chunks: List[Document], top_k: int = 2) -> List[Document]:
        if len(all_chunks) <= top_k:
            return all_chunks
    
        selected = [all_chunks[0]]
        remaining = all_chunks[1:]
    
        while len(selected) < top_k and remaining:
            best_chunk = None
            lowest_similarity = float("inf")
    
            for chunk in remaining:
                avg_similarity = self._calculate_avg_similarity(chunk, selected)
                if avg_similarity < lowest_similarity:
                    lowest_similarity = avg_similarity
                    best_chunk = chunk
    
            if best_chunk:
                selected.append(best_chunk)
                remaining.remove(best_chunk)
    
        return selected
    

    def _calculate_avg_similarity(self, target_chunk: Document, chunk_list: List[Document]) -> float:
        if not chunk_list:
            return 0.0
            
        target_embedding = self.embeddings.embed_query(target_chunk.page_content)
        similarities = []
        
        for chunk in chunk_list:
            chunk_embedding = self.embeddings.embed_query(chunk.page_content)
            similarity = np.dot(target_embedding, chunk_embedding) / (
                np.linalg.norm(target_embedding) * np.linalg.norm(chunk_embedding)
            )
            similarities.append(similarity)
        
        return np.mean(similarities) if similarities else 0.0

    def _generate_and_save_quiz_data_from_context(self, topic_chunks: List[Document]) -> List[QuizItem]:
        all_questions = []
        print(f"Tạo câu hỏi từ {len(topic_chunks)} chủ đề quan trọng...")
        
        for i, chunk in enumerate(topic_chunks):
            print(f"Xử lý chủ đề {i+1}/{len(topic_chunks)}...")
            try:
                query = chunk.page_content
                retrieved_docs = self.db.similarity_search(query, k=3)  # Giảm từ 5 xuống 3
                super_context = self._create_smart_context(chunk, retrieved_docs)
                
                question_item = self._generate_single_question_from_chunk(super_context)
                
                if question_item and self._validate_question_quality(question_item):
                    all_questions.append(question_item)
                    print(f"Đã tạo câu hỏi {len(all_questions)}")

            except Exception as e:
                print(f"Lỗi xử lý chủ đề {i+1}: {e}")
        
        unique_questions = self._remove_duplicate_questions(all_questions)
        print(f"Đã tạo {len(unique_questions)} câu hỏi. Lưu vào '{self.quiz_data_path}'")
        
        if unique_questions:
            with open(self.quiz_data_path, 'w', encoding='utf-8') as f:
                json.dump(unique_questions, f, indent=4, ensure_ascii=False)
        
        return unique_questions

    def _create_smart_context(self, topic_chunk: Document, retrieved_docs: List[Document]) -> str:
        sorted_docs = sorted(retrieved_docs, key=lambda x: len(x.page_content), reverse=True)
        
        total_length = 0
        selected_docs = []
        
        for doc in sorted_docs:
            if total_length + len(doc.page_content) <= 2000:  
                selected_docs.append(doc)
                total_length += len(doc.page_content)
        
        return "\n\n---\n\n".join([doc.page_content for doc in selected_docs])

    def _validate_question_quality(self, question_item: QuizItem) -> bool:
        if not question_item:
            return False
            
        q_text = question_item['question_text']
        answer = question_item['model_generated_answer']
        
        if (len(q_text) < 8 or len(answer) < 8 or
            q_text.lower().startswith(('sorry', 'i cannot', 'i am unable', 'i am sorry')) or
            'cannot' in q_text.lower() or
            'i cannot' in q_text.lower() or
            '?' not in q_text):
            return False
            
        return True

    def _remove_duplicate_questions(self, questions: List[QuizItem]) -> List[QuizItem]:
        seen_questions = set()
        unique_questions = []
        
        for q in questions:
            question_hash = hash(q['question_text'][:80])  
            if question_hash not in seen_questions:
                seen_questions.add(question_hash)
                unique_questions.append(q)
        
        removed_count = len(questions) - len(unique_questions)
        if removed_count > 0:
            print(f"Loại bỏ {removed_count} câu hỏi trùng lặp")
        return unique_questions

    def _generate_single_question_from_chunk(self, chunk_text: str) -> Optional[QuizItem]:
        prompt = f"""
        Tạo một câu hỏi tự luận từ ngữ cảnh sau. CHỈ trả về JSON, **không giải thích**.

        Cấu trúc JSON:
        {{
            "question_text": "Câu hỏi ở đây",
            "model_generated_answer": "Câu trả lời ở đây", 
            "justification_quote": "Trích dẫn từ ngữ cảnh"
        }}

        Ngữ cảnh: {chunk_text[:1500]}  
        """
        
        try:
            response_text = self.llm.invoke(prompt)
            print(f"Response từ LLM: {response_text[:200]}...")  
            
            # Làm sạch response trước khi parse
            cleaned_response = self._clean_json_response(response_text)
            
            # Parse JSON
            data = json.loads(cleaned_response)
            if all(key in data for key in ["question_text", "model_generated_answer", "justification_quote"]):
                return data
            return None
            
        except Exception as e:
            print(f"Lỗi tạo câu hỏi: {e}")
            return None

    def _clean_json_response(self, response_text: str) -> str:
        json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{[^{}]*\}', response_text, re.DOTALL)
        if not json_match:
            raise ValueError("Không tìm thấy JSON trong response")
        
        json_str = json_match.group(0)
        
        json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', json_str)
        json_str = re.sub(r'\\\'', "'", json_str)
        json_str = re.sub(r'\\"', '"', json_str)
        json_str = re.sub(r'\\n', ' ', json_str)
        json_str = re.sub(r'\\t', ' ', json_str)
        
        json_str = re.sub(r'(\w+)\s*:', r'"\1":', json_str)  
        json_str = re.sub(r',\s*}', '}', json_str) 
        json_str = re.sub(r',\s*]', ']', json_str)
        
        return json_str

    def _load_quiz_data(self) -> List[QuizItem]:
        with open(self.quiz_data_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _get_feedback_for_answer(self, question: str, user_answer: str) -> Dict:
        retrieved_docs = self.db.similarity_search(question, k=1) 
        context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
        prompt = f"""
        Đánh giá câu trả lời dựa trên ngữ cảnh. CHỈ trả về JSON:
        {{
            "is_correct": true/false,
            "feedback_text": "Nhận xét",
            "evidence": "Trích dẫn"
        }}

        Câu hỏi: {question}
        Câu trả lời: {user_answer}
        Ngữ cảnh: {context}
        """
        response = self.llm.invoke(prompt)
        try:
            cleaned_response = self._clean_json_response(response)
            return json.loads(cleaned_response)
        except:
            return {"is_correct": False, "feedback_text": "Lỗi đánh giá", "evidence": "Không có"}

    def display_questions(self):
        """Hiển thị câu hỏi (dùng trong notebook)"""
        if not self.quiz_data:
            print("Không có câu hỏi nào")
            return
            
        print("=== DANH SÁCH CÂU HỎI ===")
        for i, quiz_item in enumerate(self.quiz_data):
            print(f"\n{i+1}. {quiz_item['question_text']}")
            print(f"Đáp án: {quiz_item['model_generated_answer']}")
            print(f"Nguồn: {quiz_item['justification_quote'][:100]}...")

    def start_interactive_session(self):
        if not self.quiz_data:
            print("Không có câu hỏi nào để thực hiện quiz")
            return
            
        print("Bắt đầu phiên hỏi đáp")
        for i, quiz_item in enumerate(self.quiz_data):
            print("-" * 50)
            print(f"Câu hỏi {i+1}/{len(self.quiz_data)}:")
            print(f"  {quiz_item['question_text']}")
            user_answer = input("Câu trả lời của bạn: ")
            if user_answer.lower() == 'quit':
                break
            feedback = self._get_feedback_for_answer(quiz_item['question_text'], user_answer)
            print("\nPhản hồi:")
            if feedback.get('is_correct'):
                print("Chính xác")
            else:
                print("Chưa chính xác.")
            print(f"Nhận xét: {feedback.get('feedback_text', '...')}")
            print(f"Bằng chứng: \"{feedback.get('evidence', '...')}\"")
            print("-" * 50)


# Notebook entry point: build a QuizBot for one hard-coded PDF and print the
# generated questions. Every name assigned here becomes a kernel global.
if __name__ == "__main__":
    try:
        # Kaggle dataset path of the PDF to quiz on.
        PDF_FILE_PATH = "/kaggle/input/sample-pdf/Team_UIT-VibeCoding_signed.pdf" 
        
        if not os.path.exists(PDF_FILE_PATH):
            raise FileNotFoundError(f"Không tìm thấy file PDF: {PDF_FILE_PATH}")
        
        # Same naming scheme QuizBot.__init__ uses for its cache artefacts.
        base_name = os.path.splitext(os.path.basename(PDF_FILE_PATH))[0]
        quiz_data_path = f"quiz_data_{base_name}.json"
        faiss_db_path = f"faiss_db_{base_name}"
        
        # Remove any cached quiz file / FAISS directory so QuizBot rebuilds
        # everything from the PDF instead of loading the previous run.
        # NOTE(review): this makes QuizBot's cache branch unreachable from
        # this cell — presumably intentional while iterating; confirm.
        if os.path.exists(quiz_data_path):
            os.remove(quiz_data_path)
        if os.path.exists(faiss_db_path):
            shutil.rmtree(faiss_db_path)

        chatbot = QuizBot(pdf_path=PDF_FILE_PATH)
        
        # QuizBot reports init failures by setting `llm` to None rather than
        # raising, so check both before displaying anything.
        if chatbot.llm and chatbot.quiz_data:
            chatbot.display_questions()
        
    except FileNotFoundError as fnf_error:
        # A missing PDF is an expected condition: print the message only.
        print(fnf_error)
    except Exception as e:
        # Anything else is unexpected: print the message and the full traceback.
        print(f"Lỗi: {e}")
        import traceback
        traceback.print_exc()

2025-10-14 10:52:40.045437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760439160.257449      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760439160.313238      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Khởi tạo QuizBot cho: Team_UIT-VibeCoding_signed.pdf


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
  return HuggingFacePipeline(pipeline=pipe)
  return HuggingFaceEmbeddings(model_name=model_id, model_kwargs={'device': device})


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Bắt đầu xử lý file PDF...
Tạo FAISS DB từ 8 chunks...
Lọc được 8/8 chunks chất lượng
Tạo câu hỏi từ 6 chủ đề quan trọng...
Xử lý chủ đề 1/6...
Response từ LLM:  """
        ],
        "answer": {
            "question_text": "Học có giám sát liên quan đến việc sử dụng dữ liệu đã được gán nhãn để huấn luyện mô hình.",
            "model_generated_answer": "Đú...
Xử lý chủ đề 2/6...
Response từ LLM:  """

Assistant: ```json
{
    "question_text": "Học tăng cường có ứng dụng nào trong trò chơi điện tử không? Nếu có, ví dụ nào được đề cập trong ngữ cảnh?",
    "model_generated_answer": "Có, trong t...
Đã tạo câu hỏi 1
Xử lý chủ đề 3/6...
Response từ LLM:  """

Assistant: ```json
{
    "question_text": "Học tăng cường được sử dụng trong lĩnh vực nào cụ thể của công nghệ trò chơi?", 
    "model_generated_answer": "Học tăng cường được sử dụng trong trò c...
Đã tạo câu hỏi 2
Xử lý chủ đề 4/6...
Response từ LLM:  ```json
         {
             "question_text": "Trong lĩnh vực nào học tăng cườn

In [4]:
# import torch
# import json
# import os
# import faiss 
# import numpy as np
# import re 
# import shutil
# from typing import List, Dict, TypedDict, Optional
# from sklearn.feature_extraction.text import TfidfVectorizer
# from langchain.docstore.document import Document
# from langchain.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# class QuizItem(TypedDict):
#     question_text: str
#     model_generated_answer: str
#     justification_quote: str

# class KeyB2QuizBot:
#     def __init__(self, 
#                  pdf_path: str, 
#                  model_id: str = "/kaggle/input/qwen2.5/transformers/7b-instruct/1", 
#                  embedding_model_id: str = "sentence-transformers/all-MiniLM-L6-v2"):
        
#         base_name = os.path.splitext(os.path.basename(pdf_path))[0]
#         self.pdf_path = pdf_path
#         self.quiz_data_path = f"quiz_data_{base_name}.json"
#         self.faiss_db_path = f"faiss_db_{base_name}"
#         self.quiz_data = [] 
        
#         print(f"Khởi tạo QuizBot cho: {os.path.basename(pdf_path)}")
#         try:
#             self.llm = self._initialize_llm(model_id)
#             self.embeddings = self._initialize_embeddings(embedding_model_id)
#             self._setup_database_and_quiz()
            
#             if self.quiz_data:
#                  print(f"Chatbot sẵn sàng với {len(self.quiz_data)} câu hỏi ch")
#             else:
#                 print("Khởi tạo hoàn tất nhưng không có câu hỏi nào được tạo ra")

#         except Exception as e:
#             print(f"Lỗi khởi tạo: {e}")
#             self.llm = None
#             self.embeddings = None

#     def _initialize_llm(self, model_id: str):
#         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#         model = AutoModelForCausalLM.from_pretrained(
#             model_id, torch_dtype=torch.bfloat16, load_in_4bit=True, device_map="auto"
#         )
#         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=384, return_full_text=False)
#         return HuggingFacePipeline(pipeline=pipe)

#     def _initialize_embeddings(self, model_id: str):
#         device = 'cuda' if torch.cuda.is_available() else 'cpu'
#         return HuggingFaceEmbeddings(model_name=model_id, model_kwargs={'device': device})

#     def _setup_database_and_quiz(self):
#         if os.path.exists(self.faiss_db_path) and os.path.exists(self.quiz_data_path):
#             print("Đang tải dữ liệu có sẵn...")
#             self.db = FAISS.load_local(self.faiss_db_path, self.embeddings, allow_dangerous_deserialization=True)
#             self.quiz_data = self._load_quiz_data()
#         else:
#             print("Bắt đầu xử lý file PDF...")
#             loader = PyPDFLoader(self.pdf_path)
#             documents = loader.load()
            
#             if not documents:
#                 print(f"Không thể tải nội dung từ PDF: {self.pdf_path}")
#                 return

#             text_splitter = RecursiveCharacterTextSplitter(
#                 chunk_size=200,
#                 chunk_overlap=40,
#                 length_function=len,
#                 separators=["\n\n", "\n", ". ", " ", ""]
#             )
#             all_chunks = text_splitter.split_documents(documents)
            
#             if not all_chunks:
#                 print("Không có chunk nào được tạo từ tài liệu")
#                 return

#             print(f"Tạo FAISS DB từ {len(all_chunks)} chunks...")
#             self.db = FAISS.from_documents(all_chunks, self.embeddings)
#             self.db.save_local(self.faiss_db_path)

#             quality_chunks = self._filter_quality_chunks(all_chunks)
#             if len(quality_chunks) < 3:
#                 print("Quá ít chunks chất lượng, sử dụng tất cả chunks")
#                 quality_chunks = all_chunks
                
#             self.quiz_data = self._keyb2_question_generation(quality_chunks)

#     def _filter_quality_chunks(self, chunks: List[Document]) -> List[Document]:
#         quality_chunks = []
#         for chunk in chunks:
#             text = chunk.page_content.strip()
#             if (len(text) > 50 and 
#                 not re.match(r'^[\d\s\W]+$', text) and
#                 len(set(text.lower().split())) > 3):
#                 quality_chunks.append(chunk)
#         print(f"Lọc được {len(quality_chunks)}/{len(chunks)} chunks chất lượng")
#         return quality_chunks

#     def _keyb2_question_generation(self, all_chunks: List[Document]) -> List[QuizItem]:        
#         all_questions = []
        
#         for selector_type in ['bm25', 'tfidf', 'semantic']:
#             print(f"Đang chọn key blocks với {selector_type} selector...")
            
#             key_blocks = self._select_key_blocks(all_chunks, selector_type, top_k=min(5, len(all_chunks)))
            
#             if not key_blocks:
#                 continue
                
#             print(f"Đã chọn {len(key_blocks)} key blocks với {selector_type}")
            
#             for i, block in enumerate(key_blocks):
#                 print(f"Xử lý key block {i+1}/{len(key_blocks)}...")
                
#                 try:
#                     expanded_context = self._expand_block_context(block, all_chunks)
#                     question_item = self._generate_question_from_context(expanded_context)
                    
#                     if question_item and self._validate_question_quality(question_item):
#                         all_questions.append(question_item)
#                         print(f"Đã tạo câu hỏi từ {selector_type} selector")
                        
#                 except Exception as e:
#                     print(f"Lỗi xử lý key block {i+1}: {e}")
        
#         unique_questions = self._remove_duplicate_questions(all_questions)
#         print(f"Tổng cộng tạo được {len(unique_questions)} câu hỏi từ nhiều selector")
        
#         if unique_questions:
#             with open(self.quiz_data_path, 'w', encoding='utf-8') as f:
#                 json.dump(unique_questions, f, indent=4, ensure_ascii=False)
        
#         return unique_questions

#     def _select_key_blocks(self, all_chunks: List[Document], selector_type: str, top_k: int = 10) -> List[Document]:
#         if len(all_chunks) < 2:
#             return all_chunks[:top_k]
            
#         if selector_type == 'bm25':
#             return self._bm25_selection(all_chunks, top_k)
#         elif selector_type == 'tfidf':
#             return self._tfidf_selection(all_chunks, top_k)
#         elif selector_type == 'semantic':
#             return self._semantic_selection(all_chunks, top_k)
#         else:
#             return []

#     def _bm25_selection(self, all_chunks: List[Document], top_k: int) -> List[Document]:        
#         corpus = [chunk.page_content for chunk in all_chunks]
        
#         vectorizer = TfidfVectorizer(stop_words=None, max_features=500)
#         tfidf_matrix = vectorizer.fit_transform(corpus)
        
#         doc_lengths = np.array([len(doc.split()) for doc in corpus])
#         avg_doc_length = np.mean(doc_lengths) if len(doc_lengths) > 0 else 1
        
#         scores = []
#         for i, doc in enumerate(corpus):
#             doc_vec = tfidf_matrix[i]
#             doc_tf = np.array(doc_vec.sum(axis=1)).flatten()[0]
#             doc_length = doc_lengths[i]
            
#             bm25_score = doc_tf * (1.5 + 1) / (doc_tf + 1.5 * (1 - 0.75 + 0.75 * doc_length / avg_doc_length))
#             scores.append(bm25_score)
        
#         top_indices = np.argsort(scores)[-top_k:][::-1]
#         return [all_chunks[i] for i in top_indices]

#     def _tfidf_selection(self, all_chunks: List[Document], top_k: int) -> List[Document]:        
#         corpus = [chunk.page_content for chunk in all_chunks]
#         vectorizer = TfidfVectorizer(stop_words=None, max_features=500)
#         tfidf_matrix = vectorizer.fit_transform(corpus)
        
#         chunk_scores = np.array(tfidf_matrix.mean(axis=1)).flatten()
#         top_indices = chunk_scores.argsort()[-top_k:][::-1]
        
#         return [all_chunks[i] for i in top_indices]

#     def _semantic_selection(self, all_chunks: List[Document], top_k: int) -> List[Document]:        
#         if len(all_chunks) <= top_k:
#             return all_chunks
        
#         selected = [all_chunks[0]]
#         remaining = all_chunks[1:]
        
#         while len(selected) < top_k and remaining:
#             best_chunk = None
#             max_min_similarity = -1
            
#             for chunk in remaining:
#                 min_similarity = self._calculate_min_similarity(chunk, selected)
#                 if min_similarity > max_min_similarity:
#                     max_min_similarity = min_similarity
#                     best_chunk = chunk
            
#             if best_chunk:
#                 selected.append(best_chunk)
#                 remaining.remove(best_chunk)
        
#         return selected

#     def _calculate_min_similarity(self, target_chunk: Document, chunk_list: List[Document]) -> float:
#         target_embedding = self.embeddings.embed_query(target_chunk.page_content)
#         min_similarity = float('inf')
        
#         for chunk in chunk_list:
#             chunk_embedding = self.embeddings.embed_query(chunk.page_content)
#             similarity = np.dot(target_embedding, chunk_embedding) / (
#                 np.linalg.norm(target_embedding) * np.linalg.norm(chunk_embedding)
#             )
#             min_similarity = min(min_similarity, similarity)
        
#         return min_similarity if min_similarity != float('inf') else 0.0

#     def _expand_block_context(self, key_block: Document, all_chunks: List[Document]) -> str:
#         query = key_block.page_content
#         k = min(3, len(all_chunks) - 1)
#         retrieved_docs = self.db.similarity_search(query, k=k)
        
#         context_parts = [key_block.page_content]
#         for doc in retrieved_docs:
#             if doc.page_content != key_block.page_content:
#                 context_parts.append(doc.page_content)
        
#         return "\n\n---\n\n".join(context_parts)

#     def _generate_question_from_context(self, context: str) -> Optional[QuizItem]:
#         """Ask the LLM for one open-ended question based only on ``context``.
        
#         The (Vietnamese) prompt instructs the model to reply with a single JSON
#         object holding ``question_text``, ``model_generated_answer`` and
#         ``justification_quote``.
        
#         Args:
#             context: Source text the question must be derived from.
        
#         Returns:
#             The parsed JSON object when it contains all three expected keys,
#             otherwise ``None``. Any exception raised while calling the model,
#             cleaning the reply or parsing it is printed and also yields ``None``.
#         """
#         # f-string: the doubled braces below render as literal { and }.
#         prompt = f"""
#         Bạn là một AI chuyên tạo câu hỏi. Chỉ dựa DUY NHẤT vào Ngữ cảnh được cung cấp, hãy tạo một câu hỏi TỰ LUẬN.
#         Yêu cầu đầu ra:
#         CHỈ trả về một đối tượng JSON DUY NHẤT. KHÔNG GIẢI THÍCH.
#         Cấu trúc JSON phải là:
#         {{
#             "question_text": "string",
#             "model_generated_answer": "string",
#             "justification_quote": "string"
#         }}
#         Ngữ cảnh: --- {context} ---
#         Đối tượng JSON của bạn:
#         """
#         try:
#             # NOTE(review): assumes self.llm.invoke returns a plain str, since the
#             # result is handed to a regex-based cleaner -- confirm against the LLM wrapper.
#             response_text = self.llm.invoke(prompt)
            
#             # Fix for JSON parsing errors: clean the response before parsing
#             cleaned_response = self._clean_json_response(response_text)
            
#             # Parse JSON
#             data = json.loads(cleaned_response)
#             # Accept the result only if every expected key is present.
#             if all(key in data for key in ["question_text", "model_generated_answer", "justification_quote"]):
#                 return data
#             return None
#         except Exception as e:
#             # Broad catch: every failure is reported and turned into None.
#             print(f"Lỗi tạo câu hỏi: {e}")
#             return None

#     def _clean_json_response(self, response_text: str) -> str:
#         """Extract the JSON-looking part of an LLM reply and normalise it for parsing.
        
#         Args:
#             response_text: Raw text returned by the model.
        
#         Returns:
#             The substring from the first ``{`` to the last ``}`` after the
#             regex substitutions below have been applied.
        
#         Raises:
#             ValueError: If the reply contains no ``{...}`` span.
#         """
#         # Greedy match with DOTALL: spans from the first '{' to the last '}',
#         # across newlines.
#         json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
#         if not json_match:
#             raise ValueError("Không tìm thấy JSON trong response")
        
#         json_str = json_match.group(0)
        
#         # Drop control characters (U+0000-U+001F and U+007F-U+009F); this also
#         # removes real newline and tab characters.
#         json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', json_str)  
#         # Turn a backslash-escaped single quote into a bare single quote.
#         json_str = re.sub(r'\\\'', "'", json_str) 
#         # Turn a backslash-escaped double quote into a bare double quote.
#         # NOTE(review): escaped quotes inside a JSON string value are valid JSON;
#         # unescaping them can terminate the string early and make json.loads fail.
#         json_str = re.sub(r'\\"', '"', json_str)  
#         # Replace the two-character sequence backslash + 'n' with a space.
#         json_str = re.sub(r'\\n', ' ', json_str)  
        
#         # Make sure keys are quoted in the expected format
#         # NOTE(review): the pattern matches any run of word characters directly
#         # followed by ':' anywhere in the text, so it also rewrites string values
#         # (e.g. a value containing "Note:" or "http://"). Keys that are already
#         # quoted are untouched because a '"' precedes their ':'.
#         json_str = re.sub(r'(\w+):', r'"\1":', json_str)  # Replace key: with "key":
        
#         return json_str

#     def _validate_question_quality(self, question_item: QuizItem) -> bool:
#         """Apply simple heuristics to reject empty, too-short or refusal-like questions.
        
#         Args:
#             question_item: Mapping with at least ``question_text`` and
#                 ``model_generated_answer``; a falsy value is rejected.
        
#         Returns:
#             ``False`` if the item is falsy, the question has fewer than 10
#             characters, the answer has fewer than 15 characters, the question
#             looks like a refusal, or the question contains no ``?``;
#             ``True`` otherwise.
#         """
#         if not question_item:
#             return False
            
#         q_text = question_item['question_text']
#         answer = question_item['model_generated_answer']
        
#         # NOTE(review): the 'i cannot' substring test is redundant because the
#         # 'cannot' test already covers it. The refusal phrases are English only,
#         # while the prompts in this file are Vietnamese -- confirm this is intended.
#         if (len(q_text) < 10 or len(answer) < 15 or
#             q_text.lower().startswith(('sorry', 'i cannot', 'i am unable')) or
#             'cannot' in q_text.lower() or
#             'i cannot' in q_text.lower() or
#             '?' not in q_text):
#             return False
            
#         return True

#     def _remove_duplicate_questions(self, questions: List[QuizItem]) -> List[QuizItem]:
#         """Drop questions whose text repeats an earlier one, keeping first occurrences.
        
#         Args:
#             questions: Quiz items, each with a ``question_text`` entry.
        
#         Returns:
#             A new list in the original order with duplicates removed. The number
#             of removed items is printed when it is greater than zero.
#         """
#         seen_questions = set()
#         unique_questions = []
        
#         for q in questions:
#             # Only the first 100 characters are compared, via their hash, so
#             # questions that differ only after that point count as duplicates.
#             question_hash = hash(q['question_text'][:100])
#             if question_hash not in seen_questions:
#                 seen_questions.add(question_hash)
#                 unique_questions.append(q)
        
#         removed_count = len(questions) - len(unique_questions)
#         if removed_count > 0:
#             print(f"Loại bỏ {removed_count} câu hỏi trùng lặp")
        
#         return unique_questions

#     def _load_quiz_data(self) -> List[QuizItem]:
#         """Read and return the JSON content of ``self.quiz_data_path`` (UTF-8).
        
#         No exceptions are handled here, so a missing file or invalid JSON
#         propagates to the caller.
#         """
#         with open(self.quiz_data_path, 'r', encoding='utf-8') as f:
#             return json.load(f)

#     def _get_feedback_for_answer(self, question: str, user_answer: str) -> Dict:
#         """Have the LLM grade an answer against context retrieved for the question.
        
#         Args:
#             question: Question text; also used as the vector-store search query.
#             user_answer: Answer to be evaluated.
        
#         Returns:
#             The JSON object parsed from the model's reply (the prompt asks for
#             ``is_correct``, ``feedback_text`` and ``evidence``), or a fixed
#             fallback dict with ``is_correct`` set to ``False`` when parsing
#             raises ``json.JSONDecodeError`` or ``IndexError``.
#         """
#         # Retrieve the 3 most similar chunks and join them into one context string.
#         retrieved_docs = self.db.similarity_search(question, k=3) 
#         context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
#         # f-string: the doubled braces below render as literal { and }.
#         prompt = f"""
#         Bạn là một giảng viên giỏi. Nhiệm vụ của bạn là đánh giá câu trả lời của học sinh dựa trên ngữ cảnh được cung cấp từ tài liệu gốc.
#         Thông tin:
#         1. Câu hỏi: "{question}"
#         2. Câu trả lời của người dùng: "{user_answer}"
#         3. Ngữ cảnh từ tài liệu: 
#            ---
#            {context}
#            ---
#         Yêu cầu: 
#         Dựa VÀO NGỮ CẢNH TRÊN, hãy đánh giá câu trả lời của học sinh. Trả về một đối tượng JSON DUY NHẤT có cấu trúc:
#         {{
#             "is_correct": "boolean",
#             "feedback_text": "string",
#             "evidence": "string"
#         }}
#         Đối tượng JSON của bạn:
#         """
#         # The model call sits outside the try block, so its errors propagate.
#         response = self.llm.invoke(prompt)
#         try:
#             cleaned_response = self._clean_json_response(response)
#             return json.loads(cleaned_response)
#         # NOTE(review): _clean_json_response raises a plain ValueError when no
#         # {...} span is found; that is not a JSONDecodeError, so it is not
#         # caught here and will propagate instead of returning the fallback.
#         except (json.JSONDecodeError, IndexError):
#             return {"is_correct": False, "feedback_text": "Lỗi khi đánh giá.", "evidence": "Không có"}

#     def start_interactive_session(self):
#         """Walk through every quiz item, printing it together with LLM feedback.
        
#         Prints a message and returns immediately when ``self.quiz_data`` is
#         empty. Between items it waits for the user to press Enter.
#         """
#         if not self.quiz_data:
#             print("Không có câu hỏi nào để thực hiện quiz")
#             return
            
#         print("Bắt đầu phiên hỏi đáp")
        
#         for i, quiz_item in enumerate(self.quiz_data):
#             print("-" * 50)
#             print(f"Câu hỏi {i+1}/{len(self.quiz_data)}:")
#             print(f"  {quiz_item['question_text']}")
#             print(f"\nĐáp án mẫu: {quiz_item['model_generated_answer']}")
#             print(f"\nTrích dẫn: {quiz_item['justification_quote']}")
            
#             print("\nPhản hồi mẫu:")
#             # The model-generated answer is passed as the "user answer", so this
#             # grades the sample answer; no answer is read from the user.
#             feedback = self._get_feedback_for_answer(quiz_item['question_text'], quiz_item['model_generated_answer'])
#             # Truthiness test: a non-empty string value (even "false") would
#             # count as correct. NOTE(review): confirm the parsed type of is_correct.
#             if feedback.get('is_correct'):
#                 print("Chính xác.")
#             else:
#                 print("Chưa chính xác.")
#             print(f"  Nhận xét: {feedback.get('feedback_text', '...')}")
#             print(f"  Bằng chứng: \"{feedback.get('evidence', '...')}\"")
#             print("-" * 50)
            
#             # Pause between questions, but not after the last one.
#             if i < len(self.quiz_data) - 1:
#                 print("\nNhấn Enter để tiếp tục...")
#                 try:
#                     input()
#                 # NOTE(review): bare except also swallows KeyboardInterrupt and
#                 # SystemExit, so the loop cannot be interrupted at this prompt.
#                 except:
#                     print("Tiếp tục...")
#                     print()

#     def display_questions_only(self):
#         """Display only the questions and answers (for use in a notebook)."""
#         if not self.quiz_data:
#             print("Không có câu hỏi nào")
#             return
            
#         print("=== DANH SÁCH CÂU HỎI ===")
#         for i, quiz_item in enumerate(self.quiz_data):
#             print(f"\n{i+1}. {quiz_item['question_text']}")
#             print(f"   Đáp án: {quiz_item['model_generated_answer']}")
#             # Shows at most the first 100 characters of the quote; the "..." is
#             # appended even when the quote is shorter than that.
#             print(f"   Nguồn: {quiz_item['justification_quote'][:100]}...")


# Script entry point: build a quiz bot for one PDF and list the generated questions.
# if __name__ == "__main__":
#     try:
#         # Hard-coded absolute Kaggle input path; adjust when running elsewhere.
#         PDF_FILE_PATH = "/kaggle/input/sample-pdf/Team_UIT-VibeCoding_signed.pdf" 
        
#         if not os.path.exists(PDF_FILE_PATH):
#             raise FileNotFoundError(f"Không tìm thấy file PDF: {PDF_FILE_PATH}")
        
#         # Derive output names from the PDF file name without its extension.
#         base_name = os.path.splitext(os.path.basename(PDF_FILE_PATH))[0]
#         quiz_data_path = f"quiz_data_{base_name}.json"
#         faiss_db_path = f"faiss_db_{base_name}"
        
#         # Delete any existing quiz file and FAISS directory before building the bot.
#         # NOTE(review): presumably these are the locations KeyB2QuizBot writes to;
#         # its path derivation is not visible here -- confirm they match.
#         if os.path.exists(quiz_data_path):
#             os.remove(quiz_data_path)
#         if os.path.exists(faiss_db_path):
#             shutil.rmtree(faiss_db_path)

#         chatbot = KeyB2QuizBot(pdf_path=PDF_FILE_PATH)
        
#         # Only list questions when both the LLM and the quiz data are available.
#         if chatbot.llm and chatbot.quiz_data:
#             chatbot.display_questions_only()
        
#     except FileNotFoundError as fnf_error:
#         print(fnf_error)
#     except Exception as e:
#         # Any other failure: print the message followed by the full traceback.
#         print(f"Lỗi: {e}")
#         import traceback
#         traceback.print_exc()