In [1]:
%mkdir -p pdfs 
!gdown "https://drive.google.com/uc?id=11VmupXJDVQv3k4Pc5McIaaayilLHqBsF" -O pdfs/1.pdf
!gdown "https://drive.google.com/uc?id=11Y1Gxi-EN6OcbOSMxb_vJwxV7neIQPss" -O pdfs/2.pdf
!gdown "https://drive.google.com/uc?id=11aKfjXguCoeprGzPrZUHNgTxzUq4u6V-" -O pdfs/3.pdf

Downloading...
From: https://drive.google.com/uc?id=11VmupXJDVQv3k4Pc5McIaaayilLHqBsF
To: /content/pdfs/1.pdf
100% 1.90M/1.90M [00:00<00:00, 14.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=11Y1Gxi-EN6OcbOSMxb_vJwxV7neIQPss
To: /content/pdfs/2.pdf
100% 1.93M/1.93M [00:00<00:00, 14.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=11aKfjXguCoeprGzPrZUHNgTxzUq4u6V-
To: /content/pdfs/3.pdf
100% 401k/401k [00:00<00:00, 5.57MB/s]


In [None]:
!pip install langchain langchain-community langchain-huggingface langchain-text-splitters
!pip install sentence-transformers faiss-cpu
!pip install pypdf python-docx

Collecting pypdf
  Downloading pypdf-6.4.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.4.0-py3-none-any.whl (329 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.5/329.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.4.0


In [13]:
import re
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import FAISS

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

1. Document Loading

In [14]:
# ---------------------------
# Document Loading
# ---------------------------
def load_document(file_path: str) -> str:
    """Load document based on file extension using LangChain loaders"""
    try:
        if file_path.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif file_path.endswith(".docx"):
            loader = Docx2txtLoader(file_path)
        elif file_path.endswith(".txt"):
            loader = TextLoader(file_path, encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {file_path}")
        
        documents = loader.load()
        text = "\n".join([doc.page_content for doc in documents])
        return re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        print(f"Error loading document: {e}")
        raise


2. Text Chunking

In [15]:
# ---------------------------
# Text Chunking
# ---------------------------
def create_chunks(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    """Split text into chunks using LangChain's RecursiveCharacterTextSplitter"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = text_splitter.split_text(text)
    return chunks

3. Vector Store

In [21]:
class VectorStoreService:
    """Handle embeddings and vector storage using LangChain + FAISS"""
    
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        print(f"Loading embedding model: {model_name}")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.vector_store = None
        print("Embedding model loaded successfully")
    
    def create_vector_store(self, chunks: List[str]):
        """Create FAISS vector store from text chunks"""
        print(f"Creating vector store from {len(chunks)} chunks...")
        self.vector_store = FAISS.from_texts(
            texts=chunks,
            embedding=self.embeddings
        )
        print("Vector store created successfully")
    
    def save_vector_store(self, path: str = "faiss_index"):
        """Save vector store to disk"""
        if self.vector_store:
            self.vector_store.save_local(path)
            print(f"Vector store saved to {path}")
    
    def load_vector_store(self, path: str = "faiss_index"):
        """Load vector store from disk"""
        self.vector_store = FAISS.load_local(
            path, 
            self.embeddings,
            allow_dangerous_deserialization=True
        )
        print(f"Vector store loaded from {path}")
    
    def get_retriever(self, k: int = 5):
        """Get retriever for similarity search"""
        if not self.vector_store:
            raise ValueError("Vector store not initialized")
        return self.vector_store.as_retriever(search_kwargs={"k": k})
    
    def similarity_search(self, query: str, k: int = 5):
        """Direct similarity search returning documents"""
        if not self.vector_store:
            raise ValueError("Vector store not initialized")
        return self.vector_store.similarity_search(query, k=k)

4. RAG Service

In [22]:
class RAGService:
    """RAG service using LangChain chains"""
    
    def __init__(self, vector_store_service: VectorStoreService, model_name: str = "Qwen/Qwen2.5-0.5B-Instruct"):
        self.vector_store_service = vector_store_service
        self.llm = None
        
        try:
            print(f"Loading LLM: {model_name}")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype="auto",
                device_map="auto"
            )
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.15
            )
            
            self.llm = HuggingFacePipeline(pipeline=pipe)
            print("LLM loaded successfully")
        except Exception as e:
            print(f"Warning: Failed to load LLM ({e}). Using mock responses.")
    
    def answer_query(self, query: str, k: int = 5):
        """Answer a query using RAG - simplified without chains"""
        # Use direct similarity search instead of retriever
        docs = self.vector_store_service.similarity_search(query, k=k)
        
        # Combine context from retrieved documents
        context = "\n\n".join([doc.page_content for doc in docs])
        
        if self.llm:
            try:
                # Create prompt
                prompt = f"""Use the following context to answer the question. If you don't know the answer, say so.

Context: {context}

Question: {query}

Answer:"""
                
                # Generate answer
                answer = self.llm.invoke(prompt)
                
                return {
                    "answer": answer,
                    "source_documents": docs,
                    "context": context
                }
            except Exception as e:
                print(f"Error during query: {e}")
                return self._mock_answer(query, k, docs, context)
        else:
            return self._mock_answer(query, k, docs, context)
    
    def _mock_answer(self, query: str, k: int, docs=None, context=None):
        """Mock answer when LLM is not available"""
        if docs is None or context is None:
            docs = self.vector_store_service.similarity_search(query, k=k)
            context = "\n\n".join([doc.page_content for doc in docs])
        
        return {
            "answer": f"[Mock Mode] Based on the context, here are the relevant sections for your query: '{query}'",
            "source_documents": docs,
            "context": context
        }

5. Main function

In [None]:

# ---------------------------
# Main function
# ---------------------------
def main():
    # Configuration
    file_path = "pdfs/1.pdf"
    chunk_size = 500
    chunk_overlap = 50
    top_k = 5
    
    # Step 1: Load document
    print("\n=== Loading Document ===")
    text = load_document(file_path)
    print(f"Loaded text (first 200 chars): {text[:200]}...")
    
    # Step 2: Create chunks
    print("\n=== Creating Chunks ===")
    chunks = create_chunks(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    print(f"Created {len(chunks)} chunks")
    print(f"Sample chunk: {chunks[0][:200]}...")
    
    # Step 3: Create vector store
    print("\n=== Creating Vector Store ===")
    vector_service = VectorStoreService()
    vector_service.create_vector_store(chunks)
    
    # Optional: Save vector store for later use
    # vector_service.save_vector_store("faiss_index")
    
    # Step 4: Initialize RAG service
    print("\n=== Initializing RAG Service ===")
    rag_service = RAGService(vector_service)
    
    # Step 5: Test queries
    print("\n=== Testing Queries ===")
    test_queries = [
        "What are new technologies mentioned in the course?",
        "What databases are mentioned?",
        "What is the main objective of the course?"
    ]
    
    for query in test_queries:
        print(f"\n--- Query: {query} ---")
        result = rag_service.answer_query(query, k=top_k)
        print(f"Answer: {result['answer']}")
        print(f"\nRelevant sources ({len(result['source_documents'])} chunks):")
        for i, doc in enumerate(result['source_documents'][:2]):  
            print(f"  [{i+1}] {doc.page_content[:150]}...")

if __name__ == "__main__":
    main()


=== Loading Document ===
Loaded text (first 200 chars): New Technologies in Software Engineering Introduction Agenda Introduction Objectives & Course Plan New technologies overview Evaluation Introduction Q&A Objectives & Course Plan Objectives 1. Introduc...

=== Creating Chunks ===
Created 40 chunks
Sample chunk: New Technologies in Software Engineering Introduction Agenda Introduction Objectives & Course Plan New technologies overview Evaluation Introduction Q&A Objectives & Course Plan Objectives 1. Introduc...

=== Creating Vector Store ===
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
Embedding model loaded successfully
Creating vector store from 40 chunks...
Vector store created successfully

=== Initializing RAG Service ===
Loading LLM: Qwen/Qwen2.5-0.5B-Instruct


Device set to use cuda:0


LLM loaded successfully

=== Testing Queries ===

--- Query: What are new technologies mentioned in the course? ---
Answer: Use the following context to answer the question. If you don't know the answer, say so.

Context: New Technologies in Software Engineering Introduction Agenda Introduction Objectives & Course Plan New technologies overview Evaluation Introduction Q&A Objectives & Course Plan Objectives 1. Introduce students to emerging technologies in software engineering with a focus on web, mobile, and AI-driven development. 2. Develop an understanding of modern tools and practices that are reshaping the software industry. 3. Integrate new technologies into their software development lifecycle. 4

Advantage and Innovation Why do we need new technologies? Web Frontend Technologies Single-Page Applications (SPAs) Angular, React, Vue.js Progressive Web Apps (PWAs) Service workers, offline capabilities, and push notifications WebAssembly A low-level, binary format that runs at near-