In [1]:
#STEP 1 — Install Dependencies
!pip install pypdf2 sentence-transformers transformers torch

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading sa

In [2]:
#STEP 2 — Upload / Load the PDF
file_path = "/workspaces/GenAI_Assessment/NEP_Final_English_0.pdf"  # << replace with your file

In [3]:
#STEP 3 — Extract Text
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        text = page.extract_text()
        if text:
            full_text += text + "\n"
    return full_text
document_text = extract_text_from_pdf(file_path)
len(document_text), document_text[:500]

(300364,
 '1 \n \n \nY  \n \n \n \n \nNational  Education   \nPolicy  2020  \n \n \n \nMinistry  of Human  \nResource  Development  \n \nGovernment  of India  \n \n\n1 \n  \nChapter   Contents  Page  \nNo \n Introduction  3 \n PART   I.  SCHOOL  EDUCATION  \n1  \nEarly  Childhood  Care  and Education:  The Foundation  of Learning   7 \n2 Foundational  Literacy  and Numeracy:  An Urgent  & Necessary  \nPrerequisite  to Learning  8 \n3 Curtailing  Dropout  Rates  and Ensuring  Universal  Access  to Education  at \nAll Levels   10 \n4 Curr')

In [4]:
#STEP 4 — Chunk the Text
def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks
chunks = chunk_text(document_text)
len(chunks)

376

In [5]:
#STEP 5 — Load Models (Embedding + Generator)
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
embedder = SentenceTransformer("all-MiniLM-L6-v2")
generator = pipeline("text2text-generation", model="google/flan-t5-base")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [6]:
#STEP 6 — Create Embeddings for All Chunks
chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True, show_progress_bar=True)
chunk_embeddings.shape

Batches: 100%|██████████| 12/12 [00:31<00:00,  2.61s/it]


torch.Size([376, 384])

In [7]:
 #STEP 7 — Retrieval Function
import torch
def retrieve_top_chunks(query, embeddings, chunks, model, top_k=3):
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(scores, k=top_k)
    output = []
    for idx, score in zip(top_results.indices.tolist(), top_results.values.tolist()):
        output.append({
            "index": idx,
            "score": float(score),
            "text": chunks[idx]
        })
    return output

In [8]:
#STEP 8 — Generate an Answer Using Retrieved Context (RAG)
def answer_question(query, top_k=3):
    retrieved = retrieve_top_chunks(query, chunk_embeddings, chunks, embedder, top_k)
    context = "\n\n".join([f"[source {r['index']}] {r['text'][:800]}" for r in retrieved])
    prompt = (
        "Use ONLY the following context to answer the question. "
        "If answer not found, say 'not in document'.\n\n"
        f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    )
    output = generator(prompt, max_length=250, do_sample=False)[0]["generated_text"]
    return output, retrieved

In [11]:
#STEP 9 — Ask a Question
query = "What are the New and Forward-looking Vision for India’s Higher Education System?"
answer, sources = answer_question(query)
print("ANSWER:\n", answer)
print("\nSOURCES USED:")
for s in sources:
    print(f"Chunk {s['index']} (score={s['score']:.3f})")

Both `max_new_tokens` (=256) and `max_length`(=250) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ANSWER:
 a new conceptual perception/understanding for what constitutes a higher education institution (HEI), i.e., [source 190] pier, cohesive, cultured, productive, innovativ e, progressive, and prosperous nation

SOURCES USED:
Chunk 187 (score=0.757)
Chunk 195 (score=0.742)
Chunk 190 (score=0.739)


In [10]:
#STEP 10 — Summarize Entire Document
summary = generator(document_text[:8000], max_length=500, min_length=150, do_sample=False)
print(summary[0]['generated_text'])

Both `max_new_tokens` (=256) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


National Education Policy 2020 Ministry of Human Resource Development Government of India 1 Chapter Contents Page No Introduction 3 PART I. SCHOOL EDUCATION 1 Early Childhood Care and Education: The Foundation of Learning 7 2 Foundational Literacy and Numeracy: An Urgent & Necessary Prerequisite to Learning 8 3 Curtailing Dropout Rates and Ensuring Universal Access to Education at All Levels 10 4 Curriculum and Pedagogy in Schools: Learning Should be Holistic, Integrated, Enjoyable and Engaging 11 5 Teachers 20 6 Equitable and Inclusive Educa 4tion: Learning for All 24 7 Efficient Resourcing and Effective Governance through School Complexes/Clusters 28 8 Standard -setting and Accreditation for School Education 30 PART II. HIGHER EDUCATION 9 Quality Universities and Colle ges: A New and Forward -looking Vision for India’s Higher Education System 33 10 Institutional Restructuring and Consolidation 34 11 Towards a More Holistic and Multidisciplinary Education 36 12 Optimal Learning Enviro