In [1]:
from google.colab import drive

In [2]:
drive.mount("drive", force_remount=True)

Mounted at drive


In [None]:
%pip install sentencepiece tiktoken indic-nlp-library tools fitz langdetect faiss-cpu PyMuPDF

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting tools
  Downloading tools-0.1.9.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-no

In [None]:
import pymupdf
import os
import re
from langdetect import detect
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from transformers import AutoModel, AutoTokenizer
from nltk.tokenize import sent_tokenize
import nltk
from indicnlp.tokenize import indic_tokenize
import torch
import sentencepiece as spm

In [None]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
dimension = 384
index = faiss.IndexFlatL2(dimension)

In [None]:
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = pymupdf.open(pdf_path)
    for page in doc:
        text = text + page.get_text("text")+"\n"
    return text.strip()

In [None]:
def extract_text_from_folder(path):
    all_text = ""
    pdf_files = sorted([f for f in os.listdir(path) if f.endswith(".pdf")])
    for pdf_file in pdf_files:
        pdf_path = os.path.join(path,pdf_file)
        all_text = all_text + extract_text_from_pdf(pdf_path) + "\n\n"
    return all_text.strip()

In [None]:
def preprocess_text(text):
    text = re.sub(r'\n+', " ", text)
    text = re.sub(r'\s+'," ", text)
    language = detect(text)
    chunks = sent_tokenize(text)
    return chunks, language

In [None]:
def get_embeddings(chunks,model):
  embeddings = model.encode(chunks, convert_to_tensor=True)
  return embeddings.cpu().numpy()

In [None]:
def store_embeddings(book_name,texts,embeddings,output_dir):
    os.makedirs(output_dir, exist_ok=True)
    vectors = np.array(embeddings).astype("float32")
    global index
    if not isinstance(index, faiss.Index):
        raise ValueError("FAISS INDEX IS NOT INITIALISED PROPERLY")
    index.add(vectors)
    metadata_file = os.path.join(output_dir, f"{book_name}_metadata_file.json")
    with open(metadata_file,"w") as f:
        json.dump(texts, f)
    faiss.write_index(index,os.path.join(output_dir, f"{book_name}_faiss.index"))
    return metadata_file

In [None]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_english = SentenceTransformer(model_name, cache_folder="/root/.cache/huggingface/")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
qa_pipeline = pipeline(
    "text-generation",
    model="gpt2",
    device_map="auto",
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def retrieve_passages(query, model, metadata_file, book_name):
    index = faiss.read_index(f"/content/drive/MyDrive/Course Work/Sem 4/Data Analysis and Visualization/Major Project 1/data/processed/{book_name}_faiss.index")
    if index.ntotal == 0:
        raise ValueError(f"FAISS index is empty for book '{book_name}'. Add vectors before querying.")
    query_embedding = model.encode([query])[0]
    query_vector = np.array([query_embedding]).astype("float32")
    distances, indices = index.search(query_vector, k=5)
    with open(metadata_file, "r") as f:
        texts = json.load(f)
    if indices is None or len(indices) == 0 or len(indices[0]) == 0:
        raise ValueError("FAISS search returned no results.")
    valid_indices = [idx for idx in indices[0] if idx < len(texts)]
    if not valid_indices:
        raise ValueError("No valid results found in FAISS index.")
    results = [texts[idx] for idx in valid_indices]
    return results

In [None]:
def generate_answer(context, question):
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
    response = qa_pipeline(prompt, max_length = 500, num_return_sequences=1)
    return response[0]["generated_text"].split("Answer:")[-1].strip()

In [None]:
def ingestion_pipeline(book_name,output_dir):
    folder_path = f"/content/drive/MyDrive/Course Work/Sem 4/Data Analysis and Visualization/Major Project 1/data/books/{book_name}"
    extracted_text = extract_text_from_folder(folder_path)
    chunks,language = preprocess_text(extracted_text)
    model = None
    if (language=="en"):
      model = model_english
    embeddings  = get_embeddings(chunks,model)
    metadata_file = store_embeddings(book_name,chunks,embeddings,output_dir)
    return metadata_file

In [None]:
def retrieval_pipeline(query, metadata_file, book_name):
  language = detect(query)
  model = None
  if language=="en":
    model = model_english
  results = retrieve_passages(query,model,metadata_file,book_name)
  return results

In [None]:
def generation_pipeline(results, query):
    answer = generate_answer(results, query)
    return answer

In [None]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def refine_with_summarization(answer):
    return summarizer(answer, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]

In [None]:
def injector_pipeline(book_name,output_dir):
  metadata_file = ingestion_pipeline(book_name,output_dir)
  return metadata_file

In [None]:
def retrievar_pipeline(query,metadata_file,book_name):
  results = retrieval_pipeline(query, metadata_file,book_name)
  answer = generation_pipeline(results,query)
  refined_answer = refine_with_summarization(answer)
  return refined_answer

In [None]:
def main(book_name, query, output_dir):
    metadata_file = ingestion_pipeline(book_name,output_dir)
    results = retrieval_pipeline(query, metadata_file, book_name)
    answer = generation_pipeline(results, query)
    refined_answer = refine_with_summarization(answer)
    return refined_answer

In [None]:
class_num = "class_12"

In [None]:
subjects = ["sociology","psychology","political_science","physics","mathematics","informatics_practices","home_science","history","geography","fine_art","english","economics","computer_science","chemistry","business_studies","biotechnology","biology","accountancy"]

In [None]:
output_dir = "/content/drive/MyDrive/Course Work/Sem 4/Data Analysis and Visualization/Major Project 1/data/processed"

In [None]:
for subject in subjects:
  book_name = f"{class_num}/{subject}"
  injector_pipeline(book_name,output_dir)

In [None]:
## Testing code segment

In [None]:
query = "what is photosynthesis?"

In [None]:
metadata_file = "/content/drive/MyDrive/Course Work/Sem 4/Data Analysis and Visualization/Major Project 1/data/processed/class_10/science_metadata_file.json"

In [None]:
book_name = "class_10/science"

In [None]:
answer = retrievar_pipeline(query,metadata_file,book_name)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Your max_length is set to 150, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)


In [None]:
answer

'The light rays from the sun are not a direct way of reaching plants, and thus they cannot be directly or indirectly sent to plant tissues. Some beetles are already working at a rate of about 10/6 an inch/h of the light rays. Others have to work about 1/15 the distance a day in order to reach the end of the day.'