In [3]:
# Colab-only: mount Google Drive at /content/drive so files stored in Drive
# can be read through the normal filesystem API.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel (a `!` command runs in a subshell whose
# `pip` may belong to a different interpreter).
# TODO: pin versions (e.g. pkg==x.y.z) so the notebook stays reproducible.
%pip install faiss-cpu
%pip install sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.met

In [5]:
import os
import faiss
import numpy as np
import pickle
import torch
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration


def search_index(query, index, chunks, embedder, k=3):
    """
    Search a FAISS index for the top-k chunks most similar to ``query``.

    Args:
        query: Query text; embedded via ``embedder.encode([query])``.
        index: Object exposing ``search(vectors, k)`` returning a
            ``(distances, ids)`` pair of 2-D arrays (one row per query).
        chunks: Sequence of chunks where position ``i`` corresponds to
            id ``i`` in ``index``.
        embedder: Object exposing ``encode(list_of_texts)``.
        k: Number of neighbours to request from the index.

    Returns:
        ``(retrieved_chunks, distances)``: the matching chunks in the order
        returned by the index, and a 1-D array with the distance of each
        returned chunk (same length as ``retrieved_chunks``).
    """
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query_vec = np.ascontiguousarray(embedder.encode([query]), dtype='float32')
    distances, ids = index.search(query_vec, k)

    ids_row = np.asarray(ids)[0]
    dist_row = np.asarray(distances)[0]

    # When the index holds fewer than k vectors FAISS fills the missing slots
    # with id -1. Indexing `chunks[-1]` would silently return the *last* chunk
    # as if it were a hit, so drop those slots (and any id that falls outside
    # `chunks`, e.g. when the index and the chunk list are out of sync).
    valid = (ids_row >= 0) & (ids_row < len(chunks))

    retrieved_chunks = [chunks[i] for i in ids_row[valid]]
    return retrieved_chunks, dist_row[valid]


def main(index_dir='/content/drive/MyDrive/LLM/data/index',
         query="What is the capital of Bangladesh?",
         k=3):
    """
    Run one retrieval-augmented question-answering pass and print the result.

    Steps: load the FAISS index and pickled chunks from ``index_dir``, load
    the sentence-embedding model and the T5 generator, retrieve the ``k``
    chunks closest to ``query``, build a ``question: ... context: ...``
    prompt and print the generated answer. Every stage reports its own error
    and aborts the run instead of raising.

    Args:
        index_dir: Folder containing ``faiss.index`` and ``chunks.pkl``.
        query: Question to answer.
        k: Number of chunks to retrieve as context.

    Returns:
        None. All results are printed.
    """
    # Device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load FAISS index and chunks
    print("Loading FAISS index and chunks...")
    try:
        index = faiss.read_index(os.path.join(index_dir, 'faiss.index'))
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load chunk files that you produced yourself.
        with open(os.path.join(index_dir, 'chunks.pkl'), 'rb') as f:
            chunks = pickle.load(f)
        print(f"Loaded FAISS index and {len(chunks)} text chunks.")
    except Exception as e:
        print("Error loading index or chunks:", str(e))
        return

    # Load embedding model
    print("Loading embedding model (Sentence-BERT)...")
    try:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
        embedder = embedder.to(device)
        print("Embedding model loaded successfully.")
    except Exception as e:
        print("Error loading embedder:", str(e))
        return

    # Load generator model (T5-small for lightweight generation)
    print("Loading T5 generator model...")
    try:
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
        model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
        print("Generator model loaded successfully.")
    except Exception as e:
        print("Error loading generator model:", str(e))
        return

    print(f"\nQuery: {query}")

    # Retrieve relevant chunks
    try:
        retrieved_chunks, distances = search_index(query, index, chunks, embedder, k=k)
    except Exception as e:
        print("Error during retrieval:", str(e))
        return

    if len(retrieved_chunks) == 0:
        print("No documents retrieved.")
        return

    print(f"\nRetrieved {len(retrieved_chunks)} chunk(s):")
    for i, chunk in enumerate(retrieved_chunks):
        # Show a single-line preview of at most 200 characters per chunk.
        safe_chunk = chunk[:200].replace('\n', ' ').replace('\r', ' ').strip()
        print(f"  [{i+1}] {safe_chunk}...")

    # Construct input prompt: "question: {query} context: {chunk1} {chunk2} ..."
    context = " ".join(retrieved_chunks)
    input_text = f"question: {query} context: {context}"

    print(f"\nInput to generator: {input_text[:300]}...")

    # Tokenize and generate
    try:
        encoded = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=1024,
            truncation=True,
        ).to(device)

        with torch.no_grad():
            # Deterministic beam search. `temperature` was dropped: it only
            # applies when sampling (do_sample=True) and was being ignored
            # with a warning. The attention mask is passed explicitly rather
            # than left for generate() to infer.
            output_ids = model.generate(
                input_ids=encoded.input_ids,
                attention_mask=encoded.attention_mask,
                max_length=150,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2,
            )

        answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"\nAnswer: {answer}")
        print(f"{'-' * 60}")

    except Exception as e:
        print("Error during generation:", str(e))
        import traceback
        traceback.print_exc()

# Entry point: run the demo when this cell/script is executed directly,
# but not when the code is imported as a module.
if __name__ == "__main__":
    main()

Using device: cuda
Loading FAISS index and chunks...
Loaded FAISS index and 101 text chunks.
Loading embedding model (Sentence-BERT)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model loaded successfully.
Loading T5 generator model...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generator model loaded successfully.

Query: What is the capital of Bangladesh?


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Retrieved 3 chunk(s):
  [1] Bangladesh,a officially the Peoples Republic of Bangladesh,b is a country in South Asia. It is the eighth-most populous country in the world and among the most densely populated with a population of o...
  [2] and largest city, is the nations political, financial, and cultural centre. Chittagong is the second-largest city and the busiest port of the country. The territory of modern Bangladesh was a strongho...
  [3] Bawms, Tripuris, Khasis, Khumis, Kukis, Garos, and Bisnupriya Manipuris. The Chittagong Hill Tracts region experienced unrest and an insurgency from 1975 to 1997 in an autonomy movement by its indigen...

Input to generator: question: What is the capital of Bangladesh? context: Bangladesh,a officially the Peoples Republic of Bangladesh,b is a country in South Asia. It is the eighth-most populous country in the world and among the most densely populated with a population of over 171 million within an area of 148,460 squa...

Answer: Dhaka
-------