In [14]:
!pip install -q sentence-transformers faiss-cpu transformers accelerate tqdm



In [15]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm.auto import tqdm


In [16]:
from google.colab import files

uploaded = files.upload()

filename = next(iter(uploaded))

with open(filename, 'r', encoding='utf-8') as f:
    docs = [line.strip() for line in f if line.strip()]

print("Loaded documents:", len(docs))
docs[:5]   # show first 5 documents


Saving docomants.txt to docomants (1).txt
Loaded documents: 150


['On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.',
 'Unlike dogs, cats do not have a sweet tooth. Scientists believe this is due to a mutation in a key taste receptor.',
 'When a cat chases its prey, it keeps its head level. Dogs and humans bob their heads up and down.',
 'The technical term for a cat’s hairball is a “bezoar.”',
 'A group of cats is called a “clowder.”']

In [17]:
print("Loaded from file:", len(docs))

Loaded from file: 150


In [18]:
def chunk(text, max_chars=300):
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

chunks = []
for d in docs:
    chunks.extend(chunk(d))

print("Chunks created:", len(chunks))


Chunks created: 161


In [19]:
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
print("Embedding model ready")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model ready


In [20]:
embeddings = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
embeddings = embeddings.astype("float32")

print("Embeddings shape:", embeddings.shape)


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Embeddings shape: (161, 768)


In [21]:
faiss.normalize_L2(embeddings)

dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print("FAISS index built with", index.ntotal, "vectors")


FAISS index built with 161 vectors


In [22]:
def retrieve(query, top_k=3):
    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)

    scores, ids = index.search(q_emb, top_k)
    results = [(chunks[i], float(scores[0][j])) for j, i in enumerate(ids[0])]
    return results

# Example
retrieve("How long do cats sleep?")


[('On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.',
  0.7197109460830688),
 ('Cats sleep 16 to 18 hours per day. When cats are asleep, they are still alert to incoming stimuli. If you poke the tail of a sleeping cat, it will respond accordingly.',
  0.6891071796417236),
 ('One reason that kittens sleep so much is because a growth hormone is released only during sleep.',
  0.5747289657592773)]

In [23]:
model_name = "google/flan-t5-small"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [25]:

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)


In [26]:
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if device=="cuda" else -1)

print("Generator ready on", device)

Device set to use cuda:0


Generator ready on cuda


In [27]:
def build_prompt(query, retrieved):
    context = "\n".join([f"- {c}" for c, s in retrieved])
    return f"""
Use ONLY the context to answer. If not found, say 'I don't know'.

Context:
{context}

Question: {query}
Answer:
""".strip()

def ask_rag(query, top_k=3):
    retrieved = retrieve(query, top_k)
    prompt = build_prompt(query, retrieved)
    answer = generator(prompt, max_length=120, do_sample=False)[0]["generated_text"]

    return {
        "query": query,
        "retrieved": [c for c, s in retrieved],
        "answer": answer
    }

# Test it
result = ask_rag("How fast can a cat run?")
result


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


{'query': 'How fast can a cat run?',
 'retrieved': ['A cat can travel at a top speed of approximately 31 mph (49 km) over a short distance.',
  'A cat’s heart beats nearly twice as fast as a human heart, at 110 to 140 beats a minute.',
  'A cat can jump even seven times as high as it is tall.'],
 'answer': '110 to 140 beats a minute'}

In [28]:
ask_rag("What do cats use whiskers for?")


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


{'query': 'What do cats use whiskers for?',
 'retrieved': ['A cat usually has about 12 whiskers on each side of its face.',
  'The little tufts of hair in a cat’s ear that help keep out dirt direct sounds into the ear, and insulate the ears are called “ear furnishings.”',
  'The technical term for a cat’s hairball is a “bezoar.”'],
 'answer': 'The little tufts of hair in a cat’s ear that help keep out dirt direct sounds into the ear, and insulate the ears are called “ear furnishings.”'}

In [29]:
ask_rag("WHY SOME CATS ARE BLACK AND SOME WHITE?")


Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


{'query': 'WHY SOME CATS ARE BLACK AND SOME WHITE?',
 'retrieved': ['Siamese kittens are born white because of the heat inside the mother’s uterus before birth. This heat keeps the kittens’ hair from darkening on the points.',
  'In the 1930s, two Russian biologists discovered that color change in Siamese kittens depend on their body temperature. Siamese cats carry albino genes that work only when the body temperature is above 98° F. If these kittens are left in a very warm room, their points won’t darken and they will stay ',
  'The color of the points in Siamese cats is heat related. Cool areas are darker.'],
 'answer': "I don't know"}