In [None]:
pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
import os
import json
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Files to include: short source tag -> path of the plain-text guideline file.
# The tag is stored as each passage's "source"; a tag containing "spanish"
# routes the file's chunks to the Spanish index.
files = {
    "english_ada": "ada_diabetes_guidelines.txt",
    "english_cdc": "cdc_diabetes_guidelines.txt",
    "english_who": "who_diabetes_guidelines.txt",
    "english_risk_factors": "diabetes_risk_factors.txt",
    "spanish_ndep": "spanish_guidelines_ndep.txt",
    # The ".txt" extension was missing here, so this source was reported as
    # "File not found" and silently left out of the Spanish index.
    "spanish_who": "spanish_guidelines_who.txt",
    "spanish_cdc": "spanish_guidelines_cdc.txt"
}

# Chunking function
def chunk_text(text, max_words=150, overlap=30):
    """Split ``text`` into overlapping chunks of at most ``max_words`` words.

    The text is split into paragraphs on blank lines and each paragraph into
    whitespace-separated words. Words flow across paragraph boundaries, so a
    chunk may contain the end of one paragraph and the start of the next.
    Every full chunk hands its last ``overlap`` words to the next chunk.

    Args:
        text: The raw document text.
        max_words: Maximum number of words per chunk (must be >= 1).
        overlap: Number of trailing words repeated at the start of the next
            chunk. Must satisfy ``0 <= overlap < max_words``.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``max_words``; such values could never make progress.
    """
    if not 0 <= overlap < max_words:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_words "
            f"(got overlap={overlap}, max_words={max_words})"
        )

    chunks = []
    current_chunk = []
    # True only while current_chunk holds words that no emitted chunk contains.
    has_new_words = False

    for para in re.split(r'\n{2,}', text):
        words = para.split()

        while words:
            # Always > 0: after a flush at most `overlap` (< max_words) words remain.
            space_left = max_words - len(current_chunk)
            current_chunk.extend(words[:space_left])
            words = words[space_left:]
            has_new_words = True

            if len(current_chunk) >= max_words:
                chunks.append(" ".join(current_chunk))
                # `lst[-0:]` is the whole list, so a zero overlap needs its own branch.
                current_chunk = current_chunk[-overlap:] if overlap else []
                has_new_words = False

    # Emit the remainder only if it adds something beyond the carried-over overlap;
    # otherwise it would just duplicate the tail of the previous chunk.
    if current_chunk and has_new_words:
        chunks.append(" ".join(current_chunk))

    return chunks

# Prepare separate lists. The *_chunks lists hold passage records (text, lang,
# source) and the *_texts lists hold the bare strings to embed; both are
# appended in lockstep, so position i refers to the same passage in each.
english_chunks, spanish_chunks = [], []
english_texts, spanish_texts = [], []

# Process each file: read it, chunk it, and route the chunks by language.
for name, path in files.items():
    if not os.path.exists(path):
        # Best-effort: a missing source is reported and skipped, not fatal.
        print(f"File not found: {path}")
        continue

    print(f"Reading and chunking: {name}")
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    chunks = chunk_text(text)
    # The language is derived from the source tag, not from the file contents.
    lang = "es" if "spanish" in name else "en"
    for chunk in chunks:
        chunk_entry = {"text": chunk, "lang": lang, "source": name}
        if lang == "es":
            spanish_chunks.append(chunk_entry)
            spanish_texts.append(chunk)
        else:
            english_chunks.append(chunk_entry)
            english_texts.append(chunk)

print(f"English chunks: {len(english_chunks)}")
print(f"Spanish chunks: {len(spanish_chunks)}")


def build_and_save_index(model, texts, passages, index_path, passages_path):
    """Embed ``texts``, store them in a flat L2 FAISS index, and persist the results.

    Args:
        model: Encoder exposing ``encode(texts, show_progress_bar=..., convert_to_numpy=...)``.
        texts: Passage strings to embed, in the same order as ``passages``.
        passages: Passage records written to ``passages_path`` as JSON.
        index_path: Destination file for the FAISS index.
        passages_path: Destination file for the passage JSON (UTF-8, non-ASCII kept as-is).

    Returns:
        A ``(embeddings, index)`` tuple: the array returned by ``model.encode``
        and the populated FAISS index.

    Raises:
        ValueError: If ``texts`` is empty, since the embedding dimension
            cannot be determined and the index would be useless.
    """
    if not texts:
        raise ValueError(
            f"No passages to embed for {index_path}; "
            "check the 'File not found' messages above."
        )

    embeds = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeds.shape[1])
    # The vectors are converted to float32 before being added to the index.
    index.add(np.array(embeds, dtype="float32"))

    faiss.write_index(index, index_path)
    with open(passages_path, "w", encoding="utf-8") as f:
        json.dump(passages, f, indent=2, ensure_ascii=False)
    return embeds, index


# Load multilingual model (one encoder is shared by both languages)
print("Embedding with multilingual MiniLM...")
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Encode and build FAISS index for English
print("Embedding English...")
en_embeds, en_index = build_and_save_index(
    model, english_texts, english_chunks, "rag_en.index", "rag_passages_en.json"
)
print("English FAISS and passages saved.")

# Encode and build FAISS index for Spanish
print("Embedding Spanish...")
es_embeds, es_index = build_and_save_index(
    model, spanish_texts, spanish_chunks, "rag_es.index", "rag_passages_es.json"
)
print("Spanish FAISS and passages saved.")

Reading and chunking: english_ada
Reading and chunking: english_cdc
Reading and chunking: english_who
Reading and chunking: english_risk_factors
Reading and chunking: spanish_ndep
File not found: spanish_guidelines_who
Reading and chunking: spanish_cdc
English chunks: 2705
Spanish chunks: 212
Embedding with multilingual MiniLM...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding English...


Batches:   0%|          | 0/85 [00:00<?, ?it/s]

English FAISS and passages saved.
Embedding Spanish...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Spanish FAISS and passages saved.


In [None]:
!pip install faiss-cpu

