In [3]:
import os
import json
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Files to include: maps a source label to the path of its guideline text file.
# The label doubles as the "source" tag of every chunk, and a label containing
# "spanish" routes the file to the Spanish index.
# Sub-directory paths are built with os.path.join instead of a backslash inside
# a normal string literal: "\s" is an invalid escape sequence, and a hard-coded
# backslash only works as a separator on Windows. On Windows the resulting
# strings are identical to the previous literals.
files = {
    "english_ada": "ada_diabetes_guidelines.txt",
    "english_cdc": "cdc_diabetes_guidelines.txt",
    "english_who": "who_diabetes_guidelines.txt",
    "spanish_ndep": os.path.join("Spanish Data", "spanish_guidelines_ndep.txt"),
    # NOTE(review): unlike every other entry this name has no ".txt" extension;
    # confirm against the file on disk (a mismatch is reported as "File not found").
    "spanish_who": os.path.join("Spanish Data", "spanish_guidelines_who"),
    "spanish_cdc": os.path.join("Spanish Data", "spanish_guidelines_cdc.txt"),
}

# Chunking function
def chunk_text(text, max_words=150, overlap=30):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Words are taken in document order; paragraph breaks are treated like any
    other whitespace, so a chunk may span several paragraphs. Each chunk holds
    at most ``max_words`` words, and every chunk after the first starts with
    the last ``overlap`` words of the previous chunk.

    Args:
        text: The document to split.
        max_words: Maximum number of words per chunk. Must be positive.
        overlap: Number of words repeated from the end of one chunk at the
            start of the next. Must satisfy ``0 <= overlap < max_words``.

    Returns:
        A list of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_words`` is not positive or ``overlap`` is outside
            ``[0, max_words)``. Such values would prevent the window from
            ever advancing.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    stride = max_words - overlap  # number of new words each chunk contributes
    chunks = []

    start = 0
    while start < len(words):
        window = words[start:start + max_words]
        # Every window after the first begins with `overlap` words already
        # present in the previous chunk. If that is all that remains, the
        # window adds nothing new, so it is not emitted as a duplicate chunk.
        if start > 0 and len(window) <= overlap:
            break
        chunks.append(" ".join(window))
        start += stride

    return chunks

# Per-language accumulators. The *_chunks lists hold metadata records
# ({"text", "lang", "source"}); the *_texts lists hold the bare chunk strings
# in the same order, so position i in one matches position i in the other.
english_chunks, english_texts = [], []
spanish_chunks, spanish_texts = [], []

# Read every configured file, chunk it, and file the chunks under its language.
for name, path in files.items():
    if not os.path.exists(path):
        # Missing inputs are reported and skipped rather than aborting the run.
        print(f"File not found: {path}")
        continue

    print(f"Reading and chunking: {name}")
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    chunks = chunk_text(text)

    # The language is derived from the label: any label containing "spanish"
    # is Spanish, everything else is English.
    lang = "es" if "spanish" in name else "en"
    if lang == "es":
        entry_sink, text_sink = spanish_chunks, spanish_texts
    else:
        entry_sink, text_sink = english_chunks, english_texts

    for chunk in chunks:
        entry_sink.append({"text": chunk, "lang": lang, "source": name})
        text_sink.append(chunk)

print(f"English chunks: {len(english_chunks)}")
print(f"Spanish chunks: {len(spanish_chunks)}")

# Load multilingual model (one model embeds both the English and Spanish text)
print("Embedding with multilingual MiniLM...")
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


def _build_and_save_index(label, texts, passages, index_path, passages_path):
    """Embed ``texts``, store them in a flat L2 FAISS index, and save everything.

    Args:
        label: Human-readable language name used in progress messages.
        texts: Chunk strings to embed; vector i in the index comes from texts[i].
        passages: Metadata records written to ``passages_path`` as JSON.
            Expected to be in the same order as ``texts`` so index positions
            can be mapped back to passages.
        index_path: Output file for the FAISS index.
        passages_path: Output file for the passages JSON.

    Returns:
        ``(embeddings, index)``, or ``(None, None)`` when ``texts`` is empty
        and nothing was written.
    """
    if not texts:
        # With no texts there is no embedding matrix to read the vector
        # dimension from (embeds.shape[1]), so building the index would fail.
        # This happens when none of a language's input files were found.
        print(f"No {label} chunks to embed; skipping {index_path}.")
        return None, None

    print(f"Embedding {label}...")
    embeds = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    index = faiss.IndexFlatL2(embeds.shape[1])
    # Cast explicitly to float32 before adding the vectors to the index.
    index.add(np.array(embeds, dtype="float32"))

    faiss.write_index(index, index_path)
    with open(passages_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented characters readable in the JSON.
        json.dump(passages, f, indent=2, ensure_ascii=False)
    print(f"{label} FAISS and passages saved.")
    return embeds, index


# Encode and build FAISS index for English
en_embeds, en_index = _build_and_save_index(
    "English", english_texts, english_chunks, "rag_en.index", "rag_passages_en.json"
)

# Encode and build FAISS index for Spanish
es_embeds, es_index = _build_and_save_index(
    "Spanish", spanish_texts, spanish_chunks, "rag_es.index", "rag_passages_es.json"
)

ImportError: cannot import name 'Literal' from 'typing' (c:\Users\mitta\anaconda3\envs\nlp_course\lib\typing.py)

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# the version is pinned so a re-run installs the same build.
%pip install faiss-cpu==1.7.4

Collecting faiss-cpu
  Downloading https://files.pythonhosted.org/packages/25/3c/9d23f4edb99c8613bb24dadecb31f8abaeb01aaa26ff979d6c05416a5b5c/faiss_cpu-1.7.4-cp37-cp37m-win_amd64.whl (10.8MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
