In [1]:
# !nvidia-smi

In [2]:
import os
import warnings

# Silence every warning category so the cell outputs stay uncluttered.
warnings.simplefilter("ignore")

### Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Corpus location and splitter settings, gathered in one place so they are easy to tune.
DATA_DIR = "raw_data/cmu"
CHUNK_SIZE = 170     # chunk_size handed to the splitter
CHUNK_OVERLAP = 20   # overlap to maintain context between adjacent chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

# One record per chunk: {"text": <chunk text>, "source": <originating file name>}.
all_chunks = []

# os.listdir() yields entries in arbitrary order; sorting keeps the chunk order
# (and anything indexed from it downstream) identical across runs and machines.
for filename in sorted(os.listdir(DATA_DIR)):
    # Only Markdown files are part of the corpus.
    if not filename.endswith(".md"):
        continue

    with open(os.path.join(DATA_DIR, filename), "r", encoding="utf-8") as f:
        data = f.read()

    chunks = text_splitter.split_text(data)

    # Tag every chunk with the file it came from.
    all_chunks.extend({"text": chunk, "source": filename} for chunk in chunks)

print(f"Total chunks: {len(all_chunks)}")


# Parallel lists: texts[i] belongs with metadatas[i].
texts = [c["text"] for c in all_chunks]
metadatas = [{"source": c["source"]} for c in all_chunks]

Total chunks: 232079


### Embedding

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used for the chunks: all-mpnet-base-v2 on CUDA, with
# normalized output vectors.
# NOTE(review): confirm that the installed langchain_huggingface version passes
# encode_kwargs (normalize_embeddings) through on the multi_process path as well.
hf_model = HuggingFaceEmbeddings(
    multi_process=True,  # run encode() on multiple GPUs
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device="cuda"),
    model_name="all-mpnet-base-v2",
)

### Vector storing

In [5]:
from langchain_community.vectorstores import FAISS

# Folder the index is written to.
# NOTE(review): the corpus is read from "raw_data/cmu" but the index is saved
# under "neu" — confirm this is the intended destination.
db_path = "neu"

# Build the vector store straight from the raw chunk texts: from_texts embeds
# them with the supplied embedding model and attaches the per-chunk metadata.
# NOTE(review): the name `faiss` shadows the faiss library module if that is ever
# imported in this notebook; kept as-is because later cells may reference it.
faiss = FAISS.from_texts(texts, hf_model, metadatas=metadatas)

# Persist the store locally.
faiss.save_local(db_path)
print("FAISS index saved.")

FAISS index saved.
