In [2]:
import json
import faiss
import numpy as np
from datasets import load_dataset
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def clean_html(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text(separator=" ", strip=True)


In [4]:
def chunk_text(text, chunk_size=256, overlap=30):
    words = text.split()
    chunks = []
    step = chunk_size - overlap
    for i in range(0, len(words), step):
        segment = words[i:i+chunk_size]
        reconstitued_chunk = " ".join(segment)
        chunks.append(reconstitued_chunk)
    return chunks
        
    

In [5]:
MODEL_NAME = "all-MiniLM-L6-v2"
CHUNK_SIZE_WORDS = 200
OVERLAP_WORDS = 50

embedder = SentenceTransformer(MODEL_NAME)
dataset = load_dataset("natural_questions", split="train", streaming=True)

all_chunks_text = []
all_urls = []



Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1664.27it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [6]:
for i, rows in enumerate(dataset):
    if i>= 100:
        break
    html = rows["document"]["html"]
    url = rows["document"]["url"]
    
    proper_text = clean_html(html)
    list_chunks = chunk_text(proper_text, chunk_size=CHUNK_SIZE_WORDS, overlap=OVERLAP_WORDS)
    
    for chunk in list_chunks:
        if len(chunk.split()) > 20:
            all_chunks_text.append(chunk)
            all_urls.append(url)

print(f"Number of document treateds: {i}")
print(f"Number of chunks: {len(all_chunks_text)}") 

vectors = embedder.encode(all_chunks_text, convert_to_numpy=True)
    

Number of document treateds: 100
Number of chunks: 4155


In [9]:
vectors.shape

(4155, 384)

In [10]:
dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(vectors)


In [None]:
faiss.write_index(index)

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x3337b2730> >