In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm
import pickle as pkl
import os

In [None]:
# Fetch the "ModHayes" configuration of the Reuters-21578 corpus.
dataset = load_dataset(path="reuters21578", name="ModHayes")

In [None]:
# Sentence-embedding model used to encode every text chunk below.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Merge the train and test splits into one corpus (train first, then test).
# Each split's "text" column is iterated rather than joined with `+`:
# depending on the installed `datasets` version, column access may return a
# lazy column object instead of a plain list, and `+` is only guaranteed to
# work on lists.
texts = [text for split in ("train", "test") for text in dataset[split]["text"]]
print(len(texts))
print(texts[0])

In [None]:
# One row per document; "id" is the document's position in `texts`.
# The dict's key order already fixes the column order ("text", then "id").
text_df = pd.DataFrame(
    {
        "text": texts,
        "id": range(len(texts)),
    }
)
text_df

In [None]:
def split_with_overlap(text: str, split_size: int, overlap: int) -> list[str]:
    """Split ``text`` into overlapping character windows.

    Consecutive windows start ``split_size - overlap`` characters apart, so
    with a non-negative ``overlap`` each window shares its last ``overlap``
    characters with the start of the next one (a negative ``overlap`` leaves
    gaps between windows instead). Windowing stops at the first window that
    reaches the end of ``text``: any later window would be a pure suffix of
    that one and would contribute no new content.

    Args:
        text: The string to split. An empty string yields an empty list.
        split_size: Maximum number of characters per window; must be positive.
        overlap: Number of characters shared by adjacent windows; must be
            smaller than ``split_size`` so the window start always advances.

    Returns:
        The windows ordered by start offset. The final window may be shorter
        than ``split_size``.

    Raises:
        ValueError: If ``split_size`` is not positive, or ``overlap`` is not
            smaller than ``split_size``.
    """
    step = split_size - overlap
    if split_size <= 0 or step <= 0:
        # A zero step makes range() fail with an unrelated-looking message and
        # a negative step silently produces no windows at all; fail loudly.
        raise ValueError(
            f"need split_size > 0 and overlap < split_size, "
            f"got split_size={split_size}, overlap={overlap}"
        )

    splits = []
    for start in range(0, len(text), step):
        splits.append(text[start : start + split_size])
        if start + split_size >= len(text):
            # This window already covers the tail of the text.
            break
    return splits
# Preview the chunking on the first document (256-char windows, 100-char overlap).
split_with_overlap(texts[0], split_size=256, overlap=100)

In [None]:
# Chunk every document, embed its chunks, and record which document each
# embedding came from. `embeddings` and `ids` stay aligned index-for-index.
embeddings = []
ids = []

# The loop variable is `doc_id`, not `id`: loop variables live on in the
# notebook's global namespace, where `id` would shadow the builtin for every
# later cell.
for text, doc_id in tqdm(zip(text_df["text"], text_df["id"]), total=len(text_df)):
    splits = split_with_overlap(text, 256, 100)
    if not splits:
        # Empty document: nothing to embed, so skip the model call.
        continue
    # normalize_embeddings=True requests unit-length vectors from the model.
    split_embeddings = model.encode(splits, normalize_embeddings=True)
    embeddings.extend(split_embeddings)
    ids.extend([doc_id] * len(split_embeddings))

In [None]:
# One row per chunk embedding; "id" refers back to the document's row in text_df.
# The dict's key order already fixes the column order ("embeddings", then "id").
embeddings_df = pd.DataFrame(
    {
        "embeddings": embeddings,
        "id": ids,
    }
)
embeddings_df

In [None]:
# Persist both frames together as one pickled (text_df, embeddings_df) tuple.
# NOTE: pickle files should only ever be loaded back from a trusted location.
out_dir = "./out"
os.makedirs(out_dir, exist_ok=True)  # no error if the directory already exists

data = (text_df, embeddings_df)
pickle_path = os.path.join(out_dir, "embeddings.pkl")
with open(pickle_path, "wb") as pickle_file:
    pkl.dump(data, pickle_file)