In [1]:
import logging
import math
import os
import random
from pathlib import Path

import datasets
import evaluate
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from datasets import DatasetDict, load_dataset
from huggingface_hub import HfApi
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, SchedulerType, default_data_collator, get_scheduler

logger = get_logger(__name__)

In [2]:
def save_model_hook(models, weights, output_dir):
    for i, model in enumerate(models):
        model.save_pretrained(output_dir, state_dict=weights[i])
        # make sure to pop weight so that corresponding model is not saved again
        weights.pop()


def load_model_hook(models, input_dir):
    while len(models) > 0:
        model = models.pop()
        # pop models so that they are not loaded again
        if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"):
            model.load_adapter(input_dir, model.active_adapter, is_trainable=True)

In [3]:
def get_cosing_embeddings(query_embs, product_embs):
    return torch.sum(query_embs * product_embs, axis=1)


def get_loss(cosine_score, labels):
    return torch.mean(torch.square(labels * (1 - cosine_score) + torch.clamp((1 - labels) * cosine_score, min=0.0)))

In [4]:
class AutoModelForSentenceEmbedding(nn.Module):
    def __init__(self, model_name, tokenizer, normalize=True):
        super().__init__()

        self.model = AutoModel.from_pretrained(
            model_name
        )  # , quantizaton_config=BitsAndBytesConfig(load_in_8bit=True), device_map={"":0})
        self.normalize = normalize
        self.tokenizer = tokenizer

    def forward(self, **kwargs):
        model_output = self.model(**kwargs)
        embeddings = self.mean_pooling(model_output, kwargs["attention_mask"])
        if self.normalize:
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            if name == "model":  # see #1892: prevent infinite recursion if class is not initialized
                raise
            return getattr(self.model, name)

In [23]:
DATASET_NAME = "smangrul/amazon_esci"
MAX_LENGTH = 70
MODEL_NAME_OR_PATH = "intfloat/e5-large-v2"
PER_DEVICE_TRAIN_BATCH_SIZE = 64
PER_DEVICE_EVAL_BATCH_SIZE = 128
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 0.0
NUM_TRAIN_EPOCHS = 3
GRADIENT_ACCUMULATION_STEPS = 1
OUTPUT_DIR = "results/peft_lora_e5_ecommerce_semantic_search_colab"
SEED = 42
PUSH_TO_HUB = False
WITH_TRACKING = True
REPORT_TO = "wandb"
USE_PEFT = True
CHECKPOINTING_STEPS = "epoch"
LR_SCHEDULER_TYPE = "linear"
NUM_WARMUP_STEPS = 0
SANITY_TEST = True

In [6]:
accelerator_kwargs = {"gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS}
if WITH_TRACKING:
    accelerator_kwargs["log_with"] = REPORT_TO
    accelerator_kwargs["project_dir"] = OUTPUT_DIR
accelerator = Accelerator(**accelerator_kwargs)



In [7]:
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)

08/16/2024 12:19:49 - INFO - __main__ - Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: mps

Mixed precision type: no



In [8]:
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

In [9]:
if SEED is not None:
    set_seed(SEED)

# Handle the repository creation
if accelerator.is_main_process:
    if OUTPUT_DIR is not None:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
accelerator.wait_for_everyone()

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

loading file vocab.txt from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/vocab.txt
loading file tokenizer.json from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/special_tokens_map.json
loading file tokenizer_config.json from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/tokenizer_config.json


In [11]:
def preprocess_function(examples):
    queries = examples["query"]
    result = tokenizer(queries, padding="max_length", max_length=MAX_LENGTH, truncation=True)
    result = {f"query_{k}": v for k, v in result.items()}

    products = examples["product_title"]
    result_products = tokenizer(products, padding="max_length", max_length=MAX_LENGTH, truncation=True)
    for k, v in result_products.items():
        result[f"product_{k}"] = v

    result["labels"] = examples["relevance_label"]
    return result

In [27]:
if SANITY_TEST:
    train_dataset = load_dataset(DATASET_NAME, split="train[:10240]", verification_mode=False)
    val_dataset = load_dataset(DATASET_NAME, split="validation[:1024]", verification_mode=False)

    dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})
else:
    dataset = load_dataset(DATASET_NAME)


In [28]:
processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Running tokenizer on dataset",
    )

Running tokenizer on dataset:   0%|          | 0/10240 [00:00<?, ? examples/s]

In [30]:
for index in random.sample(range(len(processed_datasets["train"])), 3):
    logger.info(f"Sample {index} of the training set: {processed_datasets['train'][index]}.")

08/16/2024 12:26:17 - INFO - __main__ - Sample 4012 of the training set: {'query_input_ids': [101, 1011, 1061, 2025, 6583, 6820, 7941, 7554, 6887, 27292, 2256, 2063, 4372, 2226, 3514, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'query_token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'query_attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'product_input_ids': [101, 3424, 3526, 15859, 2618, 3949, 21881, 3514, 1011, 2784, 7279, 3388, 18514, 5675, 3096, 3813, 2075, 1998, 18711, 1011, 7126, 3338, 2091, 6638, 8153, 7

In [31]:
model = AutoModelForSentenceEmbedding(MODEL_NAME_OR_PATH, tokenizer)

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

loading configuration file config.json from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/config.json
Model config BertConfig {
  "_name_or_path": "intfloat/e5-large-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.43.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/model.safetensors
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at intfloat/e5-large-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [33]:
from peft import LoraConfig, TaskType, get_peft_model

In [34]:
peft_config = LoraConfig(r=8,lora_alpha=16,bias="none",task_type=TaskType.FEATURE_EXTRACTION,target_modules=["key", "query", "value"],)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 336,321,536 || trainable%: 0.3508


In [35]:
accelerator.print(model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): AutoModelForSentenceEmbedding(
      (model): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 1024, padding_idx=0)
          (position_embeddings): Embedding(512, 1024)
          (token_type_embeddings): Embedding(2, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-23): 24 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_feat

In [36]:
train_dataloader = DataLoader(
    processed_datasets["train"],
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    pin_memory=True,
)

eval_dataloader = DataLoader(
    processed_datasets["validation"],
    shuffle=False,
    collate_fn=default_data_collator,
    batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    pin_memory=True,
)

In [37]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [38]:
overrode_max_train_steps = False
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)
max_train_steps = NUM_TRAIN_EPOCHS * num_update_steps_per_epoch
num_warmup_steps = NUM_WARMUP_STEPS

In [39]:
lr_scheduler = get_scheduler(name=LR_SCHEDULER_TYPE,optimizer=optimizer,num_warmup_steps=num_warmup_steps,num_training_steps=max_train_steps,)
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)

In [40]:
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)
max_train_steps = NUM_TRAIN_EPOCHS * num_update_steps_per_epoch

In [42]:
checkpointing_steps = CHECKPOINTING_STEPS
if checkpointing_steps is not None and checkpointing_steps.isdigit():
    checkpointing_steps = int(checkpointing_steps)

# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
if WITH_TRACKING:
    experiment_config = {
            "dataset_name": DATASET_NAME,
            "max_length": MAX_LENGTH,
            "model_name_or_path": MODEL_NAME_OR_PATH,
            "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE,
            "per_device_eval_batch_size": PER_DEVICE_EVAL_BATCH_SIZE,
            "learning_rate": LEARNING_RATE,
            "weight_decay": WEIGHT_DECAY,
            "num_train_epochs": NUM_TRAIN_EPOCHS,
            "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
            "output_dir": OUTPUT_DIR,
            "seed": SEED,
            "push_to_hub": PUSH_TO_HUB,
            "hub_model_id": HUB_MODEL_ID,
            "with_tracking": WITH_TRACKING,
            "report_to": REPORT_TO,
            "use_peft": USE_PEFT,
            "checkpointing_steps": CHECKPOINTING_STEPS,
            "lr_scheduler_type": LR_SCHEDULER_TYPE,
            "num_warmup_steps": NUM_WARMUP_STEPS,
            "sanity_test": SANITY_TEST}
    accelerator.init_trackers("peft_semantic_search", experiment_config)

metric = evaluate.load("roc_auc")

total_batch_size = PER_DEVICE_TRAIN_BATCH_SIZE * accelerator.num_processes * GRADIENT_ACCUMULATION_STEPS

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [43]:
accelerator.register_save_state_pre_hook(save_model_hook)
accelerator.register_load_state_pre_hook(load_model_hook)

<torch.utils.hooks.RemovableHandle at 0x3612c7f50>

In [44]:
logger.info("***** Running training *****")
logger.info(f"  Num examples = {len(processed_datasets['train'])}")
logger.info(f"  Num Epochs = {NUM_TRAIN_EPOCHS}")
logger.info(f"  Instantaneous batch size per device = {PER_DEVICE_TRAIN_BATCH_SIZE}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {GRADIENT_ACCUMULATION_STEPS}")
logger.info(f"  Total optimization steps = {max_train_steps}")

08/16/2024 12:45:43 - INFO - __main__ - ***** Running training *****
08/16/2024 12:45:43 - INFO - __main__ -   Num examples = 10240
08/16/2024 12:45:43 - INFO - __main__ -   Num Epochs = 3
08/16/2024 12:45:43 - INFO - __main__ -   Instantaneous batch size per device = 64
08/16/2024 12:45:43 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 64
08/16/2024 12:45:43 - INFO - __main__ -   Gradient Accumulation steps = 1
08/16/2024 12:45:43 - INFO - __main__ -   Total optimization steps = 480


In [45]:
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0


  0%|                                                                                                                                                                                                                                                                                         | 0/480 [00:00<?, ?it/s]

In [46]:
for epoch in range(starting_epoch, NUM_TRAIN_EPOCHS):
    model.train()
    if WITH_TRACKING:
        total_loss = 0
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(model):
            query_embs = model(**{k.replace("query_", ""): v for k, v in batch.items() if "query" in k})
            product_embs = model(**{k.replace("product_", ""): v for k, v in batch.items() if "product" in k})
            loss = get_loss(get_cosing_embeddings(query_embs, product_embs), batch["labels"])
            total_loss += accelerator.reduce(loss.detach().float(), reduction="sum")
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            model.zero_grad()

        # Checks if the accelerator has performed an optimization step behind the scenes
        if accelerator.sync_gradients:
            progress_bar.update(1)
            completed_steps += 1

        if (step + 1) % 100 == 0:
            logger.info(f"Step: {step+1}, Loss: {total_loss/(step+1)}")
            if WITH_TRACKING:
                accelerator.log({"train/loss": total_loss / (step + 1)}, step=completed_steps)

        if isinstance(checkpointing_steps, int):
            if completed_steps % checkpointing_steps == 0:
                output_dir = f"step_{completed_steps}"
                if OUTPUT_DIR is not None:
                    output_dir = os.path.join(OUTPUT_DIR, output_dir)
                accelerator.save_state(output_dir)

        if completed_steps >= max_train_steps:
            break

    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            query_embs = model(**{k.replace("query_", ""): v for k, v in batch.items() if "query" in k})
            product_embs = model(**{k.replace("product_", ""): v for k, v in batch.items() if "product" in k})
            prediction_scores = get_cosing_embeddings(query_embs, product_embs)
        prediction_scores, references = accelerator.gather_for_metrics((prediction_scores, batch["labels"]))
        metric.add_batch(
            prediction_scores=prediction_scores,
            references=references,
        )

    result = metric.compute()
    result = {f"eval/{k}": v for k, v in result.items()}
    # Use accelerator.print to print only on the main process.
    accelerator.print(f"epoch {epoch}:", result)
    if WITH_TRACKING:
        result["train/epoch_loss"] = total_loss.item() / len(train_dataloader)
        accelerator.log(result, step=completed_steps)

    if OUTPUT_DIR is not None:
        accelerator.wait_for_everyone()
        if accelerator.is_main_process:
            if isinstance(checkpointing_steps, str):
                accelerator.save_state(os.path.join(OUTPUT_DIR, f"epoch_{epoch}"))
            accelerator.unwrap_model(model).save_pretrained(
                OUTPUT_DIR, state_dict=accelerator.get_state_dict(accelerator.unwrap_model(model))
            )
            tokenizer.save_pretrained(OUTPUT_DIR)
            if PUSH_TO_HUB:
                commit_message = (
                    f"Training in progress epoch {epoch}"
                    if epoch < NUM_TRAIN_EPOCHS - 1
                    else "End of training"
                )
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=OUTPUT_DIR,
                    commit_message=commit_message,
                    run_as_future=True,
                )
        accelerator.wait_for_everyone()
accelerator.end_training()


 21%|████████████████████████████████████████████████████████▍                                                                                                                                                                                                                      | 100/480 [05:41<12:47,  2.02s/it]08/16/2024 12:51:36 - INFO - __main__ - Step: 100, Loss: 0.20186567306518555
 33%|██████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                                    | 160/480 [07:42<10:45,  2.02s/it]08/16/2024 12:53:51 - INFO - accelerate.accelerator - Saving current state to results/peft_lora_e5_ecommerce_semantic_search_colab/epoch_0
08/16/2024 12:53:51 - INFO - accelerate.checkpointing - Optimizer state saved in results/peft_lora_e5_ecommerce_semantic_search_colab/epoch_0/optimizer.bin
08/16/20

epoch 0: {'eval/roc_auc': 0.7911350878471969}


tokenizer config file saved in results/peft_lora_e5_ecommerce_semantic_search_colab/tokenizer_config.json
Special tokens file saved in results/peft_lora_e5_ecommerce_semantic_search_colab/special_tokens_map.json
 54%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                            | 260/480 [11:19<07:24,  2.02s/it]08/16/2024 12:57:14 - INFO - __main__ - Step: 100, Loss: 0.16571727395057678
 67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 320/480 [13:20<05:23,  2.02s/it]08/16/2024 12:59:30 - INFO - accelerate.accelerator - Saving current state to results/peft_

epoch 1: {'eval/roc_auc': 0.7949912325466287}


 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 420/480 [16:57<02:01,  2.02s/it]08/16/2024 13:02:52 - INFO - __main__ - Step: 100, Loss: 0.1538492739200592
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 480/480 [18:59<00:00,  2.02s/it]08/16/2024 13:05:08 - INFO - accelerate.accelerator - Saving current state to results/peft_lora_e5_ecommerce_semantic_search_colab/epoch_2
08/16/2024 13:05:08 - INFO - accelerate.checkpointing - Optimizer state saved in results/peft_lora_e5_ecommerce_semantic_search_colab/epoch_2/optimizer.bin
08/16/202

epoch 2: {'eval/roc_auc': 0.7940588512909453}


In [49]:
from peft import PeftModel

In [70]:
eval_dataloader = DataLoader(
    processed_datasets["validation"],
    shuffle=False,
    collate_fn=default_data_collator,
    batch_size=2,
    pin_memory=True,
)

In [72]:
# Imports
import numpy as np
from tqdm import tqdm
from peft import PeftModel

# Load the base model
model = AutoModelForSentenceEmbedding(MODEL_NAME_OR_PATH, tokenizer)

# Load the fine-tuned PEFT model
model = PeftModel.from_pretrained(model, "results/peft_lora_e5_ecommerce_semantic_search_colab/epoch_2/")
model.eval()
model = model.merge_and_unload()

# Prepare for evaluation
num_products = len(dataset['validation'])  # Make sure to use the correct split
d = 1024  # Adjust this if your model's embedding dimension is different
product_embeddings_array = []

# Evaluation loop
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.no_grad():
        with torch.amp.autocast(dtype=torch.float32, device_type="cpu"):
            # Remove the 'query_' prefix from input keys
            model_inputs = {k.replace('query_', ''): v.to("cpu") for k, v in batch.items() if k.startswith('query_')}
            product_embs = model(**model_inputs).detach().float().cpu().numpy()
    
    product_embeddings_array.extend(product_embs)

    del product_embs, batch

# Convert to numpy array after collecting all embeddings
product_embeddings_array = np.array(product_embeddings_array)

print(f"Evaluation completed successfully. Shape of embeddings: {product_embeddings_array.shape}")

loading configuration file config.json from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4d661fb4e101d311/config.json
Model config BertConfig {
  "_name_or_path": "intfloat/e5-large-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.43.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /Users/amiyamandal/.cache/huggingface/hub/models--intfloat--e5-large-v2/snapshots/b322e09026e4ea05f42beadf4

Evaluation completed successfully. Shape of embeddings: (1024, 1024)





In [74]:
!pip install hnswlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp311-cp311-macosx_11_0_arm64.whl size=169127 sha256=0eff4cc25ab10a7ad6d090a58cdbce4e7ecd437659595401d2a16dbf4fbacac2
  Stored in directory: /Users/amiyamandal/Library/Caches/pip/wheels/ea/4e/27/39aebca9958719776e36fada290845a7ef10f053ad70e22ceb
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [75]:
import hnswlib

In [102]:
def construct_search_index(dim, num_elements, data):
    # Declaring index
    search_index = hnswlib.Index(space = 'ip', dim = dim) # possible options are l2, cosine or ip
    
    # Initializing index - the maximum number of elements should be known beforehand
    search_index.init_index(max_elements = num_elements, ef_construction = 200, M = 100)

    # Element insertion (can be called several times):
    ids = np.arange(num_elements)
    search_index.add_items(data, ids)

    return search_index

product_search_index = construct_search_index(d, num_products, product_embeddings_array)

In [83]:
import torch
def get_query_embeddings(query, model, tokenizer, device):
    # Check if MPS is available and use it, otherwise fallback to CPU
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Move model to the appropriate device
    model = model.to(device)
    
    inputs = tokenizer(query, padding="max_length", max_length=70, truncation=True, return_tensors="pt")
    model.eval()
    with torch.no_grad():
        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}
        query_embs = model(**inputs).detach()
        # Move the output back to CPU
        query_embs = query_embs.cpu()
    return query_embs[0]

    
def get_nearest_neighbours(k, search_index, query_embeddings, ids_to_products_dict, threshold=0.7):
    # Controlling the recall by setting ef:
    search_index.set_ef(100) # ef should always be > k

    # Query dataset, k - number of the closest elements (returns 2 numpy arrays)
    labels, distances = search_index.knn_query(query_embeddings, k = k)
    
    return [(ids_to_products_dict[label], (1-distance)) for label, distance in zip(labels[0], distances[0]) if (1-distance)>=threshold]

In [90]:
def convert_dataset_to_dict(dataset, tokenizer: AutoTokenizer) -> dict:
    # Function to decode the product titles from input_ids
    def decode_product_title(input_ids):
        # Check if input_ids is a list or tensor, and convert to tensor if it's a list
        if isinstance(input_ids, list):
            input_ids = torch.tensor(input_ids)
        
        # Remove padding tokens (assuming 0 is the pad token id)
        input_ids = input_ids[input_ids != 0]
        
        return tokenizer.decode(input_ids, skip_special_tokens=True)

    # Create a dictionary with index as key and product title as value
    product_dict = {}
    for idx, example in enumerate(dataset):
        product_title = decode_product_title(example['product_input_ids'])
        product_dict[idx] = product_title

    return product_dict

In [91]:
ids_to_products_dict = convert_dataset_to_dict(eval_dataloader, tokenizer)

In [92]:
ids_to_products_dict

{0: 'fotmishu 6pcs greenhouse hoops rust - free grow tunnel tunnel, 4ft long steel with plastic coated plant supports for garden fabric, plant support garden stakes zippity outdoor products zp19028 unassembled madison vinyl gate kit with fence wings, white',
 1: 'zippity outdoor products zp19026 lightweight portable vinyl picket fence kit w / metal base ( 42 " h x 92 " w ), white colourtree 4\'x 50\'green fence privacy screen windscreen cover fabric shade tarp netting mesh cloth - commercial grade 170 gsm - cable zip ties included - we make custom size',
 2: "colourtree 6'x 50'black fence privacy screen windscreen cover fabric shade tarp netting mesh cloth - commercial grade 170 gsm - cable zip ties included - we make custom size mixc 10 - pack seed trays seedling starter tray, humidity adjustable plant starter kit with dome and base greenhouse grow trays mini propagator for seeds growing starting ( 12 cells per tray )",
 3: 'candyhome green anti bird protection net mesh garden plant n

In [98]:
query = "the"
k = 10
query_embeddings = get_query_embeddings(query, model, tokenizer, device)
search_results = get_nearest_neighbours(k, product_search_index, query_embeddings, ids_to_products_dict, threshold=0.5)

print(f"{query=}") 
for product, cosine_sim_score in search_results:
    print(f"cosine_sim_score={round(cosine_sim_score,2)} {product=}")

query='the'


In [99]:
search_results

[]

In [101]:
query_embeddings.shape

torch.Size([1024])