In [1]:
import json
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
from torch.utils.data import Dataset
import random
import pickle
import argparse
import pandas as pd

#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout

In [2]:
from dataclasses import dataclass
@dataclass
class Args():
    train_batch_size: int = 128
    max_seq_length: int = 300
    model_name: str = "sentence-transformers/all-MiniLM-L12-v2"
    max_passages:int = 0
    epochs: int = 2
    pooling: str = "mean"
    negs_to_use: str = None
    warmup_steps: int = 1000
    lr: float = 4e-5
    num_negs_per_system: int = 5
    use_pre_trained_model: bool = False,
    use_all_queries: bool = False,
    ce_score_margin: float = 3.0

print(Args())

Args(train_batch_size=128, max_seq_length=300, model_name='sentence-transformers/all-MiniLM-L12-v2', max_passages=0, epochs=2, pooling='mean', negs_to_use=None, warmup_steps=1000, lr=4e-05, num_negs_per_system=5, use_pre_trained_model=(False,), use_all_queries=(False,), ce_score_margin=3.0)


In [3]:
args = Args()
model_name = args.model_name
model_name

'sentence-transformers/all-MiniLM-L12-v2'

In [4]:
# Increasing the train batch size improves the model performance, but requires more GPU memory
train_batch_size = (
    args.train_batch_size
)
max_seq_length = args.max_seq_length  # Max length for passages. Increasing it, requires more GPU memory
ce_score_margin = args.ce_score_margin  # Margin for the CrossEncoder score between negative and positive passages
num_negs_per_system = (
    args.num_negs_per_system
)  # We used different systems to mine hard negatives. Number of hard negatives to add from each system
num_epochs = args.epochs  # Number of epochs we want to train

In [5]:
# Load our embedding model

logging.info("Create new SBERT model")
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), args.pooling)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

model_save_path = "output/train_bi-encoder-triplet-{}-{}".format(
    model_name.replace("/", "-"), datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

2024-06-17 06:30:11 - Create new SBERT model




2024-06-17 06:30:11 - Use pytorch device_name: cuda


In [6]:
# We create a custom MSMARCO dataset that returns triplets (query, positive, negative)
# on-the-fly based on the information from the mined-hard-negatives jsonl file.
from torch.utils.data import Dataset
class MSMARCODataset(Dataset):
    def __init__(self, query_pos_negs, queries_df, corpus):
        self.queries_pos = dict(zip(list(query_pos_negs.query_id), list(query_pos_negs.pos_product_ids.apply(eval))))
        self.queries_neg = dict(zip(list(query_pos_negs.query_id), list(query_pos_negs.neg_product_ids.apply(eval))))
        self.queries_ids = list(query_pos_negs.query_id)
        self.corpus = corpus
        
        self.item_dict = dict(zip(corpus.product_id, corpus.item_text))
        self.queries = dict(zip(queries_df.query_id, queries_df["query"]))

    def __getitem__(self, item):
        qid = self.queries_ids[item]
        query_text = self.queries[qid]

        pos_id = self.queries_pos[qid].pop()
        pos_text = self.item_dict[pos_id]
        self.queries_pos[qid].append(pos_id)

        neg_id = self.queries_neg[qid].pop()
        neg_text = self.item_dict[pos_id]
        self.queries_neg[qid].append(neg_id)

        return InputExample(texts=[query_text, pos_text, neg_text])

    def __len__(self):
        return len(self.queries_ids)



In [7]:
## product info for given item
def create_item_text(row):
    title = row["product_title"]
    brand = row["product_brand"]
    color = row["product_color"]
    return f"Title: {title}. Brand: {brand}. Color: {color}"

In [8]:
esci = load_dataset("tasksource/esci")


training_examles = pd.read_csv("train_esci_pos_neg_top3_2024-06-10.csv")
print("training_examles.shape", training_examles.shape)
test_examples = pd.read_csv("test_esci_pos_neg_top3_2024-06-10.csv")


corpus_train = esci["train"].to_pandas()
corpus_product_train = corpus_train[["product_id", "product_title", "product_brand", "product_color"]].drop_duplicates(subset=["product_id"])

corpus_query_train = corpus_train[["query_id", "query"]].drop_duplicates(subset=["query_id"])
corpus_product_train['item_text'] = corpus_product_train.apply(lambda row: create_item_text(row), axis=1)


print("Size of corpus: ", corpus_product_train.shape)

training_examles.shape (17060, 3)
Size of corpus:  (1436116, 5)


In [9]:
train_dataset = MSMARCODataset(training_examles, corpus_query_train, corpus_product_train)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
#train_loss = losses.TripletLoss(model=model)

In [10]:
from sentence_transformers import evaluation

In [11]:
evaluation.InformationRetrievalEvaluator?

[0;31mInit signature:[0m
[0mevaluation[0m[0;34m.[0m[0mInformationRetrievalEvaluator[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mqueries[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcorpus[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrelevant_docs[0m[0;34m:[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mSet[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcorpus_chunk_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m50000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmrr_at_k[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;34m[[0m[0;36m10[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mndcg_at_k[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;34m[[0m[0;36m10[0m[0;34m][0m

In [10]:
# from sentence_transformers import evaluation

# eval_df = pd.read_csv("eval_esci_exact.csv")
# eval_df = eval_df.dropna(subset=['query', 'product_title'], how='all')

# eval_queries = eval_df["query"].tolist()
# eval_items = eval_df["product_title"].tolist()
# scores = [1.0]*len(eval_queries)
# evaluator = evaluation.EmbeddingSimilarityEvaluator(eval_queries, eval_items, scores)


In [12]:
# from sentence_transformers import evaluation
ir_eval_df = pd.read_csv("ir_eval_data.csv").head(1000)
ir_eval_df = ir_eval_df.dropna(subset=['query'], how='all')
eval_queries = dict(zip(ir_eval_df.query_id.tolist(),ir_eval_df["query"].tolist()))
eval_relevant_docs = dict(zip(ir_eval_df.query_id.tolist(),[eval(x) for x in ir_eval_df["relevant_product_id"].tolist()]))

esci_test = esci["test"].to_pandas()
esci_test = esci_test.drop_duplicates(subset=["product_id"])
eval_corpus = dict(zip(esci_test.product_id.tolist(), esci_test["product_title"].tolist()))

evaluator = evaluation.InformationRetrievalEvaluator(eval_queries, eval_corpus, eval_relevant_docs)

In [13]:
# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=args.warmup_steps,
    use_amp=True,
    checkpoint_path=model_save_path,
    checkpoint_save_steps=len(train_dataloader),
    optimizer_params={"lr": args.lr},
    save_best_model=True,
    checkpoint_save_total_limit=2,
    evaluator=evaluator,
    evaluation_steps=1000,
    
)

# # Save the model
# model.save(model_save_path)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

2024-06-17 06:31:49 - Save model to output/train_bi-encoder-triplet-sentence-transformers-all-MiniLM-L12-v2-2024-06-17_06-30-11/134
2024-06-17 06:31:49 - Information Retrieval Evaluation of the model on the  dataset after epoch 0:
2024-06-17 06:36:37 - Queries: 1000
2024-06-17 06:36:37 - Corpus: 545818

2024-06-17 06:36:37 - Score-Function: cos_sim
2024-06-17 06:36:37 - Accuracy@1: 38.20%
2024-06-17 06:36:37 - Accuracy@3: 55.30%
2024-06-17 06:36:37 - Accuracy@5: 63.60%
2024-06-17 06:36:37 - Accuracy@10: 73.30%
2024-06-17 06:36:37 - Precision@1: 38.20%
2024-06-17 06:36:37 - Precision@3: 32.23%
2024-06-17 06:36:37 - Precision@5: 29.80%
2024-06-17 06:36:37 - Precision@10: 25.64%
2024-06-17 06:36:37 - Recall@1: 5.11%
2024-06-17 06:36:37 - Recall@3: 11.23%
2024-06-17 06:36:37 - Recall@5: 15.86%
2024-06-17 06:36:37 - Recall@10: 25.11%
2024-06-17 06:36:37 - MRR@10: 0.4893
2024-06-17 06:36:37 - NDCG@10: 0.3255
2024-06-17 06:36:37 - MAP@100: 0.2602
2024-06-17 06:36:37 - Score-Function: dot_scor

Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

2024-06-17 06:37:22 - Save model to output/train_bi-encoder-triplet-sentence-transformers-all-MiniLM-L12-v2-2024-06-17_06-30-11/268
2024-06-17 06:37:23 - Information Retrieval Evaluation of the model on the  dataset after epoch 1:
2024-06-17 06:42:15 - Queries: 1000
2024-06-17 06:42:15 - Corpus: 545818

2024-06-17 06:42:15 - Score-Function: cos_sim
2024-06-17 06:42:15 - Accuracy@1: 39.50%
2024-06-17 06:42:15 - Accuracy@3: 58.00%
2024-06-17 06:42:15 - Accuracy@5: 65.40%
2024-06-17 06:42:15 - Accuracy@10: 74.20%
2024-06-17 06:42:15 - Precision@1: 39.50%
2024-06-17 06:42:15 - Precision@3: 33.63%
2024-06-17 06:42:15 - Precision@5: 30.90%
2024-06-17 06:42:15 - Precision@10: 26.46%
2024-06-17 06:42:15 - Recall@1: 5.49%
2024-06-17 06:42:15 - Recall@3: 11.68%
2024-06-17 06:42:15 - Recall@5: 16.51%
2024-06-17 06:42:15 - Recall@10: 25.53%
2024-06-17 06:42:15 - MRR@10: 0.5047
2024-06-17 06:42:15 - NDCG@10: 0.3370
2024-06-17 06:42:15 - MAP@100: 0.2717
2024-06-17 06:42:15 - Score-Function: dot_scor

In [None]:
model_name