### The Imports

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, LoggingHandler, losses, models, util
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
import math
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import faiss
import numpy as np


### Load in the Dataset

In [2]:
# Read the labelled sentence pairs and coerce both text columns to `str`
# in a single pass, so every entry is a string before it reaches the encoder.
df = pd.read_csv('labels_with_pairs.csv').astype({'entry1': str, 'entry2': str})


### Wrap each sentence pair in a Sentence Transformers `InputExample` so it is in the format the trainer expects

In [4]:
# Build one InputExample per labelled pair: the two texts plus their similarity score.
# Iterating the columns directly avoids the per-row Series that `iterrows()` constructs,
# and `float(...)` hands the loss a plain Python float rather than a NumPy scalar.
train_examples = [
    InputExample(texts=[entry1, entry2], label=float(score))
    for entry1, entry2, score in zip(df['entry1'], df['entry2'], df['score'])
]

### Define the model layers, loss function and set up Matryoshka Loss

In [5]:
# --- Run configuration -------------------------------------------------------
model_name = "Snowflake/snowflake-arctic-embed-m"
# NOTE(review): `num_epochs` only feeds the warm-up computation below; the
# `model.fit` call further down passes its own `epochs=50` — confirm which is intended.
num_epochs = 10
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
checkpoint_slug = model_name.replace("/", "-")
model_save_path = f"output/matryoshka_sts_{checkpoint_slug}-{run_stamp}"

# --- Model: transformer encoder followed by mean pooling over token embeddings
# NOTE(review): the arctic-embed model card appears to use CLS-token pooling;
# mean pooling here would differ from the pretrained setup — confirm this is intentional.
word_embedding_model = models.Transformer(model_name)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# --- Data and loss -----------------------------------------------------------
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
# CoSENT is the base objective; MatryoshkaLoss applies it at each of the listed
# truncated embedding sizes as well.
train_loss = losses.MatryoshkaLoss(
    model=model,
    loss=losses.CoSENTLoss(model=model),
    matryoshka_dims=[512, 256, 128, 64],
)

# Warm up over the first 10% of the steps implied by `num_epochs`.
total_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_steps * 0.1)
# Only visible if logging has been configured to emit INFO-level records.
logging.info("Warmup-steps: {}".format(warmup_steps))

Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU is available


### Train the model

In [7]:
# Fine-tune the model on the pair dataset with the Matryoshka-wrapped loss and
# write the result to `model_save_path`. Mixed precision (`use_amp`) is enabled.
# The stray `wandb` token that followed `warmup_steps=...` was a syntax error
# and has been removed so the cell can run.
# NOTE(review): `epochs=50` here, while `warmup_steps` was derived from
# `num_epochs = 10`, so warm-up covers ~2% of training rather than 10% —
# confirm which epoch count is intended and align the two.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=50,
    use_amp=True,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
500,23.7148
1000,19.7157
1500,16.8721
2000,15.8492
2500,14.9698
3000,13.7426
3500,13.1985
4000,12.2637
4500,11.5271
5000,10.8256


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

ERROR:sentence_transformers.SentenceTransformer:Error while generating model card:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sentence_transformers/SentenceTransformer.py", line 1112, in _create_model_card
    model_card = generate_model_card(self)
  File "/usr/local/lib/python3.9/dist-packages/sentence_transformers/model_card.py", line 977, in generate_model_card
    model_card = ModelCard.from_template(card_data=model.model_card_data, template_path=template_path, hf_emoji="🤗")
  File "/usr/local/lib/python3.9/dist-packages/huggingface_hub/repocard.py", line 414, in from_template
    return super().from_template(card_data, template_path, template_str, **template_kwargs)
  File "/usr/local/lib/python3.9/dist-packages/huggingface_hub/repocard.py", line 324, in from_template
    kwargs = card_data.to_dict().copy()
  File "/usr/local/lib/python3.9/dist-packages/sentence_transformers/model_card.py", line 904, in to_dict
    self.set_widget_examples(da

In [None]:
from sentence_transformers import SentenceTransformer, util
query = 'world war 2 approaching'
# NOTE(review): `model_2` is loaded here but nothing below uses it — every
# `encode` call goes through the fine-tuned `model`. Drop it or switch the
# calls, depending on which model this comparison is meant to exercise.
model_2 = SentenceTransformer('Snowflake/snowflake-arctic-embed-l')
query_embedding = model.encode([query])
sentences = [
    df['entry2'].loc[0],
    df['entry1'].loc[1],
    "This is the third sentence.",
]

# Encode the sentences using the model
sentences_embeddings = model.encode(sentences)

# Compute cosine similarity between the query embedding and each sentence embedding,
# using only the first 512 dimensions (the largest Matryoshka size trained above).
cosine_scores = util.pytorch_cos_sim(query_embedding[:, :512], sentences_embeddings[:, :512])

# `cosine_scores` has one row per query and one column per sentence. With a
# single query, iterate over row 0 so each sentence gets its own line
# (iterating the matrix itself would yield just one row holding all scores).
for i, score in enumerate(cosine_scores[0]):
    print(f"Sentence {i+1}: {float(score):.4f}")

## Create 