In [1]:
!pip install transformers datasets peft sentence-transformers

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers, peft
Successfully installed peft-0.12.0 sentence-transformers-3.1.0


In [2]:
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers import InputExample, SentencesDataset, LoggingHandler
from sentence_transformers import (models,util,SentenceTransformerTrainingArguments, SentenceTransformerTrainer,evaluation)
from torch.utils.data import DataLoader
import logging
import pandas as pd
import numpy as np
np.random.seed(42)

In [3]:
data=pd.read_csv("/kaggle/input/data-llm-lemma/data_llm_lemma.csv")
contents=data["sentence1"].unique()
train_contents=np.random.choice(contents,size=50,replace=False)
difference=list(set(contents)-set(train_contents))
val_contents=np.random.choice(difference,size=30,replace=False)
train_data=data[data["sentence1"].isin(train_contents)]
val_data=data[data["sentence1"].isin(val_contents)]

In [4]:
len(train_data),len(val_data)

(94300, 56580)

In [4]:
products=train_data.sentence2.unique()
train_prods=np.random.choice(products,size=1000,replace=False)
val_products=val_data.sentence2.unique()
val_prods=np.random.choice(val_products,size=1000,replace=False)
train_data_1=train_data[train_data["score"]==1.0]
train_data_2=train_data[train_data["sentence2"].isin(train_prods)]
train_data=pd.concat([train_data_1,train_data_2],axis=0,ignore_index=True)
val_data_1=val_data[val_data["score"]==1.0]
val_data_2=val_data[val_data["sentence2"].isin(val_prods)]
val_data=pd.concat([val_data_1,val_data_2],axis=0,ignore_index=True)
#val_data.sort_values("sentence1").to_csv("/kaggle/working/val_data_llm.csv",index=False)
#train_data.sort_values("sentence1").to_csv("/kaggle/working/train_data_llm.csv",index=False)

In [13]:
val_data=pd.read_csv("/kaggle/working/val_data_lemma.csv")
train_data=pd.read_csv("/kaggle/working/train_data_lemma.csv")
val_data["score"]=val_data["score"].apply(lambda x : round(x,2))
train_data["score"]=train_data["score"].apply(lambda x : round(x,2))
train_data.to_csv("/kaggle/working/train_data_lemma.csv",index=False)
val_data.to_csv("/kaggle/working/val_data_lemma.csv",index=False)

In [14]:
# Example dataset loading
dataset = load_dataset('csv', data_files={'train': "/kaggle/working/train_data_lemma.csv", 'validation': "/kaggle/working/val_data_lemma.csv"})
#dataset = load_dataset('csv', data_files={'train': "/kaggle/input/data-for-llm/train_data_v2.0.csv", 'validation': "/kaggle/input/data-for-llm/val_data_v2.0.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [15]:
train_data=dataset['train']
val_data=dataset['validation']

In [8]:
#Sentence Transformer BERT Model
word_embedding_model = models.Transformer('/kaggle/input/all-minilm-l6-v2-fine-tuned-model/kaggle/working/All-MiniLM-L6-V2-model')

# Applying pooling on final layer
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [19]:
# Define loss
loss = losses.CoSENTLoss(model)

# Define evaluator for evaluation
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=val_data['sentence1'],
    sentences2=val_data['sentence2'],
    scores=val_data['score'],
    main_similarity=evaluation.SimilarityFunction.COSINE,
    name="score"
)

In [25]:
# Training arguments
training_args = SentenceTransformerTrainingArguments(
    output_dir="/kaggle/working/data-for-llm/model_checkpoint",  # Save checkpoints
    num_train_epochs=10,  # Reduced number of epochs
    seed=33,
    per_device_train_batch_size=8,  # Larger batch size for faster training
    per_device_eval_batch_size=8,  # Larger batch size for evaluation
    learning_rate=2e-5,
    fp16=True,  # Loading model in mixed-precision
    warmup_ratio=0.1,  # Reduced warmup ratio
    evaluation_strategy="steps",
    eval_steps=7000,  # Less frequent evaluation
    save_total_limit=1,  # Save only the most recent model
    load_best_model_at_end=True,
    save_only_model=True,
    save_steps=7000,
    greater_is_better=True,
    #gradient_accumulation_steps=2  # Simulate larger batch size with accumulation
)



In [26]:
# Train model
trainer = SentenceTransformerTrainer(
    model=model,
    evaluator=evaluator,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    loss=loss
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [27]:
trainer.train()

Step,Training Loss,Validation Loss,Score Pearson Cosine,Score Spearman Cosine,Score Pearson Manhattan,Score Spearman Manhattan,Score Pearson Euclidean,Score Spearman Euclidean,Score Pearson Dot,Score Spearman Dot,Score Pearson Max,Score Spearman Max
7000,2.2401,3.637507,0.85161,0.838621,0.759386,0.742439,0.76168,0.744998,0.717257,0.678927,0.85161,0.838621
14000,2.2696,3.582551,0.848892,0.834446,0.754095,0.74227,0.756673,0.745412,0.676938,0.633573,0.848892,0.834446
21000,2.2885,3.674954,0.839176,0.819023,0.732848,0.718349,0.73577,0.721061,0.638656,0.600239,0.839176,0.819023
28000,2.2884,3.604441,0.84596,0.838801,0.740236,0.730091,0.744364,0.734903,0.687358,0.650699,0.84596,0.838801
35000,2.3039,3.596492,0.845209,0.831083,0.744216,0.732022,0.745538,0.733443,0.675703,0.6357,0.845209,0.831083
42000,2.2775,3.596903,0.842019,0.829384,0.750158,0.735377,0.750672,0.735315,0.66266,0.615548,0.842019,0.829384
49000,2.2445,3.618869,0.852516,0.839497,0.739531,0.727702,0.741791,0.730184,0.671857,0.634896,0.852516,0.839497
56000,2.3523,3.416443,0.84508,0.829456,0.758328,0.744332,0.76105,0.747138,0.650234,0.598794,0.84508,0.829456
63000,2.3193,3.437399,0.848205,0.837678,0.761638,0.750162,0.765775,0.754428,0.669639,0.622506,0.848205,0.837678
70000,2.2692,3.502617,0.855727,0.839171,0.768659,0.753592,0.771893,0.757142,0.676754,0.62482,0.855727,0.839171


TrainOutput(global_step=117880, training_loss=2.241715648743028, metrics={'train_runtime': 8596.7315, 'train_samples_per_second': 109.693, 'train_steps_per_second': 13.712, 'total_flos': 0.0, 'train_loss': 2.241715648743028, 'epoch': 10.0})

In [23]:
# save the model
model.save_pretrained("/kaggle/working/All-MiniLM-L6-V2-model-v3/")

In [101]:
!zip -r /kaggle/working/fine_tuned_model_0_8485.zip /kaggle/working/All-MiniLM-L6-V2-model

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/All-MiniLM-L6-V2-model/ (stored 0%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/tokenizer_config.json (deflated 74%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/config.json (deflated 47%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/model.safetensors (deflated 9%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/1_Pooling/ (stored 0%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/1_Pooling/config.json (deflated 57%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/modules.json (deflated 53%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/tokenizer.json (deflated 71%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/config_sentence_transformers.json (deflated 37%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/sentence_bert_config.json (deflated 4%)
  adding: kaggle/working/All-MiniLM-L6-V2-model/README.md (deflated 77%)
  adding: kaggle/working/All-MiniLM-L6-V2-mo