In [None]:
! pip freeze | grep sentence

sentence-transformers==4.1.0
sentencepiece==0.2.0


- Install sentence-transformers
  - Colab comes with sentence-transformers preinstalled

- Popular pre-trained models
  - all-MiniLM-L6-v2 – Fast and lightweight
  - paraphrase-MiniLM-L6-v2
  - multi-qa-MiniLM-L6-cos-v1 – Good for question-answering
  - distiluse-base-multilingual-cased-v1 – Supports 15+ languages (including Malayalam!)

- Even when the input contains more than one sentence, the model generates a single fixed-size encoding

In [4]:
import torch
torch.backends.mps.is_available()

True

In [3]:
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('all-MiniLM-L6-v2')

# Pick the best available accelerator so the same cell runs unchanged on
# Colab (CUDA), Apple Silicon (MPS) and CPU-only machines, instead of
# commenting alternative `device = ...` lines in and out.
if torch.cuda.is_available():
  device = 'cuda'
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'
model.to(device)

# sentences = ["This is an example sentence", "Each sentence is converted"]

# embeddings = model.encode(sentences, device=device)

# print(embeddings.shape)  # (2, 384)


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
def cosine_similarity(x, y):
  """Row-wise cosine similarity between two batches of vectors.

  Both inputs are L2-normalised along dim 1 and then compared along dim 1,
  so `x` and `y` need at least two dimensions. For 2-D inputs the result
  holds one score per row.

  Note: relies on `F` (torch.nn.functional) being imported before the call.
  """
  # p=2 selects the Euclidean (L2) norm.
  unit_x, unit_y = (F.normalize(batch, p=2, dim=1) for batch in (x, y))
  return F.cosine_similarity(unit_x, unit_y, dim=1)

In [None]:
from sentence_transformers import util
import pandas as pd
import torch.nn.functional as F

file_name = 'train.csv'

df = pd.read_csv(file_name, engine='python')
print(df.head())

sentences_1 = df.question1.tolist()
sentences_2 = df.question2.tolist()

# Encode each question column into L2-normalised embeddings, returned as
# tensors so the similarity can be computed with torch.
encoded_1 = model.encode(
    sentences_1,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_tensor=True,
    device=device
    )

print(f'encoded shape: {encoded_1.shape}')

encoded_2 = model.encode(
    sentences_2,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_tensor=True,
    device=device
    )

# No bare `tensor.to(device)` statements here: `Tensor.to` returns a new
# tensor rather than moving the existing one, so a call whose result is
# discarded has no effect. The encode() calls already receive `device`.

# One similarity score per (question1, question2) row.
similarity_scores = cosine_similarity(encoded_1, encoded_2)

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  


Batches:   0%|          | 0/12635 [00:00<?, ?it/s]

encoded shape: torch.Size([404290, 384])


Batches:   0%|          | 0/12635 [00:00<?, ?it/s]

In [None]:
# Attach the per-row similarity scores to the frame and persist it, so the
# expensive encoding step does not need to be repeated to inspect results.
df['similarity_scores'] = similarity_scores.tolist()
df.to_csv('train_with_similarity_scores.csv', index=False)

In [None]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,similarity_scores
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.912277
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0.655141
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0.515561
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0.104022
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0.325348


In [None]:
# Similarity above this value is treated as "duplicate".
SIMILARITY_THRESHOLD = 0.75

# Vectorised comparison instead of a row-wise `apply`: same 0/1 result,
# without calling a Python lambda for each of the ~400k rows.
df['clculated_similarity'] = (
    df['similarity_scores'] > SIMILARITY_THRESHOLD
).astype(int)

# Rows where the thresholded prediction disagrees with the provided label.
diff_df = df.loc[df['clculated_similarity'] != df['is_duplicate']]
diff_df.to_csv('diff_df.csv', index=False)

In [None]:
# from google.colab import files

# files.download('train_with_similarity_scores.csv')
diff_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,similarity_scores,clculated_similarity
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0.912277,1
14,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0,0.792194,1
15,15,31,32,What would a Trump presidency mean for current...,How will a Trump presidency affect the student...,1,0.662187,0
21,21,43,44,What's causing someone to be jealous?,What can I do to avoid being jealous of someone?,0,0.773075,1
25,25,51,52,What are some tips on making it through the jo...,What are some tips on making it through the jo...,0,0.828949,1


In [None]:
from sklearn.metrics import confusion_matrix, f1_score

confusion_matrix(
    df['is_duplicate'],
    df['clculated_similarity'],
    normalize= 'true'
    )

array([[0.70876025, 0.29123975],
       [0.12695712, 0.87304288]])

In [None]:
f1_score(df['is_duplicate'], df['clculated_similarity'])

0.7365437332202911

## Training a S-BERT Model

### Prepare training data
```
from sentence_transformers import InputExample
train_examples = [
  InputExample(texts=["How to learn Python?", "What's the best way to learn Python?"], label=1.0),
  InputExample(texts=["What is AI?", "How to bake a cake?"], label=0.0),
]
```

### Load pretrained S-BERT model
```
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
```

### Choose a Loss Function
- `CosineSimilarityLoss`: For similarity regression (labels: 0.0–1.0)
- `ContrastiveLoss` / `TripletLoss`: For contrastive learning
- `MultipleNegativesRankingLoss`: For large-scale retrieval
- `SoftmaxLoss`: For classification

```
from sentence_transformers import losses

train_loss = losses.CosineSimilarityLoss(model)
```

### Train the Model
```
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100,
    show_progress_bar=True
)
```

### Save the model
```
model.save('output/sbert-finetuned-model')
```

### Use the Model
```
model = SentenceTransformer('output/sbert-finetuned-model')
embeddings = model.encode(["Your sentence here"])
```

## Generator to feed data to the model
- Keeps memory usage under control
- Only a limited chunk of the data is loaded into memory at a time

In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import IterableDataset, DataLoader
import torch.nn.functional as F
from datasets import Dataset

EPOCHS = 1

def input_exampe_generator(csv_path, chunk_size=10000):
  """
  Stream `InputExample` pairs from a question-pair CSV, one chunk at a time.

  Rows where `question1` or `question2` is missing are skipped.

  Args:
    csv_path: Path to a CSV with `question1`, `question2` and `is_duplicate`
      columns.
    chunk_size: Number of rows pandas parses per chunk.

  Yields:
    InputExample with `texts=[str(question1), str(question2)]` and
    `label=float(is_duplicate)`.
  """
  # The context manager closes the underlying file handle when the generator
  # is exhausted or closed, instead of leaving the chunk reader open.
  with pd.read_csv(csv_path, chunksize=chunk_size) as reader:
    for chunk in reader:
      pairs = chunk.dropna(subset=['question1', 'question2'])
      # Iterate over the columns directly; `iterrows` builds a Series per row,
      # which is slow on a file with hundreds of thousands of rows.
      for q1, q2, label in zip(
          pairs['question1'], pairs['question2'], pairs['is_duplicate']
      ):
        yield InputExample(texts=[str(q1), str(q2)], label=float(label))

class SBERTInputDataset(IterableDataset):
  """Iterable dataset that streams question pairs from a CSV file.

  Every iteration pass starts `input_exampe_generator` afresh on `csv_path`,
  so the file is re-read from the beginning each time.
  """

  def __init__(self, csv_path):
    # Only the path is stored; nothing is read until iteration starts.
    self.csv_path = csv_path

  def __iter__(self):
    yield from input_exampe_generator(self.csv_path)

# Path is relative to the notebook's working directory.
TRAIN_FILE = '../data/train.csv'
# Streaming dataset: examples are produced lazily from the CSV.
train_dataset = SBERTInputDataset(TRAIN_FILE)
# Groups the streamed examples into batches of 32 for training.
train_dataloader = DataLoader(train_dataset, batch_size=32)

## Train the model
- SBERT Training

In [23]:
from sentence_transformers import SentenceTransformer, losses, fit_mixin
import os
from datasets import Dataset, DatasetDict

model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.CosineSimilarityLoss(model)

# Disable the Weights & Biases integration for this run.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_API_KEY"] = "disabled"

# Quick fix for a NameError raised during `fit`: expose the `datasets`
# classes on the `fit_mixin` module.
fit_mixin.Dataset = Dataset
fit_mixin.DatasetDict = DatasetDict

# Warm-up is counted in optimizer steps, not epochs. `int(0.1 * EPOCHS)` is 0
# for EPOCHS = 1, which disables warm-up entirely. The streaming dataset has
# no known length here, so a fixed step count is used (100 matches the
# "Train the Model" example in the notes above); tune it for the data size.
WARMUP_STEPS = 100

# NOTE: the recorded run of this cell failed with
# "ImportError: ... requires `accelerate>=0.26.0`" - install accelerate first.
# Device selection is left to `fit` - TODO confirm it picks the GPU.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,
    show_progress_bar=True
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

## Save the model

In [3]:
model.save('output/sbert-finetuned-model')

In [4]:
!zip -r output.zip output/

from google.colab import files

files.download('output.zip')

  adding: output/ (stored 0%)
  adding: output/sbert-finetuned-model/ (stored 0%)
  adding: output/sbert-finetuned-model/1_Pooling/ (stored 0%)
  adding: output/sbert-finetuned-model/1_Pooling/config.json (deflated 57%)
  adding: output/sbert-finetuned-model/special_tokens_map.json (deflated 80%)
  adding: output/sbert-finetuned-model/2_Normalize/ (stored 0%)
  adding: output/sbert-finetuned-model/config_sentence_transformers.json (deflated 35%)
  adding: output/sbert-finetuned-model/config.json (deflated 48%)
  adding: output/sbert-finetuned-model/tokenizer_config.json (deflated 73%)
  adding: output/sbert-finetuned-model/tokenizer.json (deflated 71%)
  adding: output/sbert-finetuned-model/sentence_bert_config.json (deflated 4%)
  adding: output/sbert-finetuned-model/modules.json (deflated 62%)
  adding: output/sbert-finetuned-model/vocab.txt (deflated 53%)
  adding: output/sbert-finetuned-model/README.md (deflated 68%)
  adding: output/sbert-finetuned-model/model.safetensors (deflate

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Use the Model

In [1]:
! unzip output.zip

Archive:  output.zip
   creating: output/
   creating: output/sbert-finetuned-model/
   creating: output/sbert-finetuned-model/1_Pooling/
  inflating: output/sbert-finetuned-model/1_Pooling/config.json  
  inflating: output/sbert-finetuned-model/special_tokens_map.json  
   creating: output/sbert-finetuned-model/2_Normalize/
  inflating: output/sbert-finetuned-model/config_sentence_transformers.json  
  inflating: output/sbert-finetuned-model/config.json  
  inflating: output/sbert-finetuned-model/tokenizer_config.json  
  inflating: output/sbert-finetuned-model/tokenizer.json  
  inflating: output/sbert-finetuned-model/sentence_bert_config.json  
  inflating: output/sbert-finetuned-model/modules.json  
  inflating: output/sbert-finetuned-model/vocab.txt  
  inflating: output/sbert-finetuned-model/README.md  
  inflating: output/sbert-finetuned-model/model.safetensors  


## Evaluate the model

In [36]:
from sklearn.metrics import confusion_matrix, f1_score, log_loss
import pandas as pd
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer


# Same accelerator preference everywhere in the notebook: CUDA, then Apple
# MPS, then CPU. A plain cuda/cpu choice would fall back to the CPU on a Mac.
if torch.cuda.is_available():
  device = 'cuda'
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'
model_path = '../model/output/sbert-finetuned-model'

# Load the fine-tuned model from the saved output directory.
model = SentenceTransformer(model_path)

def cosine_similarity(x, y):
  """Return the cosine similarity of matching rows of `x` and `y`.

  Each input is first scaled to unit L2 norm along dim 1; the similarity is
  then taken along dim 1 as well, so both inputs must be at least 2-D.
  """
  normalised = [F.normalize(batch, p=2, dim=1) for batch in (x, y)]  # p=2: Euclidean norm
  return F.cosine_similarity(*normalised, dim=1)


def evaluate_model(model, test_file, return_df = False, threshold=0.75):
  """Score every question pair in `test_file` and print evaluation metrics.

  Uses the module-level `device` and `cosine_similarity` defined in this cell.

  Args:
    model: Object whose `encode` method turns a list of sentences into
      embeddings (here a SentenceTransformer).
    test_file: CSV with `question1`, `question2` and `is_duplicate` columns.
    return_df: When true, return the scored DataFrame.
    threshold: Similarity above which a pair is predicted as duplicate.

  Returns:
    The DataFrame with added `similarity_scores` and `clculated_similarity`
    columns when `return_df` is true, otherwise None.
  """
  df = pd.read_csv(test_file, engine='python')
  # Missing questions are read as NaN (a float, not text). Encode them as
  # empty strings explicitly rather than relying on how the tokenizer treats
  # non-string input.
  sentences_1 = df['question1'].fillna('').astype(str).tolist()
  sentences_2 = df['question2'].fillna('').astype(str).tolist()

  # Both columns are encoded with identical settings.
  encoded_1, encoded_2 = (
      model.encode(
          sentences,
          batch_size=32,
          show_progress_bar=True,
          normalize_embeddings=True,
          convert_to_tensor=True,
          device=device
          )
      for sentences in (sentences_1, sentences_2)
  )

  similarity_scores = cosine_similarity(
      encoded_1,
      encoded_2
      )

  df['similarity_scores'] = similarity_scores.tolist()

  # Vectorised thresholding; equivalent to a row-wise `1 if score > t else 0`.
  df['clculated_similarity'] = (df['similarity_scores'] > threshold).astype(int)

  cm = confusion_matrix(
      df['is_duplicate'],
      df['clculated_similarity'],
      normalize= 'true'
      )
  print(f'confusion metircs: {cm}')

  F1 = f1_score(df['is_duplicate'], df['clculated_similarity'])
  print(f'F1 Score: {F1}')

  # log_loss expects predicted probabilities. Hard 0/1 predictions only
  # measure the error rate scaled by the clipping constant, so the similarity
  # score (clipped into [0, 1]) is used as the duplicate probability instead.
  # `labels` keeps the call valid even if the file contains a single class.
  lg = log_loss(
      df['is_duplicate'],
      df['similarity_scores'].clip(0, 1),
      labels=[0, 1]
      )
  print(f'Log Loss: {lg}')

  if return_df:
      return df

def calculate_similarity(model, sentence_1, sentence_2):
  """Cosine similarity between the sentences in `sentence_1` and `sentence_2`.

  Uses the module-level `device` and `cosine_similarity` defined in this cell.

  Args:
    model: Object whose `encode` method turns a list of sentences into a
      2-D tensor of embeddings (here a SentenceTransformer).
    sentence_1: A single string or a list of strings.
    sentence_2: A single string or a list of strings, paired with
      `sentence_1` row by row.

  Returns:
    A single float when both inputs are strings, otherwise a list of floats
    with one score per pair.
  """
  single_pair = isinstance(sentence_1, str) and isinstance(sentence_2, str)

  # Always encode a batch: for a bare string encode() returns a 1-D vector,
  # which the dim=1 operations in `cosine_similarity` cannot handle.
  if isinstance(sentence_1, str):
    sentence_1 = [sentence_1]
  if isinstance(sentence_2, str):
    sentence_2 = [sentence_2]

  # A progress bar for a one-sentence batch is only noise.
  show_progress = not single_pair

  encoded_1 = model.encode(
      sentence_1,
      batch_size=32,
      show_progress_bar=show_progress,
      normalize_embeddings=True,
      convert_to_tensor=True,
      device=device
      )

  encoded_2 = model.encode(
      sentence_2,
      batch_size=32,
      show_progress_bar=show_progress,
      normalize_embeddings=True,
      convert_to_tensor=True,
      device=device
      )
  similarity_scores = cosine_similarity(
      encoded_1,
      encoded_2
      ).tolist()
  return similarity_scores[0] if single_pair else similarity_scores

def predict_duplicates(model, file, threshold=0.75):
  """Predict duplicate flags for every question pair in a CSV file.

  Args:
    model: Passed through to `calculate_similarity`.
    file: CSV with `question1` and `question2` columns.
    threshold: Similarity above which a pair is predicted as duplicate.

  Returns:
    The DataFrame with added `similarity_scores` (float) and
    `clculated_similarity` (0/1) columns.
  """
  df = pd.read_csv(file, engine='python')

  # Missing questions are read as NaN (a float, not text); encode them as
  # empty strings explicitly.
  questions_1 = df['question1'].fillna('').astype(str).tolist()
  questions_2 = df['question2'].fillna('').astype(str).tolist()

  # Score all pairs in one batched call. Encoding row by row would run the
  # model once per row and hand it bare strings instead of lists.
  if df.empty:
    scores = []
  else:
    scores = calculate_similarity(model, questions_1, questions_2)

  df['similarity_scores'] = pd.Series(scores, index=df.index, dtype=float)
  df['clculated_similarity'] = (df['similarity_scores'] > threshold).astype(int)
  return df

# df = evaluate_model(model, '../data/train.csv', return_df=True)
# df = predict_duplicates(model, 'test.csv')
# df.head()

## Wrong prediction
- Most of the wrong predictions are due to mislabelled examples
- The 'is_duplicate' labels are not correct
    - What is the ideal life after retirement? (question 1)
    - What's life after retirement?
    - 'is_duplicate' flag for this is 0. However, the questions seem to have similar meaning

In [10]:
df_wrong = df.loc[
    df['clculated_similarity'] != df['is_duplicate']
    ]

In [8]:
df.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
       'similarity_scores', 'clculated_similarity'],
      dtype='object')

In [11]:
# False positives: wrong predictions where the model said "duplicate".
# The mask is built from `df_wrong` itself so it always matches the frame
# being filtered, instead of relying on index alignment with `df`.
df_wrong.loc[
     df_wrong['clculated_similarity'] == 1
    ]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,similarity_scores,clculated_similarity
41,41,83,84,When can I expect my Cognizant confirmation mail?,When can I expect Cognizant confirmation mail?,0,0.971432,1
139,139,279,280,What is the ideal life after retirement?,What's life after retirement?,0,0.868934,1
202,202,405,406,What's the best way to start learning robotics?,What is the best way to start robotics? Which ...,0,0.849427,1
277,277,554,555,How do most people die?,How do people die?,0,0.890599,1
298,298,596,597,On what online platforms can I post ads for be...,What online platforms can I post ads for beer ...,0,0.996689,1
...,...,...,...,...,...,...,...,...
404114,404114,537757,537758,Where can I buy this dress? Is it expensive?,Where can I buy this dress?,0,0.769026,1
404167,404167,537817,15196,If you could ask every human presently alive o...,If you could get now instantly and completely ...,0,0.830214,1
404222,404222,211594,523550,How can I make money in travelling?,Is there any way you can make money from trave...,0,0.957303,1
404256,404256,89818,537904,What are some of the most common questions ask...,What are the most common questions asked in in...,0,0.959023,1


## Retrieving similar questions
- ANN (Approximate Nearest Neighbour)
- faiss (implementation by Facebook)
- Observations
  - Some of the answers were good
  - Some answers need improvements like
    - Similarity between "How do I read and find my YouTube comments?" and "How can I see all of my comments on Quora?" was found to be 0.9244135618209839
- Further actions
  - Change the loss function to `ContrastiveLoss`

In [16]:
## find the match
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch

# Same accelerator preference as the first model cell: CUDA, then Apple MPS,
# then CPU.
if torch.cuda.is_available():
  device = 'cuda'
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'
model = SentenceTransformer("../model/output/sbert-finetuned-model")
df = pd.read_csv('../data/train.csv', engine='python')

sentences_1 = df['question1'].tolist()
sentences_2 = df['question2'].tolist()


def encode_for_faiss(sentences):
  """Encode `sentences` into L2-normalised numpy embeddings for faiss."""
  return model.encode(
      sentences,
      batch_size=32,
      show_progress_bar=True,
      normalize_embeddings=True,
      convert_to_numpy=True,
      device=device
  )


encoded1 = encode_for_faiss(sentences_1)
encoded2 = encode_for_faiss(sentences_2)

# In-place re-normalisation. The embeddings were already requested normalised,
# so this is a safeguard: inner product equals cosine similarity only for
# unit-length vectors.
faiss.normalize_L2(encoded1)
faiss.normalize_L2(encoded2)

# IndexFlatIP performs an exact (brute-force) inner-product search, not an
# approximate one; question2 embeddings form the searchable collection.
index = faiss.IndexFlatIP(encoded2.shape[1])
index.add(encoded2)

# For every question1 embedding, fetch its `top_k` most similar question2 rows.
top_k = 5
similarites, indices = index.search(encoded1, top_k)

Batches:   0%|          | 0/12635 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Batches:   0%|          | 0/12635 [00:00<?, ?it/s]

In [5]:
similarites

array([[0.91227645, 0.8604805 , 0.8574519 , 0.8574519 , 0.84151256],
       [0.8410276 , 0.6551413 , 0.61047506, 0.5974746 , 0.5974746 ],
       [0.78315544, 0.7647015 , 0.71803963, 0.71803963, 0.71803963],
       ...,
       [0.74668044, 0.7255267 , 0.6535923 , 0.6329106 , 0.6018275 ],
       [0.70431125, 0.701547  , 0.6977883 , 0.674984  , 0.6730344 ],
       [0.9722142 , 0.91270566, 0.8314768 , 0.8136778 , 0.7995485 ]],
      dtype=float32)

In [7]:
indices

array([[     0, 330730, 178514,  59733, 245368],
       [263614,      1,  86052, 133496, 131477],
       [269902, 127219, 257764, 147891,   1918],
       ...,
       [340794, 404287, 257235, 177369, 160305],
       [ 71510, 292255,  69877, 318344, 328232],
       [404289,  83668,  44714, 253035, 269078]])

In [47]:
NUM_QUERIES_TO_SHOW = 30   # how many question1 rows to inspect
MIN_SIMILARITY = .75       # matches at or below this score are not shown

# The loop variable is deliberately not called `index`: that name holds the
# faiss index built earlier, and overwriting it with an int would break any
# later `index.search(...)` call in the same kernel.
for query_idx in range(NUM_QUERIES_TO_SHOW):
    sim_q = []
    # Results are sorted by decreasing similarity, so stop at the first miss.
    for i, s in zip(indices[query_idx], similarites[query_idx]):
        if s > MIN_SIMILARITY:
            sim_q.append(f"{df['question2'][i]} (similarity:{s})")
        else:
            break
    if len(sim_q) > 0:
        print(f"question: \n{df['question1'][query_idx]}")
        print('----------------------------------')
        print('similar questions')
        for item in sim_q:
            print(item)
        print('------------------------------------')

question: 
What is the step by step guide to invest in share market in india?
----------------------------------
similar questions
How do I invest in mutual funds in India? (similarity:0.9152430295944214)
How do I invest in mutual funds in India? (similarity:0.9152430295944214)
How do I publish a book in India? (similarity:0.9151650667190552)
How do I invest money in the stock markets of India? (similarity:0.905360221862793)
What are some good ways to invest money for a short period of time in India? (similarity:0.8961689472198486)
------------------------------------
question: 
How can I be a good geologist?
----------------------------------
similar questions
What should I do to be a great geologist? (similarity:0.982363224029541)
------------------------------------
question: 
When do you use シ instead of し?
----------------------------------
similar questions
When do you use シ instead of し? (similarity:0.9999998807907104)
------------------------------------
question: 
How do I rea

In [26]:
indices[0]

array([202369, 111062, 156698, 399128, 234556])

In [42]:
# Imported under an alias: the plain name `cosine_similarity` is the
# notebook's own torch helper, which `evaluate_model` and
# `calculate_similarity` look up at call time. Shadowing it with the sklearn
# function would break those functions for the rest of the session.
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

# Spot-check one retrieved match by recomputing its similarity directly.
s1 = model.encode(df.loc[0, 'question1'], convert_to_numpy=True)
s2 = model.encode(df.loc[156698, 'question2'], convert_to_numpy=True)
# sklearn expects 2-D inputs, hence the reshape of each single embedding.
sk_cosine_similarity(s1.reshape(1, -1), s2.reshape(1, -1))

  return forward_call(*args, **kwargs)


array([[0.915165]], dtype=float32)