In [1]:
"""
This script contains an example how to extend an existent sentence embedding model to new languages.

Given a (monolingual) teacher model you would like to extend to new languages, which is specified in the teacher_model_name
variable. We train a multilingual student model to imitate the teacher model (variable student_model_name)
on multiple languages.

For training, you need parallel sentence data (machine translation training data). You need tab-seperated files (.tsv)
with the first column a sentence in a language understood by the teacher model, e.g. English,
and the further columns contain the according translations for languages you want to extend to.

This scripts downloads automatically the parallel sentences corpus. This corpus contains transcripts from
talks translated to 100+ languages. For other parallel data, see get_parallel_data_[].py scripts

Further information can be found in our paper:
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation
https://arxiv.org/abs/2004.09813
"""
#imports

import logging
import traceback
from datetime import datetime

import numpy as np

from datasets import DatasetDict, load_dataset
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import (
    EmbeddingSimilarityEvaluator,
    MSEEvaluator,
    SequentialEvaluator,
    TranslationEvaluator,
)
from sentence_transformers.losses import MSELoss
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)


  from pandas.core import (


In [2]:
#intitialzing  parameters

# The teacher model is monolingual, we use it for English embeddings
teacher_model_name = "all-MiniLM-L12-v2"
# The student model is multilingual, we train it such that embeddings of non-English texts mimic the teacher model's English embeddings
student_model_name = "all-MiniLM-L6-v2"

student_max_seq_length = 128  # Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64  # Batch size for training
inference_batch_size = 64  # Batch size at inference
max_sentences_per_language = 500000  # Maximum number of  parallel sentences for training

num_train_epochs = 5  # Train for x epochs
num_evaluation_steps = 5000  # Evaluate performance after every xxxx steps


# Define the language codes you would like to extend the model to
source_languages = set(["en"])  # Our teacher model accepts English (en) sentences
# We want to extend the model to these new languages. For language codes, see the header of the train file
# target_languages = set(["de", "es", "it", "fr", "ar", "tr"])
target_languages = set([ "hi"])

output_dir = (
    "output/make-multilingual-"
    + "-".join(sorted(list(source_languages)) + sorted(list(target_languages)))
    + "-"
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)


In [3]:

# 1a. Here we define our SentenceTransformer teacher model.
teacher_model = SentenceTransformer(teacher_model_name)
# If we want, we can limit the maximum sequence length for the model
# teacher_model.max_seq_length = 128
logging.info(f"Teacher model: {teacher_model}")

# 1b. Here we define our SentenceTransformer student model. If not already a Sentence Transformer model,
# it will automatically create one with "mean" pooling.
student_model = SentenceTransformer(student_model_name)
# If we want, we can limit the maximum sequence length for the model
student_model.max_seq_length = student_max_seq_length
logging.info(f"Student model: {student_model}")

# 2. Load the parallel sentences training dataset: https://huggingface.co/datasets?other=sentence-transformers&sort=trending&search=parallel-sentences
# NOTE: We can also use multiple datasets if we want
dataset_to_use = "sentence-transformers/parallel-sentences-talks"
# dataset_to_use = "sentence-transformers/parallel-sentences-europarl"
# dataset_to_use = "sentence-transformers/parallel-sentences-global-voices"
# dataset_to_use = "sentence-transformers/parallel-sentences-muse"
# dataset_to_use = "sentence-transformers/parallel-sentences-jw300"
# dataset_to_use = "sentence-transformers/parallel-sentences-news-commentary"
# dataset_to_use = "sentence-transformers/parallel-sentences-opensubtitles"
# dataset_to_use = "sentence-transformers/parallel-sentences-tatoeba"
# dataset_to_use = "sentence-transformers/parallel-sentences-wikimatrix"
# dataset_to_use = "sentence-transformers/parallel-sentences-wikititles"


2024-07-13 02:23:44 - Use pytorch device_name: mps
2024-07-13 02:23:44 - Load pretrained SentenceTransformer: all-MiniLM-L12-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2024-07-13 02:24:20 - Teacher model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
2024-07-13 02:24:20 - Use pytorch device_name: mps
2024-07-13 02:24:20 - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-07-13 02:24:22 - Student model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False

In [4]:
train_dataset_dict = DatasetDict()
eval_dataset_dict = DatasetDict()
for source_lang in source_languages:
    for target_lang in target_languages:
        subset = f"{source_lang}-{target_lang}"
        try:
            train_dataset = load_dataset(dataset_to_use, subset, split="train")
            if len(train_dataset) > max_sentences_per_language:
                train_dataset = train_dataset.select(range(max_sentences_per_language))
        except Exception as exc:
            logging.error(f"Could not load dataset {dataset_to_use}/{source_lang}-{target_lang}: {exc}")
            continue

        try:
            eval_dataset = load_dataset(dataset_to_use, subset, split="dev")
            if len(eval_dataset) > 1000:
                eval_dataset = eval_dataset.select(range(1000))
        except Exception:
            logging.info(
                f"Could not load dataset {dataset_to_use}/{source_lang}-{target_lang} dev split, splitting 1k samples from train"
            )
            dataset = train_dataset.train_test_split(test_size=1000, shuffle=True)
            train_dataset = dataset["train"]
            eval_dataset = dataset["test"]

        train_dataset_dict[subset] = train_dataset
        eval_dataset_dict[subset] = eval_dataset
logging.info(train_dataset_dict)



2024-07-13 02:25:34 - DatasetDict({
    en-hi: Dataset({
        features: ['english', 'non_english'],
        num_rows: 45403
    })
})


In [5]:

# We want the student EN embeddings to be similar to the teacher EN embeddings and
# the student non-EN embeddings to be similar to the teacher EN embeddings
def prepare_dataset(batch):
    return {
        "english": batch["english"],
        "non_english": batch["non_english"],
        "label": teacher_model.encode(batch["english"], batch_size=inference_batch_size, show_progress_bar=False),
    }

In [6]:

column_names = list(train_dataset_dict.values())[0].column_names
train_dataset_dict = train_dataset_dict.map(
    prepare_dataset, batched=True, batch_size=30000, remove_columns=column_names
)
logging.info("Prepared datasets for training:", train_dataset_dict)

# 3. Define our training loss
# MSELoss (https://sbert.net/docs/package_reference/sentence_transformer/losses.html#mseloss) needs one text columns and one
# column with embeddings from the teacher model
train_loss = MSELoss(model=student_model)

# 4. Define evaluators for use during training. This is useful to keep track of alongside the evaluation loss.
evaluators = []

for subset, eval_dataset in eval_dataset_dict.items():
    logger.info(f"Creating evaluators for {subset}")

    # Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
    dev_mse = MSEEvaluator(
        source_sentences=eval_dataset["english"],
        target_sentences=eval_dataset["non_english"],
        name=subset,
        teacher_model=teacher_model,
        batch_size=inference_batch_size,
    )
    evaluators.append(dev_mse)

    # TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of
    # source[i] is the closest to target[i] out of all available target sentences
    dev_trans_acc = TranslationEvaluator(
        source_sentences=eval_dataset["english"],
        target_sentences=eval_dataset["non_english"],
        name=subset,
        batch_size=inference_batch_size,
    )
    evaluators.append(dev_trans_acc)

    # Try to load this subset from STS17
#     test_dataset = None
#     try:
#         test_dataset = load_dataset("mteb/sts17-crosslingual-sts", subset, split="test")
#     except Exception:
#         try:
#             test_dataset = load_dataset("mteb/sts17-crosslingual-sts", f"{subset[3:]}-{subset[:2]}", split="test")
#             subset = f"{subset[3:]}-{subset[:2]}"
#         except Exception:
#             pass
#     if test_dataset:
#         test_evaluator = EmbeddingSimilarityEvaluator(
#             sentences1=test_dataset["sentence1"],
#             sentences2=test_dataset["sentence2"],
#             scores=[score / 5.0 for score in test_dataset["score"]],  # Convert 0-5 scores to 0-1 scores
#             batch_size=inference_batch_size,
#             name=f"sts17-{subset}-test",
#             show_progress_bar=False,
#         )
#         evaluators.append(test_evaluator)

evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores))
# Now also prepare the evaluation datasets for training
eval_dataset_dict = eval_dataset_dict.map(prepare_dataset, batched=True, batch_size=30000, remove_columns=column_names)


Map:   0%|          | 0/45403 [00:00<?, ? examples/s]

2024-07-13 02:27:44 - Prepared datasets for training:
2024-07-13 02:27:44 - Creating evaluators for en-hi


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:

# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    learning_rate=2e-5,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=num_evaluation_steps,
    save_strategy="steps",
    save_steps=num_evaluation_steps,
    save_total_limit=2,
    logging_steps=100,
    run_name=f"multilingual-{'-'.join(source_languages)}-{'-'.join(target_languages)}",  # Will be used in W&B if `wandb` is installed
)

# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
    model=student_model,
    args=args,
    train_dataset=train_dataset_dict,
    eval_dataset=eval_dataset_dict,
    loss=train_loss,
    evaluator=evaluator,
)


In [9]:
trainer.train()

RuntimeError: User specified an unsupported autocast device_type 'mps'

In [None]:

# 7. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
student_model.save(final_output_dir)

# 8. (Optional) save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = student_model_name if "/" not in student_model_name else student_model_name.split("/")[-1]
try:
    student_model.push_to_hub(f"{model_name}-multilingual-{'-'.join(source_languages)}-{'-'.join(target_languages)}")
except Exception:
    logging.error(
        f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
        f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({final_output_dir!r})` "
        f"and saving it using `model.push_to_hub('{model_name}-multilingual-{'-'.join(source_languages)}-{'-'.join(target_languages)}')`."
    )

In [17]:
sentence = "my name is abhishek"

In [18]:
embedding = student_model.encode(sentence)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
def generate_embedding(sentence):
    return student_model.encode(sentence)

In [20]:
len(embedding)

384

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_cosine_similarity(vec1, vec2):
    # Ensure the input vectors are numpy arrays
    vec1 = np.array(vec1).reshape(1, -1)
    vec2 = np.array(vec2).reshape(1, -1)
    
    # Calculate cosine similarity
    similarity = cosine_similarity(vec1, vec2)
    
    # The result is a 2D array, extract the value from it
    return similarity[0][0]

In [22]:
def get_similarity(sent1, sent2):
    emb1 = generate_embedding(sent1)
    emb2 = generate_embedding(sent2)
    cos_sim = calculate_cosine_similarity(emb1, emb2)
    return cos_sim
    

In [23]:
get_similarity("my name is abhishek","my name is abhishek")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.99999994

In [24]:
get_similarity("my name is abhishek","i am eating food")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.36388004

In [25]:
def find_top_matches(source_sentences, target_sentences, top_n=3):
    top_matches = {}
    for src_sent in source_sentences:
        matches = []
        for tgt_sent in target_sentences:
            similarity = get_similarity(src_sent, tgt_sent)
            matches.append((tgt_sent, similarity))
        # Sort matches by similarity (higher similarity first) and get top N
        top_matches[src_sent] = sorted(matches, key=lambda x: x[1], reverse=True)[:top_n]
    return top_matches


# Example source and target sentences
source_sentences = [
    "मुझे हिंदी भाषा बहुत पसंद है",
    "आज मौसम बहुत अच्छा है",
    "मैं स्कूल जा रहा हूँ",
    "खाना बहुत स्वादिष्ट है"
]

target_sentences = [
    "मुझे हिंदी सीखना पसंद है",
    "आज का मौसम सुहावना है",
    "मैं स्कूल जा रहा हूँ",
    "यह खाना बहुत स्वादिष्ट है",
    "मुझे यह भाषा बहुत पसंद है",
    "बाहर मौसम बहुत अच्छा है",
    "मैं कॉलेज जा रहा हूँ",
    "खाना बहुत स्वादिष्ट और ताज़ा है",
    "मुझे भाषा सीखना अच्छा लगता है",
    "आज का मौसम बहुत अच्छा है"
]

# Run the experiment to find top 3 matches
top_matches = find_top_matches(source_sentences, target_sentences, top_n=3)

# Print the top 3 matches for each source sentence
for src_sent, matches in top_matches.items():
    print(f"Top 3 matches for '{src_sent}':")
    for idx, (match_sent, similarity) in enumerate(matches, 1):
        print(f"  {idx}. '{match_sent}' with similarity {similarity}")
    print()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Top 3 matches for 'मुझे हिंदी भाषा बहुत पसंद है':
  1. 'मुझे यह भाषा बहुत पसंद है' with similarity 0.940321683883667
  2. 'मुझे भाषा सीखना अच्छा लगता है' with similarity 0.9368987679481506
  3. 'मुझे हिंदी सीखना पसंद है' with similarity 0.9238975048065186

Top 3 matches for 'आज मौसम बहुत अच्छा है':
  1. 'आज का मौसम बहुत अच्छा है' with similarity 0.991140604019165
  2. 'बाहर मौसम बहुत अच्छा है' with similarity 0.982707142829895
  3. 'आज का मौसम सुहावना है' with similarity 0.9567022323608398

Top 3 matches for 'मैं स्कूल जा रहा हूँ':
  1. 'मैं स्कूल जा रहा हूँ' with similarity 1.000000238418579
  2. 'मैं कॉलेज जा रहा हूँ' with similarity 0.987082839012146
  3. 'आज का मौसम बहुत अच्छा है' with similarity 0.827501118183136

Top 3 matches for 'खाना बहुत स्वादिष्ट है':
  1. 'खाना बहुत स्वादिष्ट और ताज़ा है' with similarity 0.98640376329422
  2. 'यह खाना बहुत स्वादिष्ट है' with similarity 0.9604611396789551
  3. 'आज का मौसम सुहावना है' with similarity 0.952528178691864



In [26]:
# Source Sentences
source_sentences = [
    "मुझे यह फ़िल्म बहुत पसंद है, क्या तुमने देखी है?",
    "यह किताब मेरे लिए बहुत महत्वपूर्ण है।",
    "क्या तुम्हारे पास कोई अच्छा समाधान है इस समस्या का?",
    "वह खुश है अपने नए काम से।"
]

# Target Sentences
target_sentences = [
    "मुझे भी यह फ़िल्म बहुत अच्छी लगी, मैंने कई बार देखी है।",
    "यह किताब तुम्हारे लिए कितनी महत्वपूर्ण है, वह मुझे बताओ।",
    "क्या तुम्हें इस समस्या का कोई नया समाधान पता है?",
    "उसका नया काम उसे बहुत खुश कर रहा है।",
    "फिल्म देखना मेरी पसंद है, मुझे वह बहुत पसंद है।",
    "यहाँ पर फ़िल्म नहीं दिखाई जा रही है, कल आएगी।",
    "वह किताब मेरी पसंद की है, मुझे भी बहुत अच्छी लगी।",
    "यह विषय मुझे बहुत पसंद है, मैंने इसकी अध्ययन बहुत किया है।",
    "क्या तुम्हें इस समस्या का कोई अन्य समाधान आता है?",
    "उसके काम में बहुत बदलाव आया है, अब वह खुश है।"
]

# Run the experiment to find top 3 matches
top_matches = find_top_matches(source_sentences, target_sentences, top_n=3)

# Print the top 3 matches for each source sentence
for src_sent, matches in top_matches.items():
    print(f"Top 3 matches for '{src_sent}':")
    for idx, (match_sent, similarity) in enumerate(matches, 1):
        print(f"  {idx}. '{match_sent}' with similarity {similarity}")
    print()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Top 3 matches for 'मुझे यह फ़िल्म बहुत पसंद है, क्या तुमने देखी है?':
  1. 'मुझे भी यह फ़िल्म बहुत अच्छी लगी, मैंने कई बार देखी है।' with similarity 0.7783318758010864
  2. 'फिल्म देखना मेरी पसंद है, मुझे वह बहुत पसंद है।' with similarity 0.7720270752906799
  3. 'यह किताब तुम्हारे लिए कितनी महत्वपूर्ण है, वह मुझे बताओ।' with similarity 0.6688283085823059

Top 3 matches for 'यह किताब मेरे लिए बहुत महत्वपूर्ण है।':
  1. 'यह विषय मुझे बहुत पसंद है, मैंने इसकी अध्ययन बहुत किया है।' with similarity 0.9512735605239868
  2. 'यह किताब तुम्हारे लिए कितनी महत्वपूर्ण है, वह मुझे बताओ।' with similarity 0.8466460108757019
  3. 'वह किताब मेरी पसंद की है, मुझे भी बहुत अच्छी लगी।' with similarity 0.8373582363128662

Top 3 matches for 'क्या तुम्हारे पास कोई अच्छा समाधान है इस समस्या का?':
  1. 'क्या तुम्हें इस समस्या का कोई नया समाधान पता है?' with similarity 0.9907006621360779
  2. 'क्या तुम्हें इस समस्या का कोई अन्य समाधान आता है?' with similarity 0.987912654876709
  3. 'यह किताब तुम्हारे लिए कितनी म

In [27]:
target_sentences = ["नरेन्द्र मोदी का जन्म तत्कालीन बॉम्बे राज्य के महेसाना जिला स्थित वडनगर ग्राम में हीराबेन मोदी और दामोदरदास मूलचन्द मोदी के एक मध्यम-वर्गीय परिवार में १७ सितम्बर १९५० को हुआ था।[ वह पैदा हुए छह बच्चों में तीसरे थे। मोदी का परिवार 'मोध-घांची-तेली' समुदाय से था, जिसे भारत सरकार द्वारा अन्य पिछड़ा वर्ग के रूप में वर्गीकृत किया जाता है। वह पूर्णत: शाकाहारी हैं। भारत पाकिस्तान के बीच द्वितीय युद्ध के दौरान अपने तरुणकाल में उन्होंने स्वेच्छा से रेलवे स्टेशनों पर सफ़र कर रहे सैनिकों की सेवा की। युवावस्था में वह छात्र संगठन अखिल भारतीय विद्यार्थी परिषद में शामिल हुए | ",
"उन्होंने साथ ही साथ भ्रष्टाचार विरोधी नव निर्माण आन्दोलन में हिस्सा लिया। एक पूर्णकालिक आयोजक के रूप में कार्य करने के पश्चात् उन्हें भारतीय जनता पार्टी में संगठन का प्रतिनिधि मनोनीत किया गया।  किशोरावस्था में अपने भाई के साथ एक चाय की दुकान चला चुके मोदी ने अपनी स्कूली शिक्षा वड़नगर में पूरी की।[28] उन्होंने आरएसएस के प्रचारक रहते हुए 1980 में गुजरात विश्वविद्यालय से राजनीति विज्ञान में स्नातकोत्तर परीक्षा दी और विज्ञान स्नातकोत्तर की डिग्री प्राप्त की।",
"अपने माता-पिता की कुल छ: सन्तानों में तीसरे पुत्र नरेन्द्र ने बचपन में रेलवे स्टेशन पर चाय बेचने में अपने पिता का भी हाथ बँटाया। बड़नगर के ही एक स्कूल मास्टर के अनुसार नरेन्द्र हालाँकि एक औसत दर्ज़े का छात्र था, लेकिन वाद-विवाद और नाटक प्रतियोगिताओं में उसकी बेहद रुचि थी। इसके अलावा उसकी रुचि राजनीतिक विषयों पर नयी-नयी परियोजनाएँ प्रारम्भ करने की भी थी।",
"13 वर्ष की आयु में नरेन्द्र की सगाई जसोदा बेन चमनलाल के साथ कर दी गयी और जब उनका विवाह हुआ, तब वह मात्र 17 वर्ष के थे। फाइनेंशियल एक्सप्रेस की एक खबर के अनुसार पति-पत्नी ने कुछ वर्ष साथ रहकर बिताये। परन्तु कुछ समय बाद वे दोनों एक दूसरे के लिये अजनबी हो गये क्योंकि नरेन्द्र मोदी ने उनसे कुछ ऐसी ही इच्छा व्यक्त की थी।जबकि नरेन्द्र मोदी के जीवनी-लेखक ऐसा नहीं मानते। उनका कहना है:",
"उन दोनों की शादी जरूर हुई परन्तु वे दोनों एक साथ कभी नहीं रहे। शादी के कुछ बरसों बाद नरेन्द्र मोदी ने घर त्याग दिया और एक प्रकार से उनका वैवाहिक जीवन लगभग समाप्त-सा ही हो गया।,पिछले चार विधान सभा चुनावों में अपनी वैवाहिक स्थिति पर खामोश रहने के बाद नरेन्द्र मोदी ने कहा कि अविवाहित रहने की जानकारी देकर उन्होंने कोई पाप नहीं किया। नरेन्द्र मोदी के मुताबिक एक शादीशुदा के मुकाबले अविवाहित व्यक्ति भ्रष्टाचार के ख़िलाफ़ जोरदार तरीके से लड़ सकता है क्योंकि उसे अपनी पत्नी, परिवार व बालबच्चों की कोई चिन्ता नहीं रहती।[ हालांकि नरेन्द्र मोदी ने शपथ पत्र प्रस्तुत कर जसोदाबेन को अपनी पत्नी स्वीकार किया है।"
]

source_sentences = ["नरेंद्र मोदी का जन्म कहाँ हुआ था और उनके परिवार की क्या पृष्ठभूमि थी?",
"द्वितीय भारत-पाकिस्तान युद्ध के दौरान नरेंद्र मोदी ने कैसे योगदान दिया था?",
"नरेंद्र मोदी का शैक्षिक पृष्ठभूमि और पहले करियर क्या था?",
"नरेंद्र मोदी ने 17 वर्ष की आयु में किससे विवाह किया था और उनके संबंध में बाद में क्या हुआ?",
"नरेंद्र मोदी अपनी वैवाहिक स्थिति को अपने भ्रष्टाचार विरुद्ध लड़ाई से कैसे जोड़ते हैं?"
]

In [28]:
top_matches = find_top_matches(source_sentences, target_sentences, top_n=3)

# Print the top 3 matches for each source sentence
for src_sent, matches in top_matches.items():
    print(f"Top 3 matches for '{src_sent}':")
    for idx, (match_sent, similarity) in enumerate(matches, 1):
        print(f"  {idx}. '{match_sent}' with similarity {similarity}")
    print()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Top 3 matches for 'नरेंद्र मोदी का जन्म कहाँ हुआ था और उनके परिवार की क्या पृष्ठभूमि थी?':
  1. 'अपने माता-पिता की कुल छ: सन्तानों में तीसरे पुत्र नरेन्द्र ने बचपन में रेलवे स्टेशन पर चाय बेचने में अपने पिता का भी हाथ बँटाया। बड़नगर के ही एक स्कूल मास्टर के अनुसार नरेन्द्र हालाँकि एक औसत दर्ज़े का छात्र था, लेकिन वाद-विवाद और नाटक प्रतियोगिताओं में उसकी बेहद रुचि थी। इसके अलावा उसकी रुचि राजनीतिक विषयों पर नयी-नयी परियोजनाएँ प्रारम्भ करने की भी थी।' with similarity 0.8000470995903015
  2. 'नरेन्द्र मोदी का जन्म तत्कालीन बॉम्बे राज्य के महेसाना जिला स्थित वडनगर ग्राम में हीराबेन मोदी और दामोदरदास मूलचन्द मोदी के एक मध्यम-वर्गीय परिवार में १७ सितम्बर १९५० को हुआ था।[ वह पैदा हुए छह बच्चों में तीसरे थे। मोदी का परिवार 'मोध-घांची-तेली' समुदाय से था, जिसे भारत सरकार द्वारा अन्य पिछड़ा वर्ग के रूप में वर्गीकृत किया जाता है। वह पूर्णत: शाकाहारी हैं। भारत पाकिस्तान के बीच द्वितीय युद्ध के दौरान अपने तरुणकाल में उन्होंने स्वेच्छा से रेलवे स्टेशनों पर सफ़र कर रहे सैनिकों की सेवा की। युवावस्था मे