<a href="https://colab.research.google.com/github/aditiiilaturkar/911Interactions/blob/main/text_comparision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8

In [3]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the CSV file
df = pd.read_csv('textcomparison.csv')

# Define a function to compute embeddings for a given text
def get_embedding(text):
    # Compute the embedding using the model
    embedding = model.encode(text)
    return embedding

# Define a function to calculate similarity between two embeddings
def get_similarity(embedding1, embedding2):
    # Convert embeddings to PyTorch tensors
    embedding1 = torch.from_numpy(embedding1)
    embedding2 = torch.from_numpy(embedding2)
    # Calculate the cosine similarity between the two embeddings
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_similarity.item()

# Compute the embeddings for each row and column separately
with open('similarity_results.txt', 'w') as f:
    for uid in range(1, 17):
        row = df[df['UID'] == uid].iloc[0]
        whisper_text = row['Whisper_text']
        google_text = row['Google_text']
        otter_text = row['Otter_text']
        transcription_puppy = row['Transcription_puppy']

        whisper_embedding = get_embedding(whisper_text)
        google_embedding = get_embedding(google_text)
        otter_embedding = get_embedding(otter_text)
        transcription_embedding = get_embedding(transcription_puppy)

        # Calculate similarity between Whisper_text and Google_text
        similarity_whisper_google = get_similarity(whisper_embedding, google_embedding)
        # Calculate similarity between Whisper_text and Otter_text
        similarity_whisper_otter = get_similarity(whisper_embedding, otter_embedding)
        # Calculate similarity between Whisper_text and Transcription_puppy
        similarity_whisper_transcription_puppy = get_similarity(whisper_embedding, transcription_embedding)
      
        # Calculate similarity between google_embedding and Transcription_puppy
        similarity_google_transcription_puppy = get_similarity(google_embedding, transcription_embedding)
        # Calculate similarity between google_embedding and otter
        similarity_google_otter = get_similarity(google_embedding, otter_embedding)

        # Calculate similarity between otter and Transcription_puppy
        similarity_otter_transcription_puppy = get_similarity(otter_embedding, transcription_embedding)

        # Write the similarity results to the output file
        f.write(f"Row {uid}:\n")
        f.write(f"Whisper_text and Google_text similarity: {similarity_whisper_google}\n")
        f.write(f"Whisper_text and Otter_text similarity: {similarity_whisper_otter}\n")
        f.write(f"Whisper_text and Transcription_puppy similarity: {similarity_whisper_transcription_puppy}\n")

        f.write(f"google_embedding and Transcription_puppy similarity: {similarity_google_transcription_puppy}\n")
        f.write(f"google_embedding and Otter_text similarity: {similarity_google_otter}\n")

        f.write(f"otter and Transcription_puppy similarity: {similarity_otter_transcription_puppy}\n")

        f.write('\n')


In [3]:
# Get the embedding for the Whisper_text column of the first row
whisper_text = df.loc[0, 'Whisper_text']
whisper_embedding = get_embedding(whisper_text)

# Print the embedding
print(f"Whisper_text embedding for row 1: {whisper_embedding}")


Whisper_text embedding for row 1: [-3.6503360e-02 -3.4791443e-02  5.6870408e-02 -3.5879335e-03
 -5.5157673e-03 -1.2474084e-02  2.0985087e-02  2.2714704e-02
  5.8695737e-02 -8.0677539e-02 -4.9584169e-02  3.6380652e-02
 -1.7913377e-02  1.9348050e-02 -7.2352432e-02  9.1347713e-03
  7.1022145e-02 -8.8094749e-02 -4.3260448e-02  2.1859084e-01
 -5.1986013e-02  6.6244766e-02  5.8568744e-03 -3.1212496e-02
 -5.5758338e-02  3.4942072e-02  4.6868842e-02  5.1484597e-03
 -5.2868407e-02 -1.7710768e-02  9.8479256e-02  1.0891379e-02
 -5.2128624e-02 -2.6821876e-03  2.3660088e-02  2.2632860e-02
 -3.7540526e-03  2.5258197e-02  1.2735824e-02  2.2318346e-02
 -3.5266622e-03 -6.9684803e-02  4.5196030e-02  6.3925661e-02
 -2.7221054e-02  8.0690019e-02 -2.4157893e-02 -4.3907620e-02
  6.9934778e-02 -4.1195672e-02 -8.9537494e-02 -3.4856694e-04
  1.7773792e-02  3.8660768e-02  4.7992367e-02 -3.0620228e-02
  5.6243867e-02 -3.9403189e-02 -3.6149338e-02 -5.0452137e-03
 -2.7861938e-02  4.5149110e-02  4.2085022e-02  5.59

dataframe


In [5]:
# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the CSV file
df = pd.read_csv('textcomparison.csv')

# Define an empty DataFrame to store the similarity results
similarity_df = pd.DataFrame(columns=['UID', 'similarity_whisper_google', 'similarity_whisper_otter', 
                                       'similarity_whisper_transcription_puppy', 'similarity_google_transcription_puppy', 
                                       'similarity_google_otter', 'similarity_otter_transcription_puppy'])

# Define a function to compute embeddings for a given text
def get_embedding(text):
    # Compute the embedding using the model
    embedding = model.encode(text)
    return embedding

# Define a function to calculate similarity between two embeddings
def get_similarity(embedding1, embedding2):
    # Convert embeddings to PyTorch tensors
    embedding1 = torch.from_numpy(embedding1)
    embedding2 = torch.from_numpy(embedding2)
    # Calculate the cosine similarity between the two embeddings
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_similarity.item()

# Compute the embeddings for each row and column separately
for uid in range(1, 17):
    row = df[df['UID'] == uid].iloc[0]
    whisper_text = row['Whisper_text']
    google_text = row['Google_text']
    otter_text = row['Otter_text']
    transcription_puppy = row['Transcription_puppy']

    whisper_embedding = get_embedding(whisper_text)
    google_embedding = get_embedding(google_text)
    otter_embedding = get_embedding(otter_text)
    transcription_embedding = get_embedding(transcription_puppy)

    # Calculate similarity between Whisper_text and Google_text
    similarity_whisper_google = get_similarity(whisper_embedding, google_embedding)
    # Calculate similarity between Whisper_text and Otter_text
    similarity_whisper_otter = get_similarity(whisper_embedding, otter_embedding)
    # Calculate similarity between Whisper_text and Transcription_puppy
    similarity_whisper_transcription_puppy = get_similarity(whisper_embedding, transcription_embedding)

    # Calculate similarity between google_embedding and Transcription_puppy
    similarity_google_transcription_puppy = get_similarity(google_embedding, transcription_embedding)
    # Calculate similarity between google_embedding and otter
    similarity_google_otter = get_similarity(google_embedding, otter_embedding)

    # Calculate similarity between otter and Transcription_puppy
    similarity_otter_transcription_puppy = get_similarity(otter_embedding, transcription_embedding)

    # Create a dictionary with the similarity results for the current row
    similarity_dict = {'UID': uid,
                       'similarity_whisper_google': similarity_whisper_google,
                       'similarity_whisper_otter': similarity_whisper_otter,
                       'similarity_whisper_transcription_puppy': similarity_whisper_transcription_puppy,
                       'similarity_google_transcription_puppy': similarity_google_transcription_puppy,
                       'similarity_google_otter': similarity_google_otter,
                       'similarity_otter_transcription_puppy': similarity_otter_transcription_puppy}

    # Append the similarity results for the current row as a new row to the DataFrame
    similarity_df = similarity_df.append(similarity_dict, ignore_index=True)
    print(similarity_df)


  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   

   similarity_whisper_transcription_puppy  \
0                                0.570512   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                                0.75015                 0.748107   

   similarity_otter_transcription_puppy  
0                              0.934645  


  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                               0.750150                 0.748107   
1                               0.702531                 0.746380   

   similarity_otter_transcription_puppy  
0                              0.934645  
1                              0.892616  


  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                               0.750150                 0.748107   
1                               0.702531                 0.746380   
2                               0.690854                 0.718479   

   similarity_otter_transcription_puppy  
0                              0.934645  
1                              0.892616  
2                              0.807707  


  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   
3  4.0                   0.519324                  0.662781   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                               0.750150                 0.748107   
1                               0.702531                 0.746380   
2                               0.690854                 0.718479   
3                               0.682140                 0.596330   

   similarity_otter_transcription_puppy  
0                              0.934645  
1                           

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   
3  4.0                   0.519324                  0.662781   
4  5.0                   0.500903                  0.674274   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   
4                                0.670620   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                               0.750150                 0.748107   
1                               0.702531                 0.746380   
2                               0.690854                 0.718479   
3                               0.682140                 0.596330   
4    

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   
3  4.0                   0.519324                  0.662781   
4  5.0                   0.500903                  0.674274   
5  6.0                   0.637301                  0.758191   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   
4                                0.670620   
5                                0.751249   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                               0.750150                 0.748107   
1                               0.702531                 0.746380   
2                               0.6

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   
3  4.0                   0.519324                  0.662781   
4  5.0                   0.500903                  0.674274   
5  6.0                   0.637301                  0.758191   
6  7.0                   0.729738                  0.725994   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   
4                                0.670620   
5                                0.751249   
6                                0.692421   

   similarity_google_transcription_puppy  similarity_google_otter  \
0                               0.750150                 0.748107

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   
3  4.0                   0.519324                  0.662781   
4  5.0                   0.500903                  0.674274   
5  6.0                   0.637301                  0.758191   
6  7.0                   0.729738                  0.725994   
7  8.0                   0.556769                  0.783185   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   
4                                0.670620   
5                                0.751249   
6                                0.692421   
7                                0.737570   

   similarity_google_trans

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


   UID  similarity_whisper_google  similarity_whisper_otter  \
0  1.0                   0.545512                  0.622999   
1  2.0                   0.648007                  0.796144   
2  3.0                   0.530484                  0.572743   
3  4.0                   0.519324                  0.662781   
4  5.0                   0.500903                  0.674274   
5  6.0                   0.637301                  0.758191   
6  7.0                   0.729738                  0.725994   
7  8.0                   0.556769                  0.783185   
8  9.0                   0.615593                  0.673429   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   
4                                0.670620   
5                                0.751249   
6                                0.692421   
7        

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


    UID  similarity_whisper_google  similarity_whisper_otter  \
0   1.0                   0.545512                  0.622999   
1   2.0                   0.648007                  0.796144   
2   3.0                   0.530484                  0.572743   
3   4.0                   0.519324                  0.662781   
4   5.0                   0.500903                  0.674274   
5   6.0                   0.637301                  0.758191   
6   7.0                   0.729738                  0.725994   
7   8.0                   0.556769                  0.783185   
8   9.0                   0.615593                  0.673429   
9  10.0                   0.735528                  0.831416   

   similarity_whisper_transcription_puppy  \
0                                0.570512   
1                                0.773030   
2                                0.615809   
3                                0.664780   
4                                0.670620   
5                        

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


     UID  similarity_whisper_google  similarity_whisper_otter  \
0    1.0                   0.545512                  0.622999   
1    2.0                   0.648007                  0.796144   
2    3.0                   0.530484                  0.572743   
3    4.0                   0.519324                  0.662781   
4    5.0                   0.500903                  0.674274   
5    6.0                   0.637301                  0.758191   
6    7.0                   0.729738                  0.725994   
7    8.0                   0.556769                  0.783185   
8    9.0                   0.615593                  0.673429   
9   10.0                   0.735528                  0.831416   
10  11.0                   0.580259                  0.660423   

    similarity_whisper_transcription_puppy  \
0                                 0.570512   
1                                 0.773030   
2                                 0.615809   
3                                 0

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


     UID  similarity_whisper_google  similarity_whisper_otter  \
0    1.0                   0.545512                  0.622999   
1    2.0                   0.648007                  0.796144   
2    3.0                   0.530484                  0.572743   
3    4.0                   0.519324                  0.662781   
4    5.0                   0.500903                  0.674274   
5    6.0                   0.637301                  0.758191   
6    7.0                   0.729738                  0.725994   
7    8.0                   0.556769                  0.783185   
8    9.0                   0.615593                  0.673429   
9   10.0                   0.735528                  0.831416   
10  11.0                   0.580259                  0.660423   
11  12.0                   0.676546                  0.780141   

    similarity_whisper_transcription_puppy  \
0                                 0.570512   
1                                 0.773030   
2               

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


     UID  similarity_whisper_google  similarity_whisper_otter  \
0    1.0                   0.545512                  0.622999   
1    2.0                   0.648007                  0.796144   
2    3.0                   0.530484                  0.572743   
3    4.0                   0.519324                  0.662781   
4    5.0                   0.500903                  0.674274   
5    6.0                   0.637301                  0.758191   
6    7.0                   0.729738                  0.725994   
7    8.0                   0.556769                  0.783185   
8    9.0                   0.615593                  0.673429   
9   10.0                   0.735528                  0.831416   
10  11.0                   0.580259                  0.660423   
11  12.0                   0.676546                  0.780141   
12  13.0                   0.650026                  0.787926   

    similarity_whisper_transcription_puppy  \
0                                 0.570512 

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


     UID  similarity_whisper_google  similarity_whisper_otter  \
0    1.0                   0.545512                  0.622999   
1    2.0                   0.648007                  0.796144   
2    3.0                   0.530484                  0.572743   
3    4.0                   0.519324                  0.662781   
4    5.0                   0.500903                  0.674274   
5    6.0                   0.637301                  0.758191   
6    7.0                   0.729738                  0.725994   
7    8.0                   0.556769                  0.783185   
8    9.0                   0.615593                  0.673429   
9   10.0                   0.735528                  0.831416   
10  11.0                   0.580259                  0.660423   
11  12.0                   0.676546                  0.780141   
12  13.0                   0.650026                  0.787926   
13  14.0                   0.557976                  0.632433   

    similarity_whisper_t

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


     UID  similarity_whisper_google  similarity_whisper_otter  \
0    1.0                   0.545512                  0.622999   
1    2.0                   0.648007                  0.796144   
2    3.0                   0.530484                  0.572743   
3    4.0                   0.519324                  0.662781   
4    5.0                   0.500903                  0.674274   
5    6.0                   0.637301                  0.758191   
6    7.0                   0.729738                  0.725994   
7    8.0                   0.556769                  0.783185   
8    9.0                   0.615593                  0.673429   
9   10.0                   0.735528                  0.831416   
10  11.0                   0.580259                  0.660423   
11  12.0                   0.676546                  0.780141   
12  13.0                   0.650026                  0.787926   
13  14.0                   0.557976                  0.632433   
14  15.0                 

  similarity_df = similarity_df.append(similarity_dict, ignore_index=True)


Similarity index sheet 1


In [20]:
import pandas as pd
import math
import torch
from sentence_transformers import SentenceTransformer, util

# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the CSV file
df = pd.read_excel('all_text-2.xlsx')
# print(df)
# Define a function to compute embeddings for a given text
def get_embedding(text):
    # Compute the embedding using the model
    embedding = model.encode(text)
    return embedding

# Define a function to calculate similarity between two embeddings
def get_similarity(embedding1, embedding2):
    # Convert embeddings to PyTorch tensors
    embedding1 = torch.from_numpy(embedding1)
    embedding2 = torch.from_numpy(embedding2)
    # Calculate the cosine similarity between the two embeddings
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_similarity.item()

# Compute the embeddings for each row and column separately
results = []

# for index, row in df.iterrows():
#   print(index, row['path'], row['Whisper_text'])

df = df.fillna('')

for index, row in df.iterrows():
    print("index --- ", index)
    path = row['path']
    whisper_text = row['Whisper_text']
    google_text = row['Google_text']
    otter_text = row['Otter_text']
    # transcription_puppy = row['Transcription_puppy']

    whisper_embedding = get_embedding(whisper_text)
    # google_text = "" if math.isnan(google_text) else google_text
    # print("sahilllll. ", google_text)
    google_embedding = get_embedding(google_text)

    otter_embedding = get_embedding(otter_text)
    # transcription_embedding = get_embedding(transcription_puppy)

    # Calculate similarity between Whisper_text and Google_text
    similarity_whisper_google = get_similarity(whisper_embedding, google_embedding)
    # Calculate similarity between Whisper_text and Otter_text
    similarity_whisper_otter = get_similarity(whisper_embedding, otter_embedding)
    # Calculate similarity between Whisper_text and Transcription_puppy
    # similarity_whisper_transcription_puppy = get_similarity(whisper_embedding, transcription_embedding)

    # Calculate similarity between google_embedding and Transcription_puppy
    # similarity_google_transcription_puppy = get_similarity(google_embedding, transcription_embedding)
    # Calculate similarity between google_embedding and otter
    similarity_google_otter = get_similarity(google_embedding, otter_embedding)

    # # Calculate similarity between otter and Transcription_puppy
    # similarity_otter_transcription_puppy = get_similarity(otter_embedding, transcription_embedding)

    result = {'path': path,
              'similarity_whisper_google': similarity_whisper_google,
              'similarity_whisper_otter': similarity_whisper_otter,
              # 'similarity_whisper_transcription_puppy': similarity_whisper_transcription_puppy,
              # 'similarity_google_transcription_puppy': similarity_google_transcription_puppy,
              'similarity_google_otter': similarity_google_otter,
              # 'similarity_otter_transcription_puppy': similarity_otter_transcription_puppy
              }

    results.append(result)

# Save results to a CSV file
df_results = pd.DataFrame(results)
df_results.to_csv('similarity_results.csv', index=False)


index ---  0
index ---  1
index ---  2
index ---  3
index ---  4
index ---  5
index ---  6
index ---  7
index ---  8
index ---  9
index ---  10
index ---  11
index ---  12
index ---  13
index ---  14
index ---  15
index ---  16
index ---  17
index ---  18
index ---  19
index ---  20
index ---  21
index ---  22
index ---  23
index ---  24
index ---  25
index ---  26
index ---  27
index ---  28
index ---  29
index ---  30
index ---  31
index ---  32
index ---  33
index ---  34
index ---  35
index ---  36
index ---  37
index ---  38
index ---  39
index ---  40
index ---  41
index ---  42
index ---  43
index ---  44
index ---  45
index ---  46
index ---  47
index ---  48
index ---  49
index ---  50
index ---  51
index ---  52
index ---  53
index ---  54
index ---  55
index ---  56
index ---  57
index ---  58
index ---  59
index ---  60
index ---  61
index ---  62
index ---  63
index ---  64
index ---  65
index ---  66
index ---  67
index ---  68
index ---  69
index ---  70
index ---  71
in

Similarity index sheet 2 : in comaprion with transcription_puppy


In [22]:
import pandas as pd
import math
import torch
from sentence_transformers import SentenceTransformer, util

# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load the CSV file
df = pd.read_excel('all-text-sheet2.xlsx')
# print(df)
# Define a function to compute embeddings for a given text
def get_embedding(text):
    # Compute the embedding using the model
    embedding = model.encode(text)
    return embedding

# Define a function to calculate similarity between two embeddings
def get_similarity(embedding1, embedding2):
    # Convert embeddings to PyTorch tensors
    embedding1 = torch.from_numpy(embedding1)
    embedding2 = torch.from_numpy(embedding2)
    # Calculate the cosine similarity between the two embeddings
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_similarity.item()

# Compute the embeddings for each row and column separately
results = []

# for index, row in df.iterrows():
#   print(index, row['path'], row['Whisper_text'])

df = df.fillna('')

for index, row in df.iterrows():
    print("index --- ", index)
    path = row['path']
    whisper_text = row['Whisper_text']
    google_text = row['Google_text']
    otter_text = row['Otter_text']
    transcription_puppy = row['Transcription_puppy']

    whisper_embedding = get_embedding(whisper_text)
    google_embedding = get_embedding(google_text)

    otter_embedding = get_embedding(otter_text)
    transcription_embedding = get_embedding(transcription_puppy)

    # Calculate similarity between Whisper_text and Google_text
    # similarity_whisper_google = get_similarity(whisper_embedding, google_embedding)
    # Calculate similarity between Whisper_text and Otter_text
    # similarity_whisper_otter = get_similarity(whisper_embedding, otter_embedding)
    # Calculate similarity between google_embedding and otter
    # similarity_google_otter = get_similarity(google_embedding, otter_embedding)



    # Calculate similarity between Whisper_text and Transcription_puppy
    similarity_whisper_transcription_puppy = get_similarity(whisper_embedding, transcription_embedding)
    # Calculate similarity between google_embedding and Transcription_puppy
    similarity_google_transcription_puppy = get_similarity(google_embedding, transcription_embedding)
    # # Calculate similarity between otter and Transcription_puppy
    similarity_otter_transcription_puppy = get_similarity(otter_embedding, transcription_embedding)

    result = {'path': path,
              # 'similarity_whisper_google': similarity_whisper_google,
              # 'similarity_whisper_otter': similarity_whisper_otter,
              'similarity_whisper_transcription_puppy': similarity_whisper_transcription_puppy,
              'similarity_google_transcription_puppy': similarity_google_transcription_puppy,
              'similarity_google_otter': similarity_google_otter,
              'similarity_otter_transcription_puppy': similarity_otter_transcription_puppy
              }

    results.append(result)

# Save results to a CSV file
df_results = pd.DataFrame(results)
df_results.to_csv('similarity_results_sheet2.csv', index=False)


index ---  0
index ---  1
index ---  2
index ---  3
index ---  4
index ---  5
index ---  6
index ---  7
index ---  8
index ---  9
index ---  10
index ---  11
index ---  12
index ---  13
index ---  14
index ---  15
