In [15]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Checkpoint shared by the tokenizer and the model
MODEL_NAME = 'hkunlp/instructor-large'

# Load the tokenizer and the model from the same checkpoint.
# NOTE(review): the warning printed when this cell runs lists the decoder
# weights as newly initialized, so presumably only the encoder carries
# pretrained weights here -- confirm before relying on decoder outputs.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at hkunlp/instructor-large and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.k.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.0.SelfAttenti

In [23]:
# Hotword phrases that each transcription is compared against
hotwords = [
    "BE CAREFUL",
    "DESTROY",
    "STRANGER",
]

# Read the CSV file into a DataFrame
df = pd.read_csv('cv-valid-dev.csv')

# Function to calculate the cosine similarity between two embeddings
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embeddings as a scalar.

    Parameters
    ----------
    embedding1, embedding2 : numpy.ndarray or tensor-like
        Either a NumPy array or an object exposing ``detach()`` (such as a
        ``torch.Tensor``, on any device). A 1-D input is treated as a single
        row vector.

    Returns
    -------
    float
        Element ``[0][0]`` of the pairwise cosine-similarity matrix, i.e. the
        similarity between the first row of each input.
    """
    def _as_2d_array(embedding):
        # Convert a tensor-like to a 2-D NumPy array; arrays pass through.
        if not isinstance(embedding, np.ndarray):
            # .cpu() is needed before .numpy() when the tensor lives on an
            # accelerator; it is a no-op for tensors already on the CPU.
            embedding = embedding.detach().cpu().numpy()
        # cosine_similarity expects 2-D input, so promote a flat vector to
        # a single-row matrix.
        if embedding.ndim == 1:
            embedding = embedding.reshape(1, -1)
        return embedding

    return cosine_similarity(_as_2d_array(embedding1), _as_2d_array(embedding2))[0][0]

def calculate_embeddings(text):
    """Embed ``text`` as the mean of the encoder's last hidden states.

    Parameters
    ----------
    text : str
        Text to tokenize with the module-level ``tokenizer``.

    Returns
    -------
    torch.Tensor
        The encoder's last hidden state averaged over ``dim=1``. The tensor
        is computed without gradient tracking.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Only the encoder output is consumed, so run the encoder on its own
    # instead of the full seq2seq forward pass; this avoids tokenizing the
    # text a second time to feed a decoder whose output is never read.
    # no_grad() skips building an autograd graph for pure inference.
    with torch.no_grad():
        encoder_outputs = model.get_encoder()(**inputs)
    return encoder_outputs.last_hidden_state.mean(dim=1)

In [26]:
# Minimum cosine similarity to any hotword for a transcription to be flagged
SIMILARITY_THRESHOLD = 0.5

# Calculate the embeddings for the hotwords once, outside the row loop
hotword_embeddings = [calculate_embeddings(hotword) for hotword in hotwords]

# Index labels of the rows whose text is similar to at least one hotword
similar_indices = []

for index, row in df.iterrows():
    text = row['generated_text_cv']

    # Skip rows where the generated_text_cv column is empty
    if pd.isna(text):
        continue

    print(f"Processing file: {row['filename']}")
    # Calculate the embeddings for the text
    text_embeddings = calculate_embeddings(text)

    # Calculate the similarity between the text embeddings and each hotword embedding
    similarities = [
        calculate_similarity(text_embeddings, hotword_embedding)
        for hotword_embedding in hotword_embeddings
    ]

    # Flag the row when any hotword similarity exceeds the threshold
    # (any() also copes with an empty hotword list, where max() would raise)
    if any(similarity > SIMILARITY_THRESHOLD for similarity in similarities):
        similar_indices.append(index)

# Select the flagged rows straight from df: this keeps the original column
# dtypes and index labels, and yields an empty frame (rather than raising)
# when no row was flagged.
df_similar = df.loc[similar_indices].copy()

Processing file: cv-valid-dev/sample-000000.mp3
Processing file: cv-valid-dev/sample-000001.mp3
Processing file: cv-valid-dev/sample-000002.mp3
Processing file: cv-valid-dev/sample-000003.mp3
Processing file: cv-valid-dev/sample-000004.mp3
Processing file: cv-valid-dev/sample-000005.mp3
Processing file: cv-valid-dev/sample-000006.mp3
Processing file: cv-valid-dev/sample-000007.mp3
Processing file: cv-valid-dev/sample-000008.mp3
Processing file: cv-valid-dev/sample-000009.mp3
Processing file: cv-valid-dev/sample-000010.mp3
Processing file: cv-valid-dev/sample-000011.mp3
Processing file: cv-valid-dev/sample-000012.mp3
Processing file: cv-valid-dev/sample-000013.mp3
Processing file: cv-valid-dev/sample-000014.mp3
Processing file: cv-valid-dev/sample-000015.mp3
Processing file: cv-valid-dev/sample-000016.mp3
Processing file: cv-valid-dev/sample-000017.mp3
Processing file: cv-valid-dev/sample-000018.mp3
Processing file: cv-valid-dev/sample-000019.mp3
Processing file: cv-valid-dev/sample-000

In [37]:
# Display the rows that were flagged as similar to a hotword
df_similar

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,generated_text,generated_text_cv,generated_text_cv_wer,generated_text_wer
0,cv-valid-dev/sample-000000.mp3,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,1,0,,,,5.068125,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,BE CAREFUL WHET YOUR PROGNOSTICATIONS SAID THE...,0.125,0.0
3,cv-valid-dev/sample-000003.mp3,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...,3,0,,,,4.468125,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED,0.111111,0.111111
39,cv-valid-dev/sample-000039.mp3,HE MUST BE DISGUISED TO AVOID ENCOUNTERS WITH ...,1,0,,,,11.548125,YOU MUST BE DISGUISED TO AVOID IN CONTAST WITH...,YOU MUST BE DISGUISED TO AVOID IN CONTEST WITH...,0.444444,0.444444
89,cv-valid-dev/sample-000089.mp3,THE STRANGER SEEMED SATISFIED WITH THE ANSWER,3,0,,,,3.028125,THE STRANGER SEEMED SATISFIED IT THE ANSWER,THE STRANGER SEEMED SATISFIED WITH THE ANSWER,0.0,0.142857
216,cv-valid-dev/sample-000216.mp3,THROUGHOUT THE ENTIRE DAY WE WILL BE ON THE LO...,1,0,twenties,male,indian,5.140125,THROUGHOUT THE ENTIRE DAY WE'LL BE ON THE LOOK...,THROUGHOUT THE ENTIRE DAY WE'LL BE ON THE LOOK...,0.307692,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...
3808,cv-valid-dev/sample-003808.mp3,I HAD TO TEST YOUR COURAGE THE STRANGER SAID,1,0,twenties,female,england,3.148125,I HAD TO TEST YOUR COURAGE THE STRANGER SAID,I HAD TO TEST YOUR COURAGE THE STRANGER SAID,0.0,0.0
3828,cv-valid-dev/sample-003828.mp3,SOMEBODY'S GOING TO GET HURT,1,0,thirties,female,england,2.308125,SOMEBODY'S GOING TO GET HURT,SOMEBODY'S GOING TO GET HURT,0.0,0.0
3832,cv-valid-dev/sample-003832.mp3,SOMETHING HAPPENS HERE YOU'LL HAVE TO REMEMBER...,1,0,twenties,male,us,3.388125,SOMETHING HAPPENS HERE YOU'LL HAVE TO REMEMBER...,SOMETHING HAPPENS HERE YOU'LL HAVE TO REMEMBER...,0.0,0.0
3907,cv-valid-dev/sample-003907.mp3,HE MUST BE DISGUISED TO AVOID ENCOUNTERS WITH ...,3,0,thirties,female,canada,4.676125,HE MUST BE DISGUISED TO AVOID ENCOUNTERS WITH ...,HE MUST BE DISGUISED TO AVOID ENCOUNTERS WITH ...,0.0,0.0


In [34]:
df = pd.read_csv('cv-valid-dev.csv')

# function to check if a row exists in df_similar and records True/False
def check_similarity(row):
    """Return True when the row's index label (``row.name``) is in ``df_similar.index``."""
    return row.name in df_similar.index

# Vectorised equivalent of applying check_similarity to every row: one
# membership test over the whole index instead of a Python call per row.
# It also works when df has no rows.
df['similarity'] = df.index.isin(df_similar.index)

In [43]:
# Preview the first rows, including the similarity column
df.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,generated_text,generated_text_cv,generated_text_cv_wer,generated_text_wer,similarity
0,cv-valid-dev/sample-000000.mp3,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,1,0,,,,5.068125,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,BE CAREFUL WHET YOUR PROGNOSTICATIONS SAID THE...,0.125,0.0,True
1,cv-valid-dev/sample-000001.mp3,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...,2,0,,,,3.596125,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...,THEN WHY SHOULD THEY BE SURPRISED WITH MHE SEBON,0.4,0.0,False
2,cv-valid-dev/sample-000002.mp3,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...,2,0,,,,5.980125,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...,A YOUNG ARAB ALSO LOADED DOWN WITH PACKAGE ENT...,0.076923,0.0,False
3,cv-valid-dev/sample-000003.mp3,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...,3,0,,,,4.468125,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED,I FELT THAT EVERYTHING I OWNED WOULD BE DESTROYED,0.111111,0.111111,True
4,cv-valid-dev/sample-000004.mp3,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...,1,0,fourties,female,england,4.540125,HE MOVED ABOUT INVISIBLE BUT EVERY ONE COULD H...,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...,0.0,0.222222,False


In [44]:
# Write the frame back to the same CSV path it was read from (the input file
# is overwritten); index=False leaves the row index out of the file.
df.to_csv('cv-valid-dev.csv', index=False)