In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import nltk

In [2]:
import sentence_transformers
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


### 1. SBERT Embeddings

In [10]:
# Load the open-responses CSV into `df`; later cells read its `asr` column.
df = pd.read_csv("../1.Database/dataset_out_of_place_open_responses.csv")

In [11]:
def initialize_model(model_name="paraphrase-multilingual-mpnet-base-v2"):
    """Build the sentence-embedding model.

    Args:
        model_name: Identifier handed to ``SentenceTransformer``. Defaults to
            "paraphrase-multilingual-mpnet-base-v2".

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)

def compute_embeddings(sentence_transformer_model, training: str) -> np.ndarray:
    """Encode text with the given model and return the result as a NumPy array.

    Args:
        sentence_transformer_model: Object exposing
            ``encode(text, convert_to_tensor=True)`` that returns a torch tensor.
        training: Text to encode; passed to ``encode`` unchanged.

    Returns:
        The encoded tensor converted to a ``numpy.ndarray``.
    """
    embeddings = sentence_transformer_model.encode(training, convert_to_tensor=True)
    # Tensor.numpy() only works on CPU tensors that do not track gradients, so
    # detach and move to host memory first (a no-op when already on CPU).
    return embeddings.detach().cpu().numpy()

#### 1.1 Preprocess text

In [None]:
from clean_asr_service import CleanASRService

cleanASRservice = CleanASRService()

# Clean the raw ASR text while keeping stopwords (original author's note:
# normalize text, delete duplicates, etc.).
df["cleaned_asr"] = df.asr.apply(
    lambda text: cleanASRservice.execute(text, delete_stopwords=False)
)

# Same cleaning, but additionally delete stopwords.
try:
    spanish_stopwords = nltk.corpus.stopwords.words('spanish')
except LookupError:
    # The stopwords corpus is not installed (e.g. fresh environment):
    # fetch it once, then retry.
    nltk.download("stopwords")
    spanish_stopwords = nltk.corpus.stopwords.words('spanish')

# "sí" and "no" are not treated as stopwords, so they are left out of the list.
# Filtering (instead of list.remove) does not raise if either word is missing
# from the installed corpus.
words_to_keep = {"sí", "no"}
stopword_es = [word for word in spanish_stopwords if word not in words_to_keep]

df["cleaned_asr_without_stopw"] = df.asr.apply(
    lambda text: cleanASRservice.execute(
        text, stopwords_list=stopword_es, delete_stopwords=True
    )
)

#### 1.2 Compute SBERT embeddings

In [13]:
# Compute embeddings
sentence_transformer_model = initialize_model()

# Every text column gets an "embedding_<column>" companion column.
for text_column in ("asr", "cleaned_asr", "cleaned_asr_without_stopw"):
    # Encode the whole column in a single call so the model can batch the
    # texts, instead of running one forward pass per row.
    column_embeddings = compute_embeddings(
        sentence_transformer_model, df[text_column].tolist()
    )
    # Store one 1-D vector per row, aligned to the frame's index.
    df["embedding_" + text_column] = pd.Series(
        list(column_embeddings), index=df.index, dtype=object
    )

# Save encoding database
# NOTE(review): CSV stores each embedding array as its text representation, so
# reading this file back yields strings rather than arrays — consider a binary
# format (pickle/parquet) if the vectors need to be reloaded.
df.to_csv("SBERT_embeddings.csv")