# Uso de modelos de embeddings de Hugging Face: Sentence Transformers



## Instalación y carga de librerías

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=c9d17dbd12d8c084817a96bd18418d539d15be1b6ce5984db8fee6172d87428d
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util


## Cargar el dataset: el archivo CSV con el que se va a trabajar


In [None]:
# Load the CSV file this notebook works with into a DataFrame.
df_avatar = pd.read_csv(filepath_or_buffer='/content/atla-episodes-scripts.csv')

In [None]:
# Preview the first 10 rows of the loaded data.
df_avatar.head(n=10)

Unnamed: 0,Character,script,ep_number,Book,total_number
0,,"As the title card fades, the scene opens onto ...",1,1,1
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1
5,,"Behind Sokka, Katara is still making circular ...",1,1,1
6,Katara,[Struggling with the water that passes right i...,1,1,1
7,,The bubble containing her fish slowly drifts a...,1,1,1
8,Katara,[Exclaims indignantly.] Hey!,1,1,1
9,,"As Sokka lets out a gasp of discomfort, the fi...",1,1,1


## Usar Sentence Transformers como prueba (la codificación de ejemplo es opcional, pero esta celda carga `model`, que se usa más adelante)

In [None]:
# Load the pretrained sentence-embedding model; `model` is reused by later cells.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Encode two sample sentences as a quick check that the model works.
sentences = [
    'este es el primer ejemplo',
    'y este es el segundo ejemplo',
]
embeddings = model.encode(sentences=sentences)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Show the shape of the encoded sample sentences.
embeddings.shape

(2, 384)

## Aplicar los embeddings

Se convierten todos los diálogos de la columna `script` del CSV en vectores (embeddings) dentro de un espacio vectorial.

In [None]:
# Encode every entry of the `script` column in batches of 64, with a progress bar.
# The column is converted to a plain list first: `encode` is documented to take a
# string or a list of strings and indexes its input by position, whereas a pandas
# Series is indexed by label (which only matches position for a default index).
embeddings = model.encode(
    df_avatar['script'].tolist(),
    batch_size=64,
    show_progress_bar=True,
)

Batches:   0%|          | 0/209 [00:00<?, ?it/s]

## Guardar los embeddings en una nueva columna del dataset llamada `embeddings`

`embeddings`: el vector que representa cada diálogo en el espacio vectorial del modelo.

In [None]:
# Store each row's embedding as a plain Python list in the `embeddings` column.
df_avatar['embeddings'] = [vector.tolist() for vector in embeddings]

In [None]:
# Display the DataFrame to inspect the `embeddings` column alongside the original data.
df_avatar

Unnamed: 0,Character,script,ep_number,Book,total_number,embeddings
0,,"As the title card fades, the scene opens onto ...",1,1,1,"[0.018827201798558235, 0.026618875563144684, -..."
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1,"[0.05028131604194641, 0.022047238424420357, 0...."
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1,"[0.04786450415849686, 0.024271463975310326, -0..."
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1,"[-0.030522499233484268, 0.031033219769597054, ..."
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1,"[0.011988013982772827, 0.002811463549733162, 0..."
...,...,...,...,...,...,...
13364,Suki,And why did you paint me firebending?,21,3,61,"[-0.04141026735305786, 0.09808457642793655, 0...."
13365,Sokka,I thought it looked more exciting that way. [M...,21,3,61,"[0.025984492152929306, -0.01141583826392889, 0..."
13366,Iroh,"[Points at painting.] Hey, my belly's not that...",21,3,61,"[0.03740331530570984, 0.08375449478626251, 0.0..."
13367,Toph,Well I think you all look perfect! [They laugh.],21,3,61,"[0.021190252155065536, -0.01194122713059187, 0..."


## Búsqueda semántica vectorial

In [None]:
# Embed the search query with the same model used for the scripts.
query_embedding = model.encode(['Hi!'])

# Score the query against every stored embedding in a single `cos_sim` call
# instead of one call per row. With one query the result has a single row, so
# take it and convert it to plain floats: the `similarity` column then holds
# numbers rather than nested 1x1 tensors, which sort and display cleanly.
df_avatar['similarity'] = util.cos_sim(
    query_embedding,
    df_avatar['embeddings'].tolist(),
)[0].tolist()

In [None]:
# Preview the first 5 rows, now including the `similarity` column.
df_avatar.head(n=5)

Unnamed: 0,Character,script,ep_number,Book,total_number,embeddings,similarity
0,,"As the title card fades, the scene opens onto ...",1,1,1,"[0.018827201798558235, 0.026618875563144684, -...",[[tensor(0.0632)]]
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1,"[0.05028131604194641, 0.022047238424420357, 0....",[[tensor(0.2086)]]
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1,"[0.04786450415849686, 0.024271463975310326, -0...",[[tensor(0.1206)]]
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1,"[-0.030522499233484268, 0.031033219769597054, ...",[[tensor(0.3151)]]
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1,"[0.011988013982772827, 0.002811463549733162, 0...",[[tensor(0.2004)]]


In [None]:
# Show the 10 rows with the highest similarity to the query, best match first.
(
    df_avatar
    .sort_values(by='similarity', ascending=False)
    .head(n=10)
)

Unnamed: 0,Character,script,ep_number,Book,total_number,embeddings,similarity
2124,Katara,Hi.,10,1,10,"[-0.08202335238456726, 0.06577710062265396, 0....",[[tensor(0.8144)]]
2714,Aang,Hey!,12,1,12,"[-0.059960562735795975, 0.06668634712696075, 0...",[[tensor(0.7913)]]
7931,Huge round angry face,Hey!,15,2,35,"[-0.059960562735795975, 0.06668634712696075, 0...",[[tensor(0.7913)]]
7492,Iroh,Hey!,13,2,33,"[-0.059960562735795975, 0.06668634712696075, 0...",[[tensor(0.7913)]]
188,Aang,"Hi hi hi! Ha, ha, ha!",1,1,1,"[-0.08792741596698761, 0.05534249544143677, 0....",[[tensor(0.7374)]]
7188,Zuko,Hello.,12,2,32,"[-0.0770374983549118, 0.006127546541392803, 0....",[[tensor(0.7214)]]
5433,Aang,Hello? Who are you?,4,2,24,"[-0.07091774046421051, 0.05386243015527725, 0....",[[tensor(0.6982)]]
1229,Haru,"Hi, Mom.",6,1,6,"[-0.026744017377495766, 0.04878418147563934, 0...",[[tensor(0.6237)]]
98,Katara,Hey.,1,1,1,"[-0.040797147899866104, 0.04000037536025047, 0...",[[tensor(0.5962)]]
2123,Jet,Hey.,10,1,10,"[-0.040797147899866104, 0.04000037536025047, 0...",[[tensor(0.5962)]]
