**Laser Embedding**

In [110]:
#!pip install torch==2.2.1



In [111]:
!pip install -q laserembeddings==1.1.2

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 1.13.1 which is incompatible.
torchdata 0.7.1 requires torch>=2, but you have torch 1.13.1 which is incompatible.
torchtext 0.17.1 requires torch==2.2.1, but you have torch 1.13.1 which is incompatible.
torchvision 0.17.1+cu121 requires torch==2.2.1, but you have torch 1.13.1 which is incompatible.[0m[31m
[0m

In [112]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.10/dist-packages/laserembeddings/data

⏳   Downloading https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes...✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [113]:
import pandas as pd
import numpy as np

In [114]:
dataframe = pd.read_csv('preprocessed_messages.csv')

In [115]:
texts = dataframe['text']
tokenized = dataframe['stemmed_sentence_Porter']

In [116]:
langs = ["en" for i in range(len(texts))]

In [117]:
%%time

from laserembeddings import Laser

laser = Laser()
embeddings_text = laser.embed_sentences(
    texts,
    lang=langs)

embeddings_tokenized = laser.embed_sentences(
    tokenized,
    lang=langs)

CPU times: user 5.6 s, sys: 472 ms, total: 6.07 s
Wall time: 6.96 s


In [118]:
print('Text embedding shape: ', embeddings_text.shape)
print('Tokenized embedding shape: ', embeddings_tokenized.shape)

Text embedding shape:  (50, 1024)
Tokenized embedding shape:  (50, 1024)


In [119]:
print('text:', embeddings_text[0], texts[0])
print('embeddings:', embeddings_tokenized[0], texts[0], tokenized[0])

text: [ 0.0086212  -0.0001213   0.04254882 ...  0.01014943  0.00585767
  0.0261275 ] We can do it online so that everyone could join
embeddings: [1.4702949e-02 5.2515816e-06 1.8564619e-02 ... 2.1697991e-02 2.4591812e-03
 2.8372105e-02] We can do it online so that everyone could join ['onlin', 'everyon', 'could', 'join']


In [120]:
reference = 'let s do the meeting'
embedding_reference = laser.embed_sentences(
    reference,
    lang='en')
print('text:', embedding_reference)

text: [[ 0.01682114  0.01596329  0.00836292 ...  0.00719668  0.01053112
  -0.00772411]]


In [121]:
from sklearn.metrics.pairwise import cosine_similarity
similarities_text = cosine_similarity(embedding_reference, embeddings_text)
most_similar_text = np.argmax(similarities_text)

In [122]:
most_similar_indices_text = np.argsort(similarities_text[0])[:-11:-1]
print(texts[most_similar_indices_text])

45                                       I vote the 4th
1                             I'm ok if we do it online
22                                 I manage to add them
23                                        I can try now
49         We could use our names/surnames concatenated
0       We can do it online so that everyone could join
8     I coudn't do it directly when saving the messa...
27    I pushed the new version of test (just with th...
17    It's probably something with the working direc...
21    wow,\r\ngreat\r\nCan you send the csv, I just ...
Name: text, dtype: object


In [123]:
similarities_tokenized = cosine_similarity(embedding_reference, embeddings_tokenized)
most_similar_tokenized = np.argmax(similarities_tokenized)

In [124]:
most_similar_indices_token = np.argsort(similarities_tokenized[0])[:-11:-1]
print(texts[most_similar_indices_token], tokenized[most_similar_indices_token])

45                                  I vote the 4th
49    We could use our names/surnames concatenated
22                            I manage to add them
23                                   I can try now
16                                         Perfect
29                                             Yes
9                                         perfetto
38                               I think it’s this
28                                           great
25                                   ahh, too late
Name: text, dtype: object 45                                 ['vote', '4th']
49    ['could', 'use', 'names/surnam', 'concaten']
22                                ['manag', 'add']
23                                         ['tri']
16                                     ['perfect']
29                                          ['ye']
9                                     ['perfetto']
38                                  ['think', '’']
28                                       ['great']
25   

In [125]:
print('Most similar index:\ntext: ', most_similar_text, '\ntokenized: ', most_similar_tokenized)

Most similar index:
text:  45 
tokenized:  45


In [126]:
print('Most similar text:', texts[most_similar_text])
print('Most similar tokenized:', texts[most_similar_tokenized], tokenized[most_similar_tokenized])

Most similar text: I vote the 4th
Most similar tokenized: I vote the 4th ['vote', '4th']



------------------------------------

**Sentence Transformers**

In [127]:
#!pip install sentence_transformers



Check https://huggingface.co/sentence-transformers for lower dimensionality transformers

In [128]:
from sentence_transformers import SentenceTransformer

emodel = SentenceTransformer("average_word_embeddings_glove.6B.300d") #300

embedded_texts = emodel.encode(texts)
embedded_tokenized = emodel.encode(tokenized)
reference_embedding =  emodel.encode(reference)

In [129]:
print(reference_embedding.shape, embedded_texts.shape, embedded_tokenized.shape)

(300,) (50, 300) (50, 300)


In [130]:
similarities_text = cosine_similarity(reference_embedding.reshape(1, -1), embedded_texts)
most_similar_text = np.argmax(similarities_text)

In [131]:
most_similar_indices = np.argsort(similarities_text[0])[:-11:-1]
print(texts[most_similar_indices])

36    I asked them where they want to meet, I'll let...
31                 Did you meet the polinetwork’s guys?
30    Yeah,\r\nIt's probably ok. He was very support...
6     Do you want to meet/call sometime soon or we w...
11    If you react with different reactions to this ...
38                                    I think it’s this
37                        Is tomorrow at 14 ok for you?
23                                        I can try now
32    hmm\r\nare you sure? did you change the values...
47    It would be more convenient to set it related ...
Name: text, dtype: object


In [132]:
similarities_token = cosine_similarity(reference_embedding.reshape(1, -1), embedded_tokenized)
most_similar_token = np.argmax(similarities_token)

In [133]:
most_similar_indices_token = np.argsort(similarities_token[0])[:-11:-1]
print(texts[most_similar_indices_token], tokenized[most_similar_indices_token])

36    I asked them where they want to meet, I'll let...
6     Do you want to meet/call sometime soon or we w...
30    Yeah,\r\nIt's probably ok. He was very support...
31                 Did you meet the polinetwork’s guys?
33    Now it works just fine, I don't know why it di...
26    Also, if you want to commit the changes in the...
17    It's probably something with the working direc...
47    It would be more convenient to set it related ...
37                        Is tomorrow at 14 ok for you?
13    I guess the csv can be modified once transform...
Name: text, dtype: object 36    ['ask', 'want', 'meet', ',', "'ll", 'let', 'kn...
6     ['want', 'meet/cal', 'sometim', 'soon', 'work'...
30    ['yeah', ',', "'s", 'probabl', 'ok', '.', 'sup...
31             ['meet', 'polinetwork', '’', 'guy', '?']
33    ['work', 'fine', ',', "n't", 'know', "n't", 'w...
26    ['also', ',', 'want', 'commit', 'chang', 'repo...
17    ["'s", 'probabl', 'someth', 'work', 'directori...
47    ['would', 'conve

In [134]:
print('Most similar index.\nText:', most_similar_text, '\nTokenized:', most_similar_token)

Most similar index.
Text: 36 
Tokenized: 36


In [135]:
print('Most similar text.\nText:', texts[most_similar_text], '\nTokenized:', texts[most_similar_token], tokenized[most_similar_token])

Most similar text.
Text: I asked them where they want to meet, I'll let you know as soon as I receive an answer 
Tokenized: I asked them where they want to meet, I'll let you know as soon as I receive an answer ['ask', 'want', 'meet', ',', "'ll", 'let', 'know', 'soon', 'receiv', 'answer']


In [136]:
# other options
# https://huggingface.co/spaces/mteb/leaderboard

Apply PCA

*PCA on original text and reference*

In [137]:
from sklearn.decomposition import PCA

all_texts = reference + texts
all_embeddings = emodel.encode(all_texts)

# Apply PCA to the encoded embeddings
pca = PCA(n_components=50)
pca.fit(all_embeddings)
embedded_texts_pca = pca.transform(all_embeddings)

reference_embedding_pca = embedded_texts_pca[0]
embedded_texts_pca = embedded_texts_pca[1:]


In [138]:
similarities_text = cosine_similarity(reference_embedding_pca.reshape(1, -1), embedded_texts_pca)
most_similar_text = np.argmax(similarities_text)
most_similar_indices = np.argsort(similarities_text[0])[:-11:-1]
print(texts[most_similar_indices])

1                             I'm ok if we do it online
0       We can do it online so that everyone could join
43                                 amidierfan@gmail.com
5     Thursday, I'm all available except from 5:15 t...
34    @Martinavigano \r\nTell me if you could find t...
16                                              Perfect
9                                              perfetto
46    TelegramInfromationSpace\r\nAnamolyDetectionBo...
20    When I run directly from the terminal I get th...
25                                        ahh, too late
Name: text, dtype: object


*PCA on tokenized text and original reference (think about tokenizing the reference...)*

In [139]:
all_texts = reference + tokenized
all_embeddings = emodel.encode(all_texts)

# Apply PCA to the encoded embeddings
pca = PCA(n_components=50)
pca.fit(all_embeddings)
embedded_tokenized_pca = pca.transform(all_embeddings)

reference_embedding_pca = embedded_tokenized_pca[0]
embedded_tokenized_pca = embedded_tokenized_pca[1:]

In [140]:
similarities_tokenized = cosine_similarity(reference_embedding_pca.reshape(1, -1), embedded_tokenized_pca)
most_similar_tokenized = np.argmax(similarities_tokenized)
most_similar_indices = np.argsort(similarities_tokenized[0])[:-11:-1]
print(texts[most_similar_indices])

27    I pushed the new version of test (just with th...
42                       Martina2.vigano@mail.polimi.it
41                         lorenzo.mondo@mail.polimi.it
40    Do you remember the name of the technique prop...
28                                                great
15            Could you do it like reactions: {"🤞": 1}?
22                                 I manage to add them
45                                       I vote the 4th
8     I coudn't do it directly when saving the messa...
35                        Room 2.1.3, it should be free
Name: text, dtype: object


-------------------------------

**Bert**

In [141]:
from transformers import DistilBertModel, DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')