<a href="https://colab.research.google.com/github/barkhaaroraa/nlp_or_ai-agents/blob/main/emoji_attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install -q sentence-transformers


In [25]:

from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd


In [26]:
model = SentenceTransformer('all-MiniLM-L6-v2')


In [27]:
sentences = [
    "I love pizza",
    " I  love  pizza ",
    "I lo ve pizza",
    "I love piz za",
    "I love p izza",
    "I love pi zza",
    "I love pizza 🍕",
    "I love pizza 🍕🍕",
    "🍕I 🍕love 🍕pizza ",
    "I love pizza 😍",
    "I love pizza 😢",
    "I love pizza 😂",
    "I love piz🍕za ",
    "I love pi🍕zza 🍕",
    "I love p🍕izza 🍕"
]


In [28]:
embeddings = model.encode(sentences, convert_to_tensor=True)


In [29]:
emb_np = embeddings.cpu().numpy()
emb_np


array([[-0.09892612,  0.03360058,  0.01034015, ...,  0.07289542,
         0.06621676, -0.10262094],
       [-0.09892612,  0.03360058,  0.01034015, ...,  0.07289542,
         0.06621676, -0.10262094],
       [-0.102662  , -0.00889306,  0.00115738, ...,  0.01807693,
        -0.02456306, -0.15943387],
       ...,
       [-0.07859205, -0.0214042 ,  0.08077011, ...,  0.10299173,
         0.00272973, -0.01266244],
       [-0.04292594,  0.01288689,  0.1009925 , ...,  0.08996274,
        -0.00515315, -0.03269631],
       [-0.04292594,  0.01288689,  0.1009925 , ...,  0.08996274,
        -0.00515315, -0.03269631]], dtype=float32)

In [30]:
cosine_sim = util.cos_sim(embeddings, embeddings)


In [31]:
df = pd.DataFrame(cosine_sim.cpu().numpy(),
                  index=sentences,
                  columns=sentences)
display(df)


Unnamed: 0,I love pizza,I love pizza.1,I lo ve pizza,I love piz za,I love p izza,I love pi zza,I love pizza 🍕,I love pizza 🍕🍕,🍕I 🍕love 🍕pizza,I love pizza 😍,I love pizza 😢,I love pizza 😂,I love piz🍕za,I love pi🍕zza 🍕,I love p🍕izza 🍕
I love pizza,1.0,1.0,0.707189,0.354682,0.464234,0.431655,0.924648,0.924648,0.228005,0.924648,0.924648,0.924648,0.437873,0.42164,0.42164
I love pizza,1.0,1.0,0.707189,0.354682,0.464234,0.431655,0.924648,0.924648,0.228005,0.924648,0.924648,0.924648,0.437873,0.42164,0.42164
I lo ve pizza,0.707189,0.707189,1.0,0.317045,0.369016,0.346529,0.683882,0.683882,0.20133,0.683882,0.683882,0.683882,0.265368,0.246351,0.246351
I love piz za,0.354682,0.354682,0.317045,1.0,0.514354,0.750778,0.332603,0.332603,0.126032,0.332603,0.332603,0.332603,0.272024,0.259408,0.259408
I love p izza,0.464234,0.464234,0.369016,0.514354,1.0,0.479577,0.432648,0.432648,0.233733,0.432648,0.432648,0.432648,0.372566,0.372599,0.372599
I love pi zza,0.431655,0.431655,0.346529,0.750778,0.479577,1.0,0.402838,0.402838,0.204526,0.402838,0.402838,0.402838,0.269405,0.276293,0.276293
I love pizza 🍕,0.924648,0.924648,0.683882,0.332603,0.432648,0.402838,1.0,1.0,0.317919,1.0,1.0,1.0,0.557799,0.561558,0.561558
I love pizza 🍕🍕,0.924648,0.924648,0.683882,0.332603,0.432648,0.402838,1.0,1.0,0.317919,1.0,1.0,1.0,0.557799,0.561558,0.561558
🍕I 🍕love 🍕pizza,0.228005,0.228005,0.20133,0.126032,0.233733,0.204526,0.317919,0.317919,1.0,0.317919,0.317919,0.317919,0.446849,0.590415,0.590415
I love pizza 😍,0.924648,0.924648,0.683882,0.332603,0.432648,0.402838,1.0,1.0,0.317919,1.0,1.0,1.0,0.557799,0.561558,0.561558


In [32]:
base_vector = emb_np[0]  # "I love pizza"
shifts = [np.linalg.norm(vec - base_vector) for vec in emb_np]


In [33]:
shift_df = pd.DataFrame({"Sentence": sentences, "Vector Shift (L2 Distance)": shifts})
print("\nEmbedding Shifts Compared to Base Sentence:")
display(shift_df)



Embedding Shifts Compared to Base Sentence:


Unnamed: 0,Sentence,Vector Shift (L2 Distance)
0,I love pizza,0.0
1,I love pizza,0.0
2,I lo ve pizza,0.76526
3,I love piz za,1.136061
4,I love p izza,1.035149
5,I love pi zza,1.066157
6,I love pizza 🍕,0.388205
7,I love pizza 🍕🍕,0.388205
8,🍕I 🍕love 🍕pizza,1.242574
9,I love pizza 😍,0.388205


In [35]:
# Install TextBlob
!pip install -q textblob
!python -m textblob.download_corpora
##does not work
import re
from textblob import TextBlob

def normalize_text(text):
    # Remove emojis and unusual characters
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)

    # Correct fragmented words
    corrected_words = []
    for word in text.split():
        corrected = str(TextBlob(word).correct())
        corrected_words.append(corrected)

    return " ".join(corrected_words)

example = "I love pi zza 🍕"
print(normalize_text(example))  # -> "I love pizza"


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.
I love i a
