In [1]:
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import torch
from annoy import AnnoyIndex

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [80]:
def pre_process(value: str):
    encoded_input = tokenizer(value, return_tensors='pt')
    output = model(**encoded_input)
    return output.last_hidden_state.squeeze(0)[-1].detach().numpy().reshape(1, -1)

In [149]:
df_1 = pd.read_csv('../data/sample_data/top_10_artists_songs.csv')
song_col = 'song_name'

df_2 = pd.read_csv('../data/emoji.csv')
emo_col = 'emoji'

In [171]:
vectorized_lyrics = np.load('../data/sample_data/lyrics_vectorized.npy')
embedding_dimension = model.config.hidden_size
annoy_index_1 = AnnoyIndex(embedding_dimension, 'euclidean')
for i, vector in enumerate(vectorized_lyrics):
    annoy_index_1.add_item(i, vector)
annoy_index_1.build(10)

def find_nearest_song_annoy(emojis, df, n=1):
    idx = annoy_index_1.get_nns_by_vector(pre_process(emojis).reshape(-1,1), n)
    return list(df[song_col].iloc[idx])

In [172]:
find_nearest_song_annoy('If I were a Boy', df_1)

  idx = annoy_index_1.get_nns_by_vector(pre_process(emojis).reshape(-1,1), n)


['If I Were A Boy']

In [173]:
vectorized_name = np.load('../data/emoji_name_vectorized.npy')
embedding_dimension = model.config.hidden_size
annoy_index_2 = AnnoyIndex(embedding_dimension, 'euclidean')
for i, vector in enumerate(vectorized_name):
    annoy_index_2.add_item(i, vector)
annoy_index_2.build(10)

def clean_text(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    words = [token.text for token in doc if token.pos_ not in ['ADP', 'CCONJ', 'DET', 'PUNCT']]
    text = ' '.join(words)
    return text

def find_nearest_emoji_annoy(in_text, df, n=1):
    idx = annoy_index_2.get_nns_by_vector(pre_process(in_text).reshape(-1,1), n)
    return list(df[emo_col].iloc[idx])

def translate_text(text, k=1):
    translated = ""
    for word in clean_text(text).split():
        closest = find_nearest_emoji_annoy(word, df_2, n=k)
        translated += closest[-1] if isinstance(closest, list) else closest
    return translated


In [182]:
translate_text('If I were a Boy')

  idx = annoy_index_2.get_nns_by_vector(pre_process(in_text).reshape(-1,1), n)


'🅰️🆔🅰️👦'

In [183]:
res = []
for i in range(100):
    cosine_similarity(a, b)


print(np.mean(res))

  idx = annoy_index_2.get_nns_by_vector(pre_process(in_text).reshape(-1,1), n)


'🧑\u200d⚖️➕🧑\u200d⚖️🧑\u200d⚖️➕🧑\u200d⚖️'

In [181]:
test_text = df_1['lyrics'][2][:30]
print(test_text)
print(translate_text(test_text))


If I were a Boy
Even Just for
🅰️🆔🅰️👦🇮🇹🇮🇹


  idx = annoy_index_2.get_nns_by_vector(pre_process(in_text).reshape(-1,1), n)


In [189]:
n = 2
res = []
for i in range(n):
    text = df_1['lyrics'][i][:30]
    vec_from_song = pre_process(text)
    vec_from_emoji = pre_process(translate_text(text))
    
    res.append(cosine_similarity(vec_from_song, vec_from_emoji))

print(np.mean(res))


  idx = annoy_index_2.get_nns_by_vector(pre_process(in_text).reshape(-1,1), n)


Remember those walls I built

🔙🪟🆔➕


  idx = annoy_index_2.get_nns_by_vector(pre_process(in_text).reshape(-1,1), n)


Driver roll up the partition p
🏎️😪📝✌️
0.96724147


In [133]:
find_nearest_song_annoy('ca', df_1)

  idx = annoy_index.get_nns_by_vector(pre_process(emojis).reshape(-1,1), n)


['the 1']

In [130]:
df_1['lyrics'][0][:30]

'Remember those walls I built\r\n'