In [1]:
import pandas as pd
import numpy as np
import sys
import logging
import emoji 

sys.path.append("src")
from song_search import find_nearest_song_annoy, get_sub_lyrics
from preprocess import clean_text

In [7]:
lyrics = pd.read_json('../data/sample_data/top_300_spotify_with_embeddings_translated.json')
emojis = lyrics['translated_lyrics'].values

def get_emoji_name(emoji_char):
    emoji_info = emoji.demojize(emoji_char)
    emoji_name = emoji_info.strip(':').replace('\u200d', '')
    if len(emoji_name) > 0:
        return emoji_name
    return ''

# get a song name by an emoji sequence
def avg_pos(data, n=10):
    matched = []
    for i in range(n):
        match = lyrics['song_name'].iloc[i]

        input_emoji = data[i]

        if not input_emoji:
            continue
        
        # emoji lib
        input_emoji_names = [get_emoji_name(emoji_char) for emoji_char in input_emoji]
        concatenated_names = '.'.join(input_emoji_names)

        guess = find_nearest_song_annoy(concatenated_names, 10000) + [match]
        print(guess.index(match))
        print(input_emoji)
        print(concatenated_names)
        print('-------')
        matched.append(guess.index(match))

    return np.mean(matched)

def test_accuracy_lyrics2emoji_search():
    accuracy = avg_pos(emojis) # n=len(emojis)
    logging.critical(f'mean searched pos: {accuracy}')



In [8]:
test_accuracy_lyrics2emoji_search()

141
👍
🤪🤯💩👈🌙
👈🔮🔝💭
👈🔥👋💨🌙
thumbs_up.
.zany_face.exploding_head.pile_of_poo.backhand_index_pointing_left.crescent_moon.
.backhand_index_pointing_left.crystal_ball.TOP_arrow.thought_balloon.
.backhand_index_pointing_left.fire.waving_hand.dashing_away.crescent_moon
-------
11
🌅⬆️🕰️🌞😌👍🅿️. Diddy
(
👋,❓⬆️,👧?
)
sunrise.up_arrow..mantelpiece_clock..sun_with_face.relieved_face.thumbs_up.P_button.... .D.i.d.d.y.
.(.
.waving_hand.,.red_question_mark.up_arrow..,.girl.?.
.)
-------
13
😮👶👶
😮👶👶
😮👶👶
❓🤷‍♀️🤔🔮🧠
face_with_open_mouth.baby.baby.
.face_with_open_mouth.baby.baby.
.face_with_open_mouth.baby.baby.
.red_question_mark.person_shrugging..female_sign..thinking_face.crystal_ball.brain
-------
215
😭👍🗣️🔇👂
😔🙈😳👥🔝💯
🚫👥😔👍0️⃣
🚫👥🤝💔😭
loudly_crying_face.thumbs_up.speaking_head..muted_speaker.ear.
.pensive_face.see-no-evil_monkey.flushed_face.busts_in_silhouette.TOP_arrow.hundred_points.
.prohibited.busts_in_silhouette.pensive_face.thumbs_up.0..⃣.
.prohibited.busts_in_silhouette.handshake.broken_heart.loudly_crying_

CRITICAL:root:mean searched pos: 75.85714285714286


2
💑👍, 💑🥇
🤔💭🚫💰
💑👍 '🕒💑🚫
🏠🔨👀🔥
couple_with_heart.thumbs_up.,. .couple_with_heart.1st_place_medal.
.thinking_face.thought_balloon.prohibited.money_bag.
.couple_with_heart.thumbs_up. .'.three_o’clock.couple_with_heart.prohibited.
.house.hammer.eyes.fire
-------
31
🎤-🎵
🤔-🤔-🤔-🤔-🤔-🤔
🗣️👥, 🗣️👥
microphone.-.musical_note.
.thinking_face.-.thinking_face.-.thinking_face.-.thinking_face.-.thinking_face.-.thinking_face.
.speaking_head..busts_in_silhouette.,. .speaking_head..busts_in_silhouette
-------
118
👋, 👩‍🦰👩‍🦳👩‍🦱👩‍🦲👇
👀👉👉👈👈🌍
🚫🙈👧🍌🍌
💃👯‍♀️👍💃‍♀️💃
waving_hand.,. .woman..red_hair.woman..white_hair.woman..curly_hair.woman..bald.backhand_index_pointing_down.
.eyes.backhand_index_pointing_right.backhand_index_pointing_right.backhand_index_pointing_left.backhand_index_pointing_left.globe_showing_Europe-Africa.
.prohibited.see-no-evil_monkey.girl.banana.banana.
.woman_dancing.people_with_bunny_ears..female_sign..thumbs_up.woman_dancing..female_sign..woman_dancing
-------


In [2]:
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_json('../data/sample_data/top_300_spotify_with_embeddings.json')
emojis_emb = df['translated_lyrics_embedding'].values

# get a song name by an emoji sequence
def avg_pos(data, n=10):
    matched = []
    for i in range(n):
        match = df['song_name'].iloc[i]

        input_emoji = data[i]

        if not input_emoji:
            continue

        guess = find_nearest_song_annoy(input_emoji, 10000)
        print('------------------------------')
        print("annoy position", guess.index(i))
        print("actual data:")
        print(df['translated_lyrics'][i])
        print(" \n".join(df['lyrics'][i].split('\n')[0:4]).replace('\r', ''))
        cos = cosine_similarity(np.array(df['lyrics_embedding'][i]).reshape(1, -1), \
            np.array(df['translated_lyrics_embedding'][i]).reshape(1, -1))[0][0]
        print('--- cos', round(cos, 4))
        
        for i in list(df['lyrics'][i] for i in guess[:5]):
            print("\n")
            print('- lyrics from annoy:')
            print("\n".join(i.split('\n')[0:4]).replace('\r', ''))
        # print("guess lyrics")
        # print(df['lyrics'].iloc[i] for i in guess[:5])
        # if guess.index(match) < 10:
        #     matched.append((df['song_name'][i], df['translated_lyrics'][i]))

    # return matched[0]

def test_accuracy_lyrics2emoji_search():
    accuracy = avg_pos(emojis_emb, n=len(emojis_emb))
    logging.critical(f'mean searched pos: {accuracy}')


In [3]:
test_accuracy_lyrics2emoji_search()

------------------------------
annoy position 116
actual data:
👍
🤪🤯💩👈🌙
👈🔮🔝💭
👈🔥👋💨🌙
Yeah 
All the crazy shit I did tonight 
Those would be the best memories 
I just wanna let it go for the night
--- cos 0.7199


- lyrics from annoy:
Welcome
Welcome to The E.N.D
Do not panic, there is nothing to fear
Everything around you is changing


- lyrics from annoy:
Ow
Uh-huh, yeah, yeah
I've known a few guys who thought they were pretty smart
But you've got being right down to an art


- lyrics from annoy:
RedOne
Uh
Let's go to the beach-each, let's go get a wave
They say, what they gonna say


- lyrics from annoy:
Shine bright like a diamond
Shine bright like a diamond
Find light in the beautiful sea
I choose to be happy


- lyrics from annoy:
You would not believe your eyes if ten million fireflies
Lit up the world as I fell asleep
'Cause they'd fill the open air and leave teardrops everywhere
You'd think me rude but I would just stand and stare
------------------------------
annoy position 0
ac

CRITICAL:root:mean searched pos: None


------------------------------
annoy position 0
actual data:
👤🔛1️⃣📞↔️
👤🔜📍👈👩‍🚒🌞
💪🦸‍♂️🚫🎁👤
👤🔛1️⃣📞↔️
I'm only one call away 
I'll be there to save the day 
Superman got nothing on me 
I'm only one call away
--- cos 0.7649


- lyrics from annoy:
I'm only one call away
I'll be there to save the day
Superman got nothing on me
I'm only one call away


- lyrics from annoy:
Hello, hello, baby, you called? I can't hear a thing
I have got no service in the club, you say, say? (Say)
Wha-wha-what did you say, huh? You're breakin' up on me
Sorry, I cannot hear you, I'm kinda busy (I'm kinda busy)


- lyrics from annoy:
Welcome
Welcome to The E.N.D
Do not panic, there is nothing to fear
Everything around you is changing


- lyrics from annoy:
Wait till you're announced
We've not yet lost all our graces
The hounds will stay in chains
Look upon Your Greatness and she'll


- lyrics from annoy:
Where there's a will, there's a way, kinda beautiful
And every night has its day, so magical
And if there's love

In [121]:
import spacy
nlp = spacy.load("en_core_web_sm")

df = pd.read_json('../data/sample_data/top_300_spotify_with_embeddings.json')

df = df.head(5)

def clean_text(lyric):
    doc = nlp(lyric)
    pos_tags = ['AUX', 'INTJ', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'X']
    words = [token.text for token in doc if token.pos_ not in pos_tags] # filter words
    lyric = ' '.join(words).split('\n') # make full string
    lyric = [i.strip() for i in lyric if len(i) > 15] # clear small lines
    lyric = '\n'.join(lyric).split('\n')[:4] # get the first 4 lines only
    lyric = '\n'.join(lyric) # completed string
    
    return lyric

for lyric in df['lyrics'].values:
    print()
    print(clean_text(lyric))



All the crazy shit I did tonight
Those the best memories
I just let it go for the night
That the best therapy for me

Wake up in the morning feelin like
Grab my glasses I out the door I gon na hit this city
I leave brush my teeth with a bottle of
I leave for the night I n't coming back

I supposed to know
something n't right here
I n't let you go
And now you out of sight

It you screaming and no one hear
You almost feel ashamed someone that important
without them you feel like nothing
No one ever understand much it hurts

We good we gold
dream that n't sold
We right we n't
Built a home and watched it burn


In [103]:
doc = nlp("n't")
[token.pos_ for token in doc]

['PART']