In [19]:
from pathlib import Path
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
from songs_analyzer.config import get_root_path

from sklearn.neighbors import NearestNeighbors

In [2]:
songs_data_path = get_root_path() / "data/raw/song_feature_data.csv"

songs_df = pd.read_csv(songs_data_path)

In [3]:
songs_df.head()

Unnamed: 0,uri,name,artist,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,spotify:track:7fgC7BPn2SzVno2d7ooLGI,Rufus Wainwright Spotify Pride Intro,Various Artists,0,0.0,0.629,10,-8.432,0,0.0,0.348,0.0,0.383,0.0,0.0,13750,0
1,spotify:track:3l6EkMrhwXPSVEuTDksWB8,Cigarettes And Chocolate Milk,Rufus Wainwright,44,0.328,0.357,4,-9.939,1,0.0274,0.791,0.0,0.0832,0.161,97.816,280360,4
2,spotify:track:2PwYIEV1H34mbQBvbnnmLx,Over the Rainbow,Judy Garland,0,0.231,0.0426,8,-21.141,1,0.0426,0.901,0.000136,0.153,0.199,77.74,166467,4
3,spotify:track:6bFrlORduDtYQ9BlPenV3o,My Baby Just Cares For Me,Nina Simone,0,0.76,0.196,11,-17.872,0,0.111,0.812,2e-06,0.0845,0.418,118.348,216689,4
4,spotify:track:3uxg8Hl1NFVySLDL0euMbu,Cre Spoda,Klaus Nomi,16,0.234,0.858,8,-6.145,0,0.071,0.337,0.459,0.228,0.2,127.745,183947,4


In [23]:
music_name_feature = [simple_preprocess(str(music_name).lower()) for music_name in songs_df["name"]]
music_name_tagged = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(music_name_feature)]
music_name_tagged

[TaggedDocument(words=['rufus', 'wainwright', 'spotify', 'pride', 'intro'], tags=[0]),
 TaggedDocument(words=['cigarettes', 'and', 'chocolate', 'milk'], tags=[1]),
 TaggedDocument(words=['over', 'the', 'rainbow'], tags=[2]),
 TaggedDocument(words=['my', 'baby', 'just', 'cares', 'for', 'me'], tags=[3]),
 TaggedDocument(words=['cre', 'spoda'], tags=[4]),
 TaggedDocument(words=['dance', 'disco', 'heat'], tags=[5]),
 TaggedDocument(words=['do', 'you', 'really', 'want', 'to', 'hurt', 'me'], tags=[6]),
 TaggedDocument(words=['freedom'], tags=[7]),
 TaggedDocument(words=['goodbye', 'yellow', 'brick', 'road'], tags=[8]),
 TaggedDocument(words=['sometimes', 'you', 'need'], tags=[9]),
 TaggedDocument(words=['it', 'sin'], tags=[10]),
 TaggedDocument(words=['rufus', 'wainwright', 'on', 'pride'], tags=[11]),
 TaggedDocument(words=['swings', 'both', 'ways'], tags=[12]),
 TaggedDocument(words=['come', 'rain', 'or', 'come', 'shine', 'live'], tags=[13]),
 TaggedDocument(words=['wish', 'knew', 'how', 'i

In [24]:
model = Doc2Vec(vector_size=100, min_count=1, epochs=50)

In [25]:
model.build_vocab(music_name_tagged)

In [28]:
model.dv.vectors

array([[-0.00523082, -0.00597913, -0.00988075, ..., -0.00201204,
         0.00889517,  0.00235126],
       [-0.00556037, -0.00318053, -0.00597086, ..., -0.00406312,
         0.00215229,  0.00532112],
       [-0.00787459, -0.0092044 ,  0.00841427, ..., -0.00115433,
         0.00969793, -0.00679331],
       ...,
       [-0.00279424,  0.00458743, -0.00997371, ..., -0.00242311,
         0.00388322,  0.00875279],
       [ 0.00850275, -0.00704437, -0.00668973, ...,  0.00165882,
        -0.00107105,  0.00684784],
       [ 0.00131204, -0.00914756,  0.00945708, ..., -0.0017827 ,
         0.00580041,  0.00186356]], dtype=float32)

In [30]:
preprocessed_music_names = pd.DataFrame(model.dv.vectors)
preprocessed_music_names.to_csv(get_root_path() / "data" / "processed" / "vectorized_music_names.csv", index=False)