In [26]:
import pandas as pd
import numpy as np
import os 
import pickle
#from fuzzywuzzy import process, fuzz
from rapidfuzz import process, utils, fuzz
from collections import Counter
from tqdm import tqdm
pd.options.display.max_columns = None

# checking the dataset's data
Checking the data from the [given](https://www.kaggle.com/datasets/andrewmvd/spotify-playlists?select=spotify_dataset.csv) database.

In [27]:
data = pd.read_csv('../data/spotify_millsongdata.csv', sep = ',', on_bad_lines='skip')
print(data.shape)
data.head()

(57650, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


check if data has duplicated values

In [28]:
data.columns

Index(['artist', 'song', 'link', 'text'], dtype='object')

In [29]:
data[['artist', 'song']].duplicated().sum()

2

In [30]:
data = data.drop_duplicates(subset=['artist', 'song'])
print(data.shape)
data.head()

(57648, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


remove unnecesary columns

In [31]:
data_new = data.drop(['link'], axis = 1)
print(data_new.shape)
data_new.head()

(57648, 3)


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


# check if the data is clean
It is possible that some artist are named similarly and create duplicated values

In [32]:
data_new.dropna(subset=['artist'], inplace=True)
data_new.dropna(subset=['song'], inplace=True)
print(data_new.shape)
data_new.head()

(57648, 3)


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [33]:
print('unique artists:',len(data_new['artist'].unique().tolist()),'; unique songs (with unique names):',len(data_new['song'].unique().tolist()))

unique artists: 643 ; unique songs (with unique names): 44824


making all `artist` lowercase, removing all spaces and weird characters

In [34]:
data_new['artist_clean'] = data_new['artist'].str.strip().str.lower().str.replace('&', 'and').str.replace('  ', ' ').str.replace('-', '/').str.replace(' / ', '/')
print(len(data_new['artist_clean'].unique().tolist()), len(data_new['artist'].unique().tolist()))

643 643


doing the same with the `tracks`

In [35]:
data_new['track_clean'] = data_new['song'].str.strip().str.lower().str.replace('&', 'and').str.replace('  ', ' ').str.replace(' / ', '/')
print(len(data_new['track_clean'].unique().tolist()), len(data_new['song'].unique().tolist()))

44757 44824


In [36]:
data_new

Unnamed: 0,artist,song,text,artist_clean,track_clean
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante"
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang
...,...,...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...,ziggy marley,good old days
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...,ziggy marley,hand to mouth
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...,zwan,come with me
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...,zwan,desire


In [37]:
data_new = data_new.drop_duplicates(subset=['artist_clean', 'track_clean'])
print(data_new.shape)
data_new.head()

(57648, 5)


Unnamed: 0,artist,song,text,artist_clean,track_clean
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante"
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang


In [38]:
def parecido_artistas(col, dict_artistas):   
    maximo = 0
    for key in dict_artistas.keys():
    #print(key)
        #print(genero)
        comparo = fuzz.ratio(key, col)
        if comparo > maximo:
            maximo = comparo
            resultado = key
    if maximo > 90:
        return resultado
    else:
        return np.nan

Fixing the `artist`s, in order to remove duplicated artist or typos -- replacing by the most common occurance by similarity.

In [39]:
data_new[data_new['artist_clean'].str.contains('costello') == True]['artist_clean'].unique().tolist()[:10]

['elvis costello']

In [40]:
fuzz.ratio('elvis costello and the attractions', 'elvis costello/the attractions')

90.625

In [41]:
cuenta_artistas = Counter(artistas for artistas in data_new['artist_clean'])
print(len(cuenta_artistas))
cuenta_artistas

643


Counter({'abba': 113,
         'ace of base': 74,
         'adam sandler': 70,
         'adele': 54,
         'aerosmith': 171,
         'air supply': 174,
         'aiza seguerra': 25,
         'alabama': 187,
         'alan parsons project': 102,
         'aled jones': 23,
         'alice cooper': 174,
         'alice in chains': 95,
         'alison krauss': 145,
         'allman brothers band': 116,
         'alphaville': 105,
         'america': 184,
         'amy grant': 147,
         'andrea bocelli': 25,
         'andy williams': 138,
         'annie': 32,
         'ariana grande': 51,
         'ariel rivera': 19,
         'arlo guthrie': 113,
         'arrogant worms': 89,
         'avril lavigne': 143,
         'backstreet boys': 164,
         'barbie': 18,
         'barbra streisand': 157,
         'beach boys': 151,
         'the beatles': 178,
         'beautiful south': 149,
         'beauty and the beast': 12,
         'bee gees': 170,
         'bette midler': 158,
     

In [49]:
dict_artistas = dict(cuenta_artistas)
dict_artistas = {k:v for k,v in dict_artistas.items() if v > 0}
print(len(dict_artistas))
dict_artistas#.get(3)

643


{'abba': 113,
 'ace of base': 74,
 'adam sandler': 70,
 'adele': 54,
 'aerosmith': 171,
 'air supply': 174,
 'aiza seguerra': 25,
 'alabama': 187,
 'alan parsons project': 102,
 'aled jones': 23,
 'alice cooper': 174,
 'alice in chains': 95,
 'alison krauss': 145,
 'allman brothers band': 116,
 'alphaville': 105,
 'america': 184,
 'amy grant': 147,
 'andrea bocelli': 25,
 'andy williams': 138,
 'annie': 32,
 'ariana grande': 51,
 'ariel rivera': 19,
 'arlo guthrie': 113,
 'arrogant worms': 89,
 'avril lavigne': 143,
 'backstreet boys': 164,
 'barbie': 18,
 'barbra streisand': 157,
 'beach boys': 151,
 'the beatles': 178,
 'beautiful south': 149,
 'beauty and the beast': 12,
 'bee gees': 170,
 'bette midler': 158,
 'bill withers': 35,
 'billie holiday': 150,
 'billy joel': 141,
 'bing crosby': 157,
 'black sabbath': 156,
 'blur': 136,
 'bob dylan': 188,
 'bob marley': 86,
 'bob rivers': 48,
 'bob seger': 158,
 'bon jovi': 181,
 'boney m.': 98,
 'bonnie raitt': 149,
 'bosson': 52,
 'brea

In [50]:
data_test = data_new.sample(1000)

In [51]:
data_test['artist'] = data_test.apply(lambda x: parecido_artistas(x['artist_clean'], dict_artistas), axis=1)
data_test[['artist_clean','artist']].value_counts()

artist_clean       artist           
ozzy osbourne      ozzy osbourne        7
hank williams jr.  hank williams jr.    6
misfits            misfits              6
kenny rogers       kenny rogers         6
kenny chesney      kenny chesney        6
                                       ..
pharrell williams  pharrell williams    1
george michael     george michael       1
pitbull            pitbull              1
poison             poison               1
lloyd cole         lloyd cole           1
Length: 458, dtype: int64

In [52]:
#data_test.to_csv('test.csv', index=False, sep=';')

In [53]:
tqdm.pandas()

data_new['artist_clean_new'] = data_new.progress_apply(lambda x: parecido_artistas(x['artist_clean'], dict_artistas), axis=1)
data_new[['artist_clean','artist_clean_new']].value_counts()

100%|██████████| 57648/57648 [00:15<00:00, 3621.75it/s]


artist_clean      artist_clean_new
donna summer      donna summer        191
gordon lightfoot  gordon lightfoot    189
george strait     george strait       188
bob dylan         bob dylan           188
cher              cher                187
                                     ... 
ungu              ungu                  2
zed               zed                   1
x/treme           x/treme               1
zoe               zoe                   1
u/kiss            u/kiss                1
Length: 643, dtype: int64

In [54]:
data_new['check'] = np.where(data_new['artist_clean_new'] == data_new['artist_clean'], 'same', 'different' )
data_new['check'].value_counts()

same    57648
Name: check, dtype: int64

In [56]:
data_new[data_new['check']== 'same']

Unnamed: 0,artist,song,text,artist_clean,track_clean,artist_clean_new,check
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl,abba,same
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante",abba,same
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new,abba,same
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang,abba,same
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang,abba,same
...,...,...,...,...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...,ziggy marley,good old days,ziggy marley,same
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...,ziggy marley,hand to mouth,ziggy marley,same
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...,zwan,come with me,zwan,same
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...,zwan,desire,zwan,same


Doing the same process with the `tracks`, in order to remove typos.

In [58]:
data_new.head()

Unnamed: 0,artist,song,text,artist_clean,track_clean,artist_clean_new,check
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl,abba,same
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante",abba,same
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new,abba,same
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang,abba,same
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang,abba,same


In [59]:
data_new[data_new['track_clean'].str.contains("satisfaction") == True]['track_clean'].unique().tolist()[:10]

['satisfaction (encore)', 'satisfaction']

In [61]:
cuenta_tracks = Counter(track for track in data_new['track_clean'])
print(len(cuenta_tracks))
cuenta_tracks

44757


Counter({"ahe's my kind of girl": 1,
         'andante, andante': 1,
         'as good as new': 1,
         'bang': 4,
         'bang-a-boomerang': 1,
         'burning my bridges': 1,
         'cassandra': 1,
         'chiquitita': 1,
         'crazy world': 4,
         'crying over you': 2,
         'dance': 6,
         'dancing queen': 4,
         'disillusion': 1,
         'does your mother know': 2,
         'dream world': 2,
         'dum dum diddle': 1,
         'eagle': 1,
         'every good man': 1,
         'fernando': 1,
         'fernando (in spanish)': 1,
         'free as a bumble bee': 1,
         'from a twinkling star to a passing angel': 1,
         'gimme gimme gimme': 1,
         "givin' a little bit more": 1,
         'gonna sing you my lovesong': 1,
         'hamlet iii': 1,
         'happy hawaii': 1,
         'happy new year': 3,
         'he is your brother': 1,
         'head over heels': 4,
         "here we'll stay": 1,
         'hey hey helen': 1,
       

In [62]:
dict_tracks = dict(cuenta_tracks)
dict_tracks = {k:v for k,v in dict_tracks.items() if v > 4}
print(len(dict_tracks))
dict_tracks

808


{'dance': 6,
 'lovers': 5,
 'move on': 9,
 'so long': 7,
 "don't stop": 14,
 'perfect world': 7,
 'hello': 11,
 'i miss you': 5,
 'now and then': 8,
 'someone like you': 10,
 'fever': 14,
 "i'm ready": 7,
 'lay it down': 6,
 'my girl': 10,
 'remember': 7,
 'somebody': 5,
 'sunshine': 10,
 'the other side': 7,
 'after all': 10,
 'all by myself': 7,
 'always': 19,
 'black and blue': 9,
 'come to me': 9,
 'crazy love': 12,
 'do it again': 10,
 "don't walk away": 6,
 'end of the line': 8,
 'evil woman': 5,
 'faith': 8,
 'here i am': 11,
 'i want you': 17,
 "i'm alive": 15,
 'miracles': 5,
 'now and forever': 5,
 'o come all ye faithful': 9,
 'old habits die hard': 5,
 'sleigh ride': 7,
 'someone': 7,
 'sweet dreams': 5,
 'the first noel': 11,
 'winter wonderland': 16,
 'if': 9,
 'open arms': 7,
 'sorry': 10,
 'down by the riverside': 6,
 'goodbye': 19,
 'here we are': 5,
 'hollywood': 13,
 'if i had you': 5,
 'games people play': 5,
 'time': 9,
 'give it up': 14,
 'lullaby': 13,
 'love son

In [71]:
def parecido_tracks(col, dict_tracks):   
    maximo = 0
    for key in dict_tracks.keys():
    #print(key)
        #print(genero)
        comparo = fuzz.ratio(key, col)
        if comparo > maximo:
            maximo = comparo
            resultado = key
    if maximo > 70:
        return resultado
    else:
        return np.nan

In [64]:
data_new.head()

Unnamed: 0,artist,song,text,artist_clean,track_clean,artist_clean_new,check
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl,abba,same
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante",abba,same
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new,abba,same
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang,abba,same
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang,abba,same


In [72]:
tqdm.pandas()

data_new['track_clean_new'] = data_new.progress_apply(lambda x: parecido_tracks(x['track_clean'], dict_tracks), axis=1)
data_new[['track_clean','track_clean_new']].value_counts()

100%|██████████| 57648/57648 [00:20<00:00, 2811.19it/s]


track_clean                             track_clean_new                       
have yourself a merry little christmas  have yourself a merry little christmas    35
angel                                   angel                                     28
hold on                                 hold on                                   28
home                                    home                                      27
i believe                               i believe                                 26
                                                                                  ..
hold me now                             hold me                                    1
hold me lord                            hold me                                    1
hold me back                            come back                                  1
hold 'em joe                            hold me                                    1
zero                                    hero                           

In [73]:
data_new['check'] = np.where(data_new['track_clean_new'] == data_new['track_clean'], 'same', 'different')
data_new['check'].value_counts()

different    51543
same          6105
Name: check, dtype: int64

In [74]:
data_new[data_new['check'] == 'different']

Unnamed: 0,artist,song,text,artist_clean,track_clean,artist_clean_new,check,track_clean_new
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl,abba,different,
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante",abba,different,
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new,abba,different,
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang,abba,different,
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang,abba,different,
...,...,...,...,...,...,...,...,...
57643,Ziggy Marley,G7,Seven richest countries in the world \r\nThem...,ziggy marley,g7,ziggy marley,different,
57644,Ziggy Marley,Generation,Many generation have passed away \r\nFighting...,ziggy marley,generation,ziggy marley,different,my generation
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...,ziggy marley,good old days,ziggy marley,different,
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...,ziggy marley,hand to mouth,ziggy marley,different,


In [None]:
print(len(data_new[' "trackname"'].unique().tolist()), len(data_new['track'].unique().tolist()))

# Conversion
I will save the input file as .pkl as the original one is too big for github.

In [None]:
data_final = data_new.drop_duplicates(subset=['artist_clean', 'track_clean'])
print(data_final.shape)
data_final.head()

In [None]:
with open('../data/spotify_dataset.pickle', 'wb') as data_nowplaying:
    pickle.dump(data_new[['artist_clean', 'track_clean']], data_nowplaying)