In [1]:
import pandas as pd
import numpy as np
import os 
import pickle
#from fuzzywuzzy import process, fuzz
from rapidfuzz import process, utils, fuzz
from collections import Counter
from tqdm import tqdm
pd.options.display.max_columns = None

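# Checking the dataset's data
Checking the data from the [given](https://www.kaggle.com/datasets/andrewmvd/spotify-playlists?select=spotify_dataset.csv) dataset. Note: the cell below loads `spotify_millsongdata.csv`, so confirm that this link points to the dataset actually used.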

In [2]:
# Read the raw lyrics CSV; rows that cannot be parsed are skipped.
data = pd.read_csv(
    '../data/spotify_millsongdata.csv',
    sep=',',
    on_bad_lines='skip',
)
print(data.shape)
data.head()

(57650, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


Check whether the data contains duplicated (`artist`, `song`) pairs.

In [3]:
# Show the column labels of the loaded frame.
data.columns

Index(['artist', 'song', 'link', 'text'], dtype='object')

In [4]:
# Count rows whose (artist, song) pair repeats an earlier row.
data.duplicated(subset=['artist', 'song']).sum()

2

In [5]:
# Keep the first row of every (artist, song) pair.
claves_duplicado = ['artist', 'song']
data = data.drop_duplicates(subset=claves_duplicado)
print(data.shape)
data.head()

(57648, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


Remove unnecessary columns.

In [6]:
# Work on a copy of the data without the `link` column.
data_new = data.drop(columns=['link'])
print(data_new.shape)
data_new.head()

(57648, 3)


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


# Check if the data is clean
Some artists may be written in slightly different ways, which would create duplicated values.

In [7]:
# Drop rows missing either the artist or the song title.
# A single non-inplace call keeps the cell a plain rebinding of `data_new`
# instead of mutating the frame twice.
data_new = data_new.dropna(subset=['artist', 'song'])
print(data_new.shape)
data_new.head()

(57648, 3)


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [8]:
# Number of distinct artist names and distinct song titles.
n_artistas = len(data_new['artist'].unique())
n_canciones = len(data_new['song'].unique())
print('unique artists:', n_artistas, '; unique songs (with unique names):', n_canciones)

unique artists: 643 ; unique songs (with unique names): 44824


Normalising `artist`: trimming surrounding whitespace, lowercasing, replacing `&` with `and`, collapsing doubled spaces, and unifying the `-` and ` / ` separators as `/`.

In [9]:
# Normalise artist names so that spelling variants collapse onto one value.
# `regex` is passed explicitly because the default of `str.replace` differs
# between pandas versions.
data_new['artist_clean'] = (
    data_new['artist']
    .str.strip()
    .str.lower()
    .str.replace('&', 'and', regex=False)
    # Collapse any run of spaces; a single '  ' -> ' ' pass leaves 3+ spaces as 2.
    .str.replace(r' {2,}', ' ', regex=True)
    .str.replace('-', '/', regex=False)
    .str.replace(' / ', '/', regex=False)
)
# Distinct names after vs. before normalisation.
print(len(data_new['artist_clean'].unique().tolist()), len(data_new['artist'].unique().tolist()))

643 643


Doing the same with the track names (`song`), except that `-` is left untouched.

In [10]:
# Normalise song titles so that spelling variants collapse onto one value.
# `regex` is passed explicitly because the default of `str.replace` differs
# between pandas versions.
data_new['track_clean'] = (
    data_new['song']
    .str.strip()
    .str.lower()
    .str.replace('&', 'and', regex=False)
    # Collapse any run of spaces; a single '  ' -> ' ' pass leaves 3+ spaces as 2.
    .str.replace(r' {2,}', ' ', regex=True)
    .str.replace(' / ', '/', regex=False)
)
# Distinct titles after vs. before normalisation.
print(len(data_new['track_clean'].unique().tolist()), len(data_new['song'].unique().tolist()))

44757 44824


In [11]:
# Display the frame, now including the artist_clean and track_clean columns.
data_new

Unnamed: 0,artist,song,text,artist_clean,track_clean
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante"
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang
...,...,...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...,ziggy marley,good old days
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...,ziggy marley,hand to mouth
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...,zwan,come with me
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...,zwan,desire


In [12]:
# Remove rows that became duplicates once the names were normalised.
claves_limpias = ['artist_clean', 'track_clean']
data_new = data_new.drop_duplicates(subset=claves_limpias)
print(data_new.shape)
data_new.head()

(57648, 5)


Unnamed: 0,artist,song,text,artist_clean,track_clean
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",abba,ahe's my kind of girl
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",abba,"andante, andante"
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,abba,as good as new
3,ABBA,Bang,Making somebody happy is a question of give an...,abba,bang
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,abba,bang-a-boomerang


In [13]:
def parecido_artistas(col, dict_artistas, umbral=90):
    """Return the key of ``dict_artistas`` that is most similar to ``col``.

    Every key is scored against ``col`` with ``fuzz.ratio``. When several
    keys share the highest score, the first one in iteration order wins.

    Parameters
    ----------
    col : str
        Name to look up.
    dict_artistas : dict
        Mapping whose keys are the candidate names; the values are not read.
    umbral : float, optional
        The best score must be strictly greater than this value to count as
        a match. Defaults to 90.

    Returns
    -------
    str or float
        The best-matching key, or ``np.nan`` when no key scores above
        ``umbral`` (which includes an empty ``dict_artistas``).
    """
    resultado = np.nan
    maximo = 0
    for key in dict_artistas:
        comparo = fuzz.ratio(key, col)
        # Strict comparison keeps the first key that reached the best score.
        if comparo > maximo:
            maximo = comparo
            resultado = key
    return resultado if maximo > umbral else np.nan

Fixing the `artist` values in order to remove duplicated artists or typos: each name is replaced by the most similar name among the frequently occurring ones.

In [14]:
# First ten distinct cleaned artist names that contain 'costello'.
es_costello = data_new['artist_clean'].str.contains('costello', na=False)
data_new.loc[es_costello, 'artist_clean'].unique().tolist()[:10]

['elvis costello']

In [15]:
# Similarity score between two spellings of the same act.
nombre_con_and = 'elvis costello and the attractions'
nombre_con_barra = 'elvis costello/the attractions'
fuzz.ratio(nombre_con_and, nombre_con_barra)

90.625

In [16]:
# Number of rows per cleaned artist name.
cuenta_artistas = Counter(data_new['artist_clean'])
print(len(cuenta_artistas))
cuenta_artistas

643


Counter({'abba': 113,
         'ace of base': 74,
         'adam sandler': 70,
         'adele': 54,
         'aerosmith': 171,
         'air supply': 174,
         'aiza seguerra': 25,
         'alabama': 187,
         'alan parsons project': 102,
         'aled jones': 23,
         'alice cooper': 174,
         'alice in chains': 95,
         'alison krauss': 145,
         'allman brothers band': 116,
         'alphaville': 105,
         'america': 184,
         'amy grant': 147,
         'andrea bocelli': 25,
         'andy williams': 138,
         'annie': 32,
         'ariana grande': 51,
         'ariel rivera': 19,
         'arlo guthrie': 113,
         'arrogant worms': 89,
         'avril lavigne': 143,
         'backstreet boys': 164,
         'barbie': 18,
         'barbra streisand': 157,
         'beach boys': 151,
         'the beatles': 178,
         'beautiful south': 149,
         'beauty and the beast': 12,
         'bee gees': 170,
         'bette midler': 158,
     

In [17]:
# Keep only the artist names that appear more than 4 times.
dict_artistas = {
    artista: veces
    for artista, veces in cuenta_artistas.items()
    if veces > 4
}
print(len(dict_artistas))
dict_artistas

630


{'abba': 113,
 'ace of base': 74,
 'adam sandler': 70,
 'adele': 54,
 'aerosmith': 171,
 'air supply': 174,
 'aiza seguerra': 25,
 'alabama': 187,
 'alan parsons project': 102,
 'aled jones': 23,
 'alice cooper': 174,
 'alice in chains': 95,
 'alison krauss': 145,
 'allman brothers band': 116,
 'alphaville': 105,
 'america': 184,
 'amy grant': 147,
 'andrea bocelli': 25,
 'andy williams': 138,
 'annie': 32,
 'ariana grande': 51,
 'ariel rivera': 19,
 'arlo guthrie': 113,
 'arrogant worms': 89,
 'avril lavigne': 143,
 'backstreet boys': 164,
 'barbie': 18,
 'barbra streisand': 157,
 'beach boys': 151,
 'the beatles': 178,
 'beautiful south': 149,
 'beauty and the beast': 12,
 'bee gees': 170,
 'bette midler': 158,
 'bill withers': 35,
 'billie holiday': 150,
 'billy joel': 141,
 'bing crosby': 157,
 'black sabbath': 156,
 'blur': 136,
 'bob dylan': 188,
 'bob marley': 86,
 'bob rivers': 48,
 'bob seger': 158,
 'bon jovi': 181,
 'boney m.': 98,
 'bonnie raitt': 149,
 'bosson': 52,
 'brea

In [18]:
# Draw a reproducible sample of up to 1000 rows for a quick trial run.
# A fixed random_state makes Restart & Run All return the same rows, and the
# size is capped so that a frame with fewer than 1000 rows does not raise.
data_test = data_new.sample(min(1000, len(data_new)), random_state=42)

In [19]:
# Replace `artist` in the sample with the closest frequent artist name.
data_test['artist'] = data_test['artist_clean'].apply(
    lambda nombre: parecido_artistas(nombre, dict_artistas)
)
data_test[['artist_clean', 'artist']].value_counts()

artist_clean        artist            
glen campbell       glen campbell         9
amy grant           amy grant             9
iggy pop            iggy pop              7
robbie williams     robbie williams       7
kris kristofferson  kris kristofferson    7
                                         ..
patti smith         patti smith           1
george harrison     george harrison       1
gary valenciano     gary valenciano       1
garth brooks        garth brooks          1
zz top              zz top                1
Length: 457, dtype: int64

In [20]:
#data_test.to_csv('test.csv', index=False, sep=';')

In [23]:
# Register the progress_apply methods on pandas objects.
tqdm.pandas()

# Closest frequent artist name for every row of the full frame.
data_new['artist_clean_new'] = data_new['artist_clean'].progress_apply(
    lambda nombre: parecido_artistas(nombre, dict_artistas)
)
data_new[['artist_clean', 'artist_clean_new']].value_counts()

100%|██████████| 57648/57648 [00:04<00:00, 12623.03it/s]


artist_clean           artist_clean_new     
engelbert humperdinck  engelbert humperdinck    125
planetshakers          planetshakers            116
queensryche            queensryche               91
evanescence            evanescence               77
proclaimers            proclaimers               76
soundgarden            soundgarden               72
hooverphonic           hooverphonic              56
whiskeytown            whiskeytown               53
housemartins           housemartins              23
quarterflash           quarterflash              23
youngbloodz            youngbloodz               19
dtype: int64

In [24]:
# Flag rows whose matched artist name equals the cleaned artist name.
sin_cambio = data_new['artist_clean_new'] == data_new['artist_clean']
data_new['check'] = np.where(sin_cambio, 'same', 'different')
data_new['check'].value_counts()

different    56917
same           731
Name: check, dtype: int64

In [25]:
# Rows where the matched name differs from the cleaned name.
es_diferente = data_new['check'] == 'different'
data_new.loc[es_diferente]

Unnamed: 0,artist,song,text,artist_clean,track_clean,check,artist_clean_new
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",,ahe's my kind of girl,different,
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",,"andante, andante",different,
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,,as good as new,different,
3,ABBA,Bang,Making somebody happy is a question of give an...,,bang,different,
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,,bang-a-boomerang,different,
...,...,...,...,...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...,,good old days,different,
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...,,hand to mouth,different,
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...,,come with me,different,
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...,,desire,different,


In [None]:
# Distinct artist names before vs. after fuzzy matching.
# The frame has no ' "artistname"' column (that lookup raises KeyError);
# the raw names are in `artist` and the matched ones in `artist_clean_new`.
print(len(data_new['artist'].unique().tolist()), len(data_new['artist_clean_new'].unique().tolist()))

Doing the same process with the `tracks`, in order to remove typos.

In [None]:
# Preview the first rows before processing the track names.
data_new.head()

In [None]:
# First ten distinct cleaned track names that contain "satisfaction".
es_satisfaction = data_new['track_clean'].str.contains("satisfaction", na=False)
data_new.loc[es_satisfaction, 'track_clean'].unique().tolist()[:10]

In [None]:
# Similarity score between a plain title and a variant with a version suffix.
titulo_base = '(i can’t get no) satisfaction'
titulo_version = "(i can't get no) satisfaction/(original single mono version)"
fuzz.ratio(titulo_base, titulo_version)

In [None]:
# Number of rows per cleaned track name.
cuenta_tracks = Counter(data_new['track_clean'])
print(len(cuenta_tracks))
cuenta_tracks

In [None]:
# Keep only the track names that appear more than 4 times.
dict_tracks = {
    titulo: veces
    for titulo, veces in cuenta_tracks.items()
    if veces > 4
}
print(len(dict_tracks))
dict_tracks

In [None]:
def parecido_tracks(col, dict_tracks, umbral=60):
    """Return the key of ``dict_tracks`` that is most similar to ``col``.

    Every key is scored against ``col`` with ``fuzz.ratio``. When several
    keys share the highest score, the first one in iteration order wins.

    Parameters
    ----------
    col : str
        Track name to look up.
    dict_tracks : dict
        Mapping whose keys are the candidate names; the values are not read.
    umbral : float, optional
        The best score must be strictly greater than this value to count as
        a match. Defaults to 60.

    Returns
    -------
    str or float
        The best-matching key, or ``np.nan`` when no key scores above
        ``umbral`` (which includes an empty ``dict_tracks``).
    """
    resultado = np.nan
    maximo = 0
    for key in dict_tracks:
        comparo = fuzz.ratio(key, col)
        # Strict comparison keeps the first key that reached the best score.
        if comparo > maximo:
            maximo = comparo
            resultado = key
    return resultado if maximo > umbral else np.nan

In [None]:
# Preview the first rows before matching the track names.
data_new.head()

In [None]:
# Register the progress_apply methods on pandas objects.
tqdm.pandas()

# Closest frequent track name for every row. The lookup table has to be
# dict_tracks: passing dict_artistas would compare titles with artist names.
data_new['track'] = data_new.progress_apply(lambda x: parecido_tracks(x['track_clean'], dict_tracks), axis=1)
#data_new[['track_clean','track']].value_counts()

In [None]:
# Distinct artist names before vs. after fuzzy matching.
# The frame has no ' "artistname"' column (that lookup raises KeyError);
# the raw names are in `artist` and the matched ones in `artist_clean_new`.
print(len(data_new['artist'].unique().tolist()), len(data_new['artist_clean_new'].unique().tolist()))

In [None]:
# Distinct track names before vs. after fuzzy matching.
# The frame has no ' "trackname"' column (that lookup raises KeyError);
# the raw titles are in `song` and the matched ones in `track`.
print(len(data_new['song'].unique().tolist()), len(data_new['track'].unique().tolist()))

# Conversion
I will save the cleaned data as a `.pickle` file, as the original one is too big for GitHub.

In [None]:
# Final frame: one row per (artist_clean, track_clean) pair.
claves_finales = ['artist_clean', 'track_clean']
data_final = data_new.drop_duplicates(subset=claves_finales)
print(data_final.shape)
data_final.head()

In [None]:
# Persist the de-duplicated frame (data_final) rather than data_new, so the
# saved file matches the frame built and inspected in the previous cell.
with open('../data/spotify_dataset.pickle', 'wb') as archivo_pickle:
    pickle.dump(data_final[['artist_clean', 'track_clean']], archivo_pickle)