In [1]:
import pandas as pd
import numpy as np
import os 
import pickle
#from fuzzywuzzy import process, fuzz
from rapidfuzz import process, utils, fuzz
from collections import Counter
from tqdm import tqdm
pd.options.display.max_columns = None

# checking the dataset's data
Checking the data from the [given](https://www.kaggle.com/datasets/andrewmvd/spotify-playlists?select=spotify_dataset.csv) database.

In [2]:
data = pd.read_csv('../data/spotify_dataset.csv', sep = ',', on_bad_lines='skip')
print(data.shape)
data.head()

(12891680, 4)


Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


check if data has duplicated values

In [3]:
data.columns

Index(['user_id', ' "artistname"', ' "trackname"', ' "playlistname"'], dtype='object')

In [4]:
data[['user_id',	' "artistname"',	' "trackname"']].duplicated().sum()

1488094

In [5]:
data = data.drop_duplicates(subset=[' "artistname"',	' "trackname"'])
print(data.shape)
data.head()

(2819059, 4)


Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


remove unnecesary columns

In [6]:
data_new = data.drop(['user_id', ' "playlistname"'], axis = 1)
print(data_new.shape)
data_new.head()

(2819059, 2)


Unnamed: 0,"""artistname""","""trackname"""
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders..."
2,Tiffany Page,7 Years Too Late
3,Elvis Costello & The Attractions,Accidents Will Happen
4,Elvis Costello,Alison


# check if the data is clean
It is possible that some artist are named similarly and create duplicated values

In [7]:
data_new.dropna(subset=[' "artistname"'], inplace=True)
data_new.dropna(subset=[' "trackname"'], inplace=True)
print(data_new.shape)
data_new.head()

(2790794, 2)


Unnamed: 0,"""artistname""","""trackname"""
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders..."
2,Tiffany Page,7 Years Too Late
3,Elvis Costello & The Attractions,Accidents Will Happen
4,Elvis Costello,Alison


In [8]:
print(len(data_new[' "trackname"'].unique().tolist()), len(data_new[' "artistname"'].unique().tolist()))

2005251 289816


making all `artist` lowercase, removing all spaces and weird characters

In [9]:
data_new['artist_clean'] = data_new[' "artistname"'].str.strip().str.lower().str.replace('&', 'and').str.replace('  ', ' ').str.replace('-', '/').str.replace(' / ', '/')
print(len(data_new['artist_clean'].unique().tolist()), len(data_new[' "artistname"'].unique().tolist()))

281180 289816


doing the same with the `tracks`

In [10]:
data_new['track_clean'] = data_new[' "trackname"'].str.strip().str.lower().str.replace('&', 'and').str.replace('  ', ' ').str.replace('-', '/').str.replace(' / ', '/')
print(len(data_new['track_clean'].unique().tolist()), len(data_new[' "trackname"'].unique().tolist()))

1887608 2005251


In [11]:
data_new

Unnamed: 0,"""artistname""","""trackname""",artist_clean,track_clean
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,elvis costello,(the angels wanna wear my) red shoes
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",elvis costello and the attractions,"(what's so funny 'bout) peace, love and unders..."
2,Tiffany Page,7 Years Too Late,tiffany page,7 years too late
3,Elvis Costello & The Attractions,Accidents Will Happen,elvis costello and the attractions,accidents will happen
4,Elvis Costello,Alison,elvis costello,alison
...,...,...,...,...
12891379,Yoga Pop Ups,Womanizer,yoga pop ups,womanizer
12891449,The Werks,Duck Farm,the werks,duck farm
12891550,Devo,Satisfaction (I Can't Get No),devo,satisfaction (i can't get no)
12891589,Billy Thorpe,Children of the Sun,billy thorpe,children of the sun


In [12]:
data_new = data_new.drop_duplicates(subset=['artist_clean', 'track_clean'])
print(data_new.shape)
data_new.head()

(2691039, 4)


Unnamed: 0,"""artistname""","""trackname""",artist_clean,track_clean
0,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,elvis costello,(the angels wanna wear my) red shoes
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",elvis costello and the attractions,"(what's so funny 'bout) peace, love and unders..."
2,Tiffany Page,7 Years Too Late,tiffany page,7 years too late
3,Elvis Costello & The Attractions,Accidents Will Happen,elvis costello and the attractions,accidents will happen
4,Elvis Costello,Alison,elvis costello,alison


In [13]:
def parecido_artistas(col, dict_artistas):   
    maximo = 0
    for key in dict_artistas.keys():
    #print(key)
        #print(genero)
        comparo = fuzz.ratio(key, col)
        if comparo > maximo:
            maximo = comparo
            resultado = key
    if maximo > 90:
        return resultado
    else:
        return np.nan

Fixing the `artist`s, in order to remove duplicated artist or typos -- replacing by the most common occurance by similarity.

In [14]:
data_new[data_new['artist_clean'].str.contains('costello') == True]['artist_clean'].unique().tolist()[:10]

['elvis costello',
 'elvis costello and the attractions',
 'elvis costello and the roots',
 'katie costello',
 'elvis costello and the imposters',
 'donnacha costello',
 'abbott and costello',
 'bud abbot and lou costello',
 'burt bacharach and elvis costello',
 'elvis costello/the costello show']

In [15]:
fuzz.ratio('elvis costello and the attractions', 'elvis costello/the attractions')

90.625

In [16]:
cuenta_artistas = Counter(artistas for artistas in data_new['artist_clean'])
print(len(cuenta_artistas))
cuenta_artistas

281180


Counter({'elvis costello': 489,
         'elvis costello and the attractions': 252,
         'tiffany page': 4,
         'lissie': 83,
         'paul mccartney': 958,
         'joe echo': 11,
         'the breakers': 13,
         'the coronas': 23,
         'crowded house': 165,
         'joshua radin': 145,
         'cocktail slippers': 23,
         'crosby, stills and nash': 144,
         'the len price 3': 24,
         'paul mccartney and eric clapton': 3,
         'noah and the whale': 73,
         "noel gallagher's high flying birds": 42,
         'pearl jam': 665,
         'tom petty and the heartbreakers': 334,
         'bruce springsteen': 632,
         'madness': 257,
         'spector': 25,
         'miles kane': 51,
         'tom petty': 116,
         'biffy clyro': 261,
         'elbow': 174,
         'oasis': 533,
         'thunderclap newman': 7,
         '2080': 12,
         'c418': 86,
         'glen porter': 20,
         'makeup and vanity set': 26,
         'phaeleh f

In [17]:
dict_artistas = dict(cuenta_artistas)
dict_artistas = {k:v for k,v in dict_artistas.items() if v > 4}
print(len(dict_artistas))
dict_artistas

80815


{'elvis costello': 489,
 'elvis costello and the attractions': 252,
 'lissie': 83,
 'paul mccartney': 958,
 'joe echo': 11,
 'the breakers': 13,
 'the coronas': 23,
 'crowded house': 165,
 'joshua radin': 145,
 'cocktail slippers': 23,
 'crosby, stills and nash': 144,
 'the len price 3': 24,
 'noah and the whale': 73,
 "noel gallagher's high flying birds": 42,
 'pearl jam': 665,
 'tom petty and the heartbreakers': 334,
 'bruce springsteen': 632,
 'madness': 257,
 'spector': 25,
 'miles kane': 51,
 'tom petty': 116,
 'biffy clyro': 261,
 'elbow': 174,
 'oasis': 533,
 'thunderclap newman': 7,
 '2080': 12,
 'c418': 86,
 'glen porter': 20,
 'makeup and vanity set': 26,
 'solar fields': 94,
 'bonobo': 205,
 'slugabed': 47,
 'emancipator': 66,
 'brian blade and the fellowship band': 19,
 'boom bip': 42,
 'disasterpeace': 181,
 'little people': 54,
 'the glitch mob': 82,
 'vangelis': 350,
 'cell': 22,
 'jay haze': 8,
 'prefuse 73': 135,
 'ben prunty': 39,
 'metaform': 61,
 'i monster': 69,
 '

In [18]:
data_test = data_new.sample(1000)

In [19]:
data_test['artist'] = data_test.apply(lambda x: parecido_artistas(x['artist_clean'], dict_artistas), axis=1)
data_test[['artist_clean','artist']].value_counts()

artist_clean             artist                 
wolfgang amadeus mozart  wolfgang amadeus mozart    4
elvis presley            elvis presley              3
ella fitzgerald          ella fitzgerald            3
augustus pablo           augustus pablo             2
count basie              count basie                2
                                                   ..
hallal music             hallal music               1
hazem beltagui           hazem beltagui             1
helena paparizou         helena paparizou           1
henry jackman            henry jackman              1
아마츄어 증폭기                 아마츄어 증폭기                   1
Length: 877, dtype: int64

In [20]:
#data_test.to_csv('test.csv', index=False, sep=';')

In [21]:
tqdm.pandas()

data_new['artist'] = data_new.progress_apply(lambda x: parecido_artistas(x['artist_clean'], dict_artistas), axis=1)
data_new[['artist_clean','artist']].value_counts()

 15%|█▍        | 401798/2691039 [4:08:44<24:18:58, 26.15it/s] 

In [None]:
print(len(data_new[' "artistname"'].unique().tolist()), len(data_new['artist'].unique().tolist()))

Doing the same process with the `tracks`, in order to remove typos.

In [None]:
data_new.head()

In [None]:
data_new[data_new['track_clean'].str.contains("satisfaction") == True]['track_clean'].unique().tolist()[:10]

In [None]:
fuzz.ratio('(i can’t get no) satisfaction', "(i can't get no) satisfaction/(original single mono version)")

In [None]:
cuenta_tracks = Counter(track for track in data_new['track_clean'])
print(len(cuenta_tracks))
cuenta_tracks

In [None]:
dict_tracks = dict(cuenta_tracks)
dict_tracks = {k:v for k,v in dict_tracks.items() if v > 4}
print(len(dict_tracks))
dict_tracks

In [None]:
def parecido_tracks(col, dict_tracks):   
    maximo = 0
    for key in dict_tracks.keys():
    #print(key)
        #print(genero)
        comparo = fuzz.ratio(key, col)
        if comparo > maximo:
            maximo = comparo
            resultado = key
    if maximo > 60:
        return resultado
    else:
        return np.nan

In [None]:
data_new.head()

In [None]:
tqdm.pandas()

data_new['track'] = data_new.progress_apply(lambda x: parecido_tracks(x['track_clean'], dict_artistas), axis=1)
#data_new[['track_clean','track']].value_counts()

In [None]:
print(len(data_new[' "artistname"'].unique().tolist()), len(data_new['artist'].unique().tolist()))

In [None]:
print(len(data_new[' "trackname"'].unique().tolist()), len(data_new['track'].unique().tolist()))

# Conversion
I will save the input file as .pkl as the original one is too big for github.

In [None]:
data_final = data_new.drop_duplicates(subset=['artist_clean', 'track_clean'])
print(data_final.shape)
data_final.head()

In [None]:
with open('../data/spotify_dataset.pickle', 'wb') as data_nowplaying:
    pickle.dump(data_new[['artist_clean', 'track_clean']], data_nowplaying)