In [2]:
import pandas as pd
import numpy as np
import pickle as pkl
import re
import io
import itertools
import math
import operator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None

# Importing Raw Data

In [3]:
#importing track
track=pd.read_csv('csv/tracks.csv')
track.shape

(586672, 20)

In [12]:
#importing unrefined artists dataset with genre, where many artists have null genre 
artist=pd.read_csv('csv/artists.csv')
artist.shape

(1162095, 5)

In [5]:
track.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [6]:
artist.head()

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


# Refining Data

In [13]:
#artist_refined dataframe where each artists has atleast 1 genre
artist_refined=artist[artist.genres!="[]"]
artist_refined.count()
#this is the count of number of artists which have atleast one assigned genre

id            305595
followers     305590
genres        305595
name          305595
popularity    305595
dtype: int64

In [14]:
artist_refined.reset_index(inplace=True,drop=True)
artist_refined.head()

Unnamed: 0,id,followers,genres,name,popularity
0,0VLMVnVbJyJ4oyZs2L3Yl2,71.0,['carnaval cadiz'],Las Viudas De Los Bisabuelos,6
1,0dt23bs4w8zx154C5xdVyl,63.0,['carnaval cadiz'],Los De Capuchinos,5
2,0pGhoB99qpEJEsBQxgaskQ,64.0,['carnaval cadiz'],Los “Pofesionales”,7
3,3HDrX2OtSuXLW5dLR85uN3,53.0,['carnaval cadiz'],Los Que No Paran De Rajar,6
4,22mLrN5fkppmuUPsHx6i2G,59.0,"['classical harp', 'harp']",Vera Dulova,3


In [15]:
#Updating artist_refined to make it more convenient and not to confuse with 'id' column of track dataframe
artist_refined.rename(columns={'id':'id_artists'},inplace=True)
artist_refined.head(2)

Unnamed: 0,id_artists,followers,genres,name,popularity
0,0VLMVnVbJyJ4oyZs2L3Yl2,71.0,['carnaval cadiz'],Las Viudas De Los Bisabuelos,6
1,0dt23bs4w8zx154C5xdVyl,63.0,['carnaval cadiz'],Los De Capuchinos,5


In [16]:
#now lets check if the data stored in genre is in form of a list or string
artist_refined['genres'].values[0][0]

'['

As we can see 'genres' is not actually a list, it is stored as a string

In [17]:
#converting genre from string to a list 
artist_refined['genre_upd']=artist_refined['genres'].apply(lambda x: [re.sub(' ','_',i) for  i in  re.findall(r"'([^']*)'",x)])
artist_refined['genre_upd'].values[0][0]

'carnaval_cadiz'

we have successfully converted contents of genres from a string to list

now,replacing the contents of 'genres' with 'genre_upd'

In [18]:
artist_refined.drop(['genres'],axis=1,inplace=True)
artist_refined.rename(columns={'genre_upd':'genres'},inplace=True)
artist_refined.head()

Unnamed: 0,id_artists,followers,name,popularity,genres
0,0VLMVnVbJyJ4oyZs2L3Yl2,71.0,Las Viudas De Los Bisabuelos,6,[carnaval_cadiz]
1,0dt23bs4w8zx154C5xdVyl,63.0,Los De Capuchinos,5,[carnaval_cadiz]
2,0pGhoB99qpEJEsBQxgaskQ,64.0,Los “Pofesionales”,7,[carnaval_cadiz]
3,3HDrX2OtSuXLW5dLR85uN3,53.0,Los Que No Paran De Rajar,6,[carnaval_cadiz]
4,22mLrN5fkppmuUPsHx6i2G,59.0,Vera Dulova,3,"[classical_harp, harp]"


In [19]:
artist_refined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305595 entries, 0 to 305594
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id_artists  305595 non-null  object 
 1   followers   305590 non-null  float64
 2   name        305595 non-null  object 
 3   popularity  305595 non-null  int64  
 4   genres      305595 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 11.7+ MB


In [20]:
artist_refined.sort_values('popularity',inplace=True)
artist_refined.reset_index(drop=True,inplace=True)

In [21]:
artist_refined[artist_refined['popularity']<20].count()

id_artists    175450
followers     175447
name          175450
popularity    175450
genres        175450
dtype: int64

We are going to be dropping all artists with popularity less than 20

In [22]:
artist_refined.drop(artist_refined.index[range(175450)], inplace=True)

In [23]:
artist_refined.isnull().sum()

id_artists    0
followers     2
name          0
popularity    0
genres        0
dtype: int64

In [24]:
artist_refined.reset_index(drop=True,inplace=True)

In [25]:
#removing followers column as we dont require it
artist_refined.drop('followers',axis=1,inplace=True)

In [26]:
#exporting all values from artist_refined into a csv file 
#so that it is easier for people to see what all names aree present in it
artist_refined.to_csv('csv/artists_refined.csv',index=False)

After this , i manually removed some genres from artists_refined.csv which i deemed irrelevant  as such a thorough task would consume a lot of time if executed in python

In [27]:
#repeating the same process of converting string to list of strings in tracks
#as its the same case as before ie multiple values are stored as aconcatenated string instead of list of strings
track['artists'].values[0][0]

'['

In [28]:
track['artists_upd_1']=track['artists'].apply(lambda x: re.findall(r"'([^']*)'",x))
track['artists_upd_1'].values[0][0]

'Uli'

lets check if this worked for every artist

In [29]:
track[track['artists_upd_1'].apply(lambda x: not x)].head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_1
164,1xEEYhWxT4WhDQdxfPCT8D,Snake Rag,20,194533,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.708,0.361,...,-11.764,0,0.0441,0.994,0.883,0.103,0.902,105.695,4,[]
170,3rauXVLOOM5BlxWqUcDpkg,Chimes Blues,14,170827,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.546,0.189,...,-15.984,1,0.0581,0.996,0.908,0.339,0.554,80.318,4,[]
172,1UdqHVRFYMZKU2Q7xkLtYc,Pickin' On Your Baby,11,197493,0,"[""Clarence Williams' Blue Five""]",['6RuQvIr0t0otZHnAxXTGkm'],1923,0.52,0.153,...,-14.042,1,0.044,0.995,0.131,0.353,0.319,102.937,4,[]
174,0Vl2DO5U6FjgBpzCtBN3OA,Everybody Loves My Baby,10,152507,0,"[""Clarence Williams' Blue Five""]",['6RuQvIr0t0otZHnAxXTGkm'],1923,0.514,0.193,...,-13.92,0,0.238,0.996,0.199,0.248,0.665,180.674,4,[]
180,5SvyP1ZeJX1jA7AOZD08NA,Tears,10,187227,0,"[""King Oliver's Creole Jazz Band""]",['08Zk65toyJllap1MnzljxZ'],1923,0.359,0.357,...,-11.81,1,0.0511,0.994,0.819,0.29,0.753,205.053,4,[]


As we can observe, this didnt work for artists with an apostrophe in their stage names

Then lets repeat the same for artists that it didnt work for

In [30]:
track['id_artists']=track['id_artists'].apply(lambda x: [re.sub(' ','_',i) for  i in  re.findall(r"'([^']*)'",x)])
track

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_1
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.4450,...,-13.338,1,0.4510,0.674,0.744000,0.1510,0.1270,104.851,3,[Uli]
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.2630,...,-22.136,1,0.9570,0.797,0.000000,0.1480,0.6550,102.009,1,[Fernando Pessoa]
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.434,0.1770,...,-21.180,1,0.0512,0.994,0.021800,0.2120,0.4570,130.418,5,[Ignacio Corsini]
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.321,0.0946,...,-27.961,1,0.0504,0.995,0.918000,0.1040,0.3970,169.980,3,[Ignacio Corsini]
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],[3BiJGZsyX9sJchTqcSA7Su],1922,0.402,0.1580,...,-16.900,0,0.0390,0.989,0.130000,0.3110,0.1960,103.220,4,[Dick Haymes]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,['阿YueYue'],[1QLBXKM5GCpyQQSVMNZqrZ],2020-09-26,0.560,0.5180,...,-7.471,0,0.0292,0.785,0.000000,0.0648,0.2110,131.896,4,[阿YueYue]
586668,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,['ROLE MODEL'],[1dy5WNgIKQU6ezkpZs4y8z],2020-10-21,0.765,0.6630,...,-5.223,1,0.0652,0.141,0.000297,0.0924,0.6860,150.091,4,[ROLE MODEL]
586669,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,['FINNEAS'],[37M5pPGs6V1fchFJSgCguX],2020-09-02,0.535,0.3140,...,-12.823,0,0.0408,0.895,0.000150,0.0874,0.0663,145.095,4,[FINNEAS]
586670,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,"['Gentle Bones', 'Clara Benin']","[4jGPdu95icCKVF31CcFKbS, 5ebPSE9YI5aLeZ1Z2gkqjn]",2021-03-05,0.696,0.6150,...,-6.212,1,0.0345,0.206,0.000003,0.3050,0.4380,90.029,4,"[Gentle Bones, Clara Benin]"


In [31]:
track[track['artists_upd_1'].apply(lambda x: not x)]
track.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_1
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,[Uli]
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,...,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,[Fernando Pessoa]
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.434,0.177,...,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,[Ignacio Corsini]
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.321,0.0946,...,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,[Ignacio Corsini]
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],[3BiJGZsyX9sJchTqcSA7Su],1922,0.402,0.158,...,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,[Dick Haymes]


it looks like it didnt catch all of em

lets write another code to catch the ones that didnt happen(the ones w apostrophes)

and then we'll combine them both

In [32]:
track['artists_upd_2']=track['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
track.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_1,artists_upd_2
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,1,0.451,0.674,0.744,0.151,0.127,104.851,3,[Uli],[]
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,...,1,0.957,0.797,0.0,0.148,0.655,102.009,1,[Fernando Pessoa],[]
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.434,0.177,...,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,[Ignacio Corsini],[]
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],[5LiOoJbxVSAMkBS2fUm3X2],1922-03-21,0.321,0.0946,...,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,[Ignacio Corsini],[]
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],[3BiJGZsyX9sJchTqcSA7Su],1922,0.402,0.158,...,0,0.039,0.989,0.13,0.311,0.196,103.22,4,[Dick Haymes],[]


In [33]:
#checking if it did convert the artist names w apostrophe into list of artist names or not
track[track['artists_upd_1'].apply(lambda x: not x)].head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_1,artists_upd_2
164,1xEEYhWxT4WhDQdxfPCT8D,Snake Rag,20,194533,0,"[""King Oliver's Creole Jazz Band""]",[08Zk65toyJllap1MnzljxZ],1923,0.708,0.361,...,0,0.0441,0.994,0.883,0.103,0.902,105.695,4,[],[King Oliver's Creole Jazz Band]
170,3rauXVLOOM5BlxWqUcDpkg,Chimes Blues,14,170827,0,"[""King Oliver's Creole Jazz Band""]",[08Zk65toyJllap1MnzljxZ],1923,0.546,0.189,...,1,0.0581,0.996,0.908,0.339,0.554,80.318,4,[],[King Oliver's Creole Jazz Band]
172,1UdqHVRFYMZKU2Q7xkLtYc,Pickin' On Your Baby,11,197493,0,"[""Clarence Williams' Blue Five""]",[6RuQvIr0t0otZHnAxXTGkm],1923,0.52,0.153,...,1,0.044,0.995,0.131,0.353,0.319,102.937,4,[],[Clarence Williams' Blue Five]
174,0Vl2DO5U6FjgBpzCtBN3OA,Everybody Loves My Baby,10,152507,0,"[""Clarence Williams' Blue Five""]",[6RuQvIr0t0otZHnAxXTGkm],1923,0.514,0.193,...,0,0.238,0.996,0.199,0.248,0.665,180.674,4,[],[Clarence Williams' Blue Five]
180,5SvyP1ZeJX1jA7AOZD08NA,Tears,10,187227,0,"[""King Oliver's Creole Jazz Band""]",[08Zk65toyJllap1MnzljxZ],1923,0.359,0.357,...,1,0.0511,0.994,0.819,0.29,0.753,205.053,4,[],[King Oliver's Creole Jazz Band]


We have successfully created 2 disjoint sets of data where the union of both will make a column of artists where each artist name is an item in the list

In [34]:
#merging both artists_upd_1 and artists_upd_2
track['artists_upd']=np.where(track['artists_upd_1'].apply(lambda x: not x),track['artists_upd_2'],track['artists_upd_1'])
track.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_1,artists_upd_2,artists_upd
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],[45tIt06XoI0Iio4LBEVpls],1922-02-22,0.645,0.445,...,0.451,0.674,0.744,0.151,0.127,104.851,3,[Uli],[],[Uli]
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],[14jtPCOoNZwquk5wd9DxrY],1922-06-01,0.695,0.263,...,0.957,0.797,0.0,0.148,0.655,102.009,1,[Fernando Pessoa],[],[Fernando Pessoa]


Now we will drop both artists_upd_1 and artists_upd_2 and artists columns as we do not require it anymore,and rename artists_upd as artists

In [35]:
track.drop(['artists_upd_1','artists_upd_2','artists'],axis=1,inplace=True)
track.rename(columns={'artists_upd':'artists'},inplace=True)
#reordering columns for our convenience
track=track[['id','name','popularity','duration_ms','explicit','artists','id_artists','release_date','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature']]

In [40]:
#now we need to create our own song identifier because there are duplicates of the same song with different ids
#and appear different
track['artists_song']=track.apply(lambda row: str(row['artists'][0]) +" "+ str(row['name']) , axis = 1)
#sorting values for easier access
track.sort_values(['artists_song','release_date'], ascending = False, inplace = True)
track.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song
538433,3u1C6nWVRoP5F0w8gGrDL3,사랑의 미로,25,222380,0,[최진희],[1NSrAf8XJYJVgAXKoxaMet],1987-06-01,0.367,0.194,...,-19.057,1,0.04,0.617,6e-06,0.162,0.367,144.316,4,최진희 사랑의 미로
404349,1Mv4u308L16NZDZiD6HZCy,사랑은 힘든가봐,28,213440,0,[지수],[4c9QIMfEbIIynuaswyxGx9],2005-12-23,0.675,0.785,...,-5.026,0,0.028,0.379,0.0,0.353,0.623,103.008,4,지수 사랑은 힘든가봐


In [35]:

track.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song
538433,3u1C6nWVRoP5F0w8gGrDL3,사랑의 미로,25,222380,0,[최진희],[1NSrAf8XJYJVgAXKoxaMet],1987-06-01,0.367,0.194,...,-19.057,1,0.04,0.617,6e-06,0.162,0.367,144.316,4,최진희 사랑의 미로
404349,1Mv4u308L16NZDZiD6HZCy,사랑은 힘든가봐,28,213440,0,[지수],[4c9QIMfEbIIynuaswyxGx9],2005-12-23,0.675,0.785,...,-5.026,0,0.028,0.379,0.0,0.353,0.623,103.008,4,지수 사랑은 힘든가봐
210091,1jvoY322nxyKXq8OBhgmSY,어떡하죠,44,244360,0,[지선],[2Mo9NQaNCFCWSR5CnlfmbN],2011-10-13,0.606,0.341,...,-7.094,1,0.0513,0.779,0.0,0.144,0.294,135.667,4,지선 어떡하죠
449602,1KldYdWxVyDrSepd1NY2Qg,어떡하죠,39,195693,0,[지선],[2Mo9NQaNCFCWSR5CnlfmbN],2009-03-06,0.61,0.84,...,-4.543,1,0.0468,0.245,0.0,0.154,0.657,126.992,4,지선 어떡하죠
270610,2ghebdwe2pNXT4eL34T7pW,그아픔까지사랑한거야,32,237688,0,[조정현],[2WTpsPucygbYRnCnoEUkJQ],1989-06-15,0.447,0.215,...,-16.478,1,0.0272,0.568,1e-06,0.0649,0.177,71.979,4,조정현 그아픔까지사랑한거야


In [41]:
track[track['name']=='Adore You']

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song
86217,5AnCLGg35ziFOloEnXK4uu,Adore You,71,278747,0,[Miley Cyrus],[5YGY8feqx7naU7z4HrwZM6],2013-10-04,0.583,0.655,...,-5.407,1,0.0315,0.111,4e-06,0.113,0.201,119.759,4,Miley Cyrus Adore You
91884,3jjujdWJ72nww5eGnfs2E7,Adore You,88,207133,0,[Harry Styles],[6KImCVD70vtIoJWnq6nGn3],2019-12-13,0.676,0.771,...,-3.675,1,0.0483,0.0237,7e-06,0.102,0.569,99.048,4,Harry Styles Adore You
92524,1M4qEo4HE3PRaCOM7EXNJq,Adore You,74,207133,0,[Harry Styles],[6KImCVD70vtIoJWnq6nGn3],2019-12-06,0.676,0.771,...,-3.675,1,0.0483,0.0237,7e-06,0.102,0.569,99.048,4,Harry Styles Adore You


AHA! we've spotted one of the duplicates
now,lets remove the duplicates using artist_song

In [42]:
track.drop_duplicates('artists_song',inplace=True)
#now lets check again if there are still duplicates
track[track['name']=='Adore You']

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song
86217,5AnCLGg35ziFOloEnXK4uu,Adore You,71,278747,0,[Miley Cyrus],[5YGY8feqx7naU7z4HrwZM6],2013-10-04,0.583,0.655,...,-5.407,1,0.0315,0.111,4e-06,0.113,0.201,119.759,4,Miley Cyrus Adore You
91884,3jjujdWJ72nww5eGnfs2E7,Adore You,88,207133,0,[Harry Styles],[6KImCVD70vtIoJWnq6nGn3],2019-12-13,0.676,0.771,...,-3.675,1,0.0483,0.0237,7e-06,0.102,0.569,99.048,4,Harry Styles Adore You


We've successfully removed the duplicate values

In [44]:
track.reset_index(inplace=True)
track.sort_values(by =['index'],inplace=True)


In [88]:
track.reset_index(inplace=True,drop=True)
track.drop(['level_0','index'],axis=1,inplace=True)

track

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.4450,...,-13.338,1,0.4510,0.674,0.744000,0.1510,0.1270,104.851,3,Uli Carve
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.2630,...,-22.136,1,0.9570,0.797,0.000000,0.1480,0.6550,102.009,1,Fernando Pessoa Capítulo 2.16 - Banquero Anarq...
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.1770,...,-21.180,1,0.0512,0.994,0.021800,0.2120,0.4570,130.418,5,Ignacio Corsini Vivo para Quererte - Remasteri...
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,-27.961,1,0.0504,0.995,0.918000,0.1040,0.3970,169.980,3,Ignacio Corsini El Prisionero - Remasterizado
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922,0.402,0.1580,...,-16.900,0,0.0390,0.989,0.130000,0.3110,0.1960,103.220,4,Dick Haymes Lady of the Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523470,5rgu12WBIHQtvej2MdHSH0,云与海,50,258267,0,阿YueYue,1QLBXKM5GCpyQQSVMNZqrZ,2020-09-26,0.560,0.5180,...,-7.471,0,0.0292,0.785,0.000000,0.0648,0.2110,131.896,4,阿YueYue 云与海
523471,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,2020-10-21,0.765,0.6630,...,-5.223,1,0.0652,0.141,0.000297,0.0924,0.6860,150.091,4,ROLE MODEL blind
523472,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,FINNEAS,37M5pPGs6V1fchFJSgCguX,2020-09-02,0.535,0.3140,...,-12.823,0,0.0408,0.895,0.000150,0.0874,0.0663,145.095,4,FINNEAS What They'll Say About Us
523473,45XJsGpFTyzbzeWK8VzR8S,A Day At A Time,58,142003,0,Gentle Bones,4jGPdu95icCKVF31CcFKbS,2021-03-05,0.696,0.6150,...,-6.212,1,0.0345,0.206,0.000003,0.3050,0.4380,90.029,4,Gentle Bones A Day At A Time


In [47]:
#as now we have removed the duplicate values from track 
#and successfully converted strings that looked like lists to actual list of strings
#as shown below
track['artists'][0][0]

'Uli'

 we can explode this column 

In [48]:
artists_exploded=track[['artists','id_artists']].explode('artists')
artists_exploded

Unnamed: 0,artists,id_artists
0,Uli,[45tIt06XoI0Iio4LBEVpls]
1,Fernando Pessoa,[14jtPCOoNZwquk5wd9DxrY]
2,Ignacio Corsini,[5LiOoJbxVSAMkBS2fUm3X2]
3,Ignacio Corsini,[5LiOoJbxVSAMkBS2fUm3X2]
4,Dick Haymes,[3BiJGZsyX9sJchTqcSA7Su]
...,...,...
523471,ROLE MODEL,[1dy5WNgIKQU6ezkpZs4y8z]
523472,FINNEAS,[37M5pPGs6V1fchFJSgCguX]
523473,Gentle Bones,"[4jGPdu95icCKVF31CcFKbS, 5ebPSE9YI5aLeZ1Z2gkqjn]"
523473,Clara Benin,"[4jGPdu95icCKVF31CcFKbS, 5ebPSE9YI5aLeZ1Z2gkqjn]"


In [50]:
artists_exploded[artists_exploded['artists']=='King Gnu'][:5]

Unnamed: 0,artists,id_artists
499736,King Gnu,[6wxfx1yhyqjCPYwwxJktR2]
499805,King Gnu,[6wxfx1yhyqjCPYwwxJktR2]
499819,King Gnu,[6wxfx1yhyqjCPYwwxJktR2]
499822,King Gnu,[6wxfx1yhyqjCPYwwxJktR2]
499828,King Gnu,[6wxfx1yhyqjCPYwwxJktR2]


In [51]:
artist_refined.shape

(130145, 4)

In [52]:
artists_exploded_enriched=artists_exploded.merge(artist_refined,how='left',left_on='artists',right_on='name')
artists_exploded_enriched_nonull=artists_exploded_enriched[~artists_exploded_enriched.genres.isnull()]

In [53]:
artists_exploded_enriched_nonull.shape

(556473, 6)

In [55]:
artists_exploded_enriched_nonull[artists_exploded_enriched_nonull['id_artists_y'] =='6wxfx1yhyqjCPYwwxJktR2'][:5]

Unnamed: 0,artists,id_artists_x,id_artists_y,name,popularity,genres
656240,King Gnu,[6wxfx1yhyqjCPYwwxJktR2],6wxfx1yhyqjCPYwwxJktR2,King Gnu,70.0,"[j-pop, j-rock]"
656331,King Gnu,[6wxfx1yhyqjCPYwwxJktR2],6wxfx1yhyqjCPYwwxJktR2,King Gnu,70.0,"[j-pop, j-rock]"
656345,King Gnu,[6wxfx1yhyqjCPYwwxJktR2],6wxfx1yhyqjCPYwwxJktR2,King Gnu,70.0,"[j-pop, j-rock]"
656348,King Gnu,[6wxfx1yhyqjCPYwwxJktR2],6wxfx1yhyqjCPYwwxJktR2,King Gnu,70.0,"[j-pop, j-rock]"
656354,King Gnu,[6wxfx1yhyqjCPYwwxJktR2],6wxfx1yhyqjCPYwwxJktR2,King Gnu,70.0,"[j-pop, j-rock]"


group by id_artists_y and we're almost done

In [56]:
artists_genres_consolidated=artists_exploded_enriched_nonull.groupby(['id_artists_y','artists'])['genres'].apply(list).reset_index()
artists_genres_consolidated['consolidated_genre_list']=artists_genres_consolidated['genres'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [57]:
artists_genres_consolidated[artists_genres_consolidated['artists']=='King Gnu']

Unnamed: 0,id_artists_y,artists,genres,consolidated_genre_list
45480,6wxfx1yhyqjCPYwwxJktR2,King Gnu,"[[j-pop, j-rock], [j-pop, j-rock], [j-pop, j-r...","[j-pop, j-rock]"


In [58]:
artists_genres_consolidated['id_artists_y'][0]

'0001ZVMPt41Vwzt1zsmuzp'

In [59]:
artists_genres_consolidated

Unnamed: 0,id_artists_y,artists,genres,consolidated_genre_list
0,0001ZVMPt41Vwzt1zsmuzp,Thyro & Yumi,[[opm]],[opm]
1,000p4jMMhpEHq1h6PFCyO1,Anne Veski,"[[estonian_pop], [estonian_pop], [estonian_pop...",[estonian_pop]
2,001TRduQniM6dsJbQpMsbJ,Javier Limón,"[[flamenco], [flamenco]]",[flamenco]
3,001aJOc7CSQVo3XzoLG4DK,One Way,"[[classic_soul, electro, funk, motown, post-di...","[electro, post-disco, soul, quiet_storm, urban..."
4,0027wHZDQXpRll4ckwDGad,Disco Ensemble,"[[finnish_alternative_rock, suomi_rock], [finn...","[suomi_rock, finnish_alternative_rock]"
...,...,...,...,...
50957,7zwF847GE2hY5ApGSOLmBG,Fayza Ahmed,"[[classic_arab_pop, syrian_pop], [classic_arab...","[syrian_pop, classic_arab_pop]"
50958,7zwiFdY90oXzLh1Wz22oEq,Meditation Music Zone,"[[sleep], [sleep], [sleep], [sleep], [sleep], ...",[sleep]
50959,7zxFc10N9BP2lg73b8cwZ0,Salve Malak,"[[pop_rap_brasileiro], [pop_rap_brasileiro], [...",[pop_rap_brasileiro]
50960,7zyObVag8rUjItn71SkIrh,Survive Said The Prophet,"[[j-poprock, j-rock, japanese_alternative_rock]]","[j-poprock, j-rock, japanese_alternative_rock]"


In [60]:
#renaming the column for our convenience
artists_genres_consolidated.rename(columns={'id_artists_y':'id_artists'},inplace = True)

In [92]:
track.head(2)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,Uli Carve
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,Fernando Pessoa Capítulo 2.16 - Banquero Anarq...


In [62]:
track['id']=[''.join(map(str, l)) for l in track['id']]

In [63]:
#checking for accidental null values
artists_genres_consolidated.isnull().sum()

id_artists                 0
artists                    0
genres                     0
consolidated_genre_list    0
dtype: int64

In [64]:
#this takes a little bit of time
#we could explode the respective'artists' and 'id_artists' columns
#but this would result in the same song being repeated X times 
#where X is the number of artists who participated in the song
#so as to only include the primary artist (under whose discography the song directly falls into)
#we will proceed to use this loop
for i,r in track.iterrows():
#    print(r['artists'][0])
    r['artists']=r['artists'][0]
    r['id_artists']=r['id_artists'][0]
    track['artists'][i]=r['artists']
    track['id_artists'][i]=r['id_artists']

In [65]:
track['id_artists']

0         45tIt06XoI0Iio4LBEVpls
1         14jtPCOoNZwquk5wd9DxrY
2         5LiOoJbxVSAMkBS2fUm3X2
3         5LiOoJbxVSAMkBS2fUm3X2
4         3BiJGZsyX9sJchTqcSA7Su
                   ...          
523470    1QLBXKM5GCpyQQSVMNZqrZ
523471    1dy5WNgIKQU6ezkpZs4y8z
523472    37M5pPGs6V1fchFJSgCguX
523473    4jGPdu95icCKVF31CcFKbS
523474    0i4Qda0k4nf7jnNHmSNpYv
Name: id_artists, Length: 523475, dtype: object

In [66]:
track_refine=track.merge(artists_genres_consolidated[['id_artists','consolidated_genre_list']],on='id_artists',how='left')

In [95]:
track_refine[track_refine['popularity']==99]

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song,consolidated_genre_list,year


In [96]:
track_refine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205892 entries, 0 to 205891
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       205892 non-null  object 
 1   name                     205892 non-null  object 
 2   popularity               205892 non-null  int32  
 3   duration_ms              205892 non-null  int32  
 4   explicit                 205892 non-null  int32  
 5   artists                  205892 non-null  object 
 6   id_artists               205892 non-null  object 
 7   release_date             205892 non-null  object 
 8   danceability             205892 non-null  float32
 9   energy                   205892 non-null  float32
 10  key                      205892 non-null  int32  
 11  loudness                 205892 non-null  float32
 12  mode                     205892 non-null  int32  
 13  speechiness              205892 non-null  float32
 14  acou

In [70]:
#converting all float64 values to float32 values to decreasy memory consumption and time taken for processing
track_refine[track_refine.select_dtypes(np.float64).columns] = track_refine.select_dtypes(np.float64).astype(np.float32)
track_refine[track_refine.select_dtypes(np.int64).columns] = track_refine.select_dtypes(np.int64).astype(np.int32)


In [97]:
track_refine.isnull().sum()

id                         0
name                       0
popularity                 0
duration_ms                0
explicit                   0
artists                    0
id_artists                 0
release_date               0
danceability               0
energy                     0
key                        0
loudness                   0
mode                       0
speechiness                0
acousticness               0
instrumentalness           0
liveness                   0
valence                    0
tempo                      0
time_signature             0
artists_song               0
consolidated_genre_list    0
year                       0
dtype: int64

In [71]:
#lets remove all those null rows
track_refine=track_refine.dropna()

In [72]:
track_refine.reset_index(drop=True,inplace=True)

In [73]:
track_refine['popularity'].describe

<bound method NDFrame.describe of 0          0
1          0
2          0
3          0
4          0
          ..
448087    50
448088    72
448089    70
448090    58
448091    38
Name: popularity, Length: 448092, dtype: int32>

In [98]:
track_refine['year']=track_refine['release_date'].apply(lambda x: x.split('-')[0])
track_refine[track_refine['year']=='1990'].head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song,consolidated_genre_list,year
21,3VQTfJjEvun4d148dnrGsG,180 Grados,30,187773,0,Sexual Democracia,3m2dBZiLPZUffYga9ocES7,1990-02-27,0.797,0.495,...,0.0389,0.334,2.6e-05,0.194,0.96,144.460999,4,Sexual Democracia 180 Grados,"[nueva_cancion, chilean_rock]",1990
33,3HTcR5slQnXFp8BNIoMRXb,"Por siempre, mi buen amor",30,270600,0,"César ""Banana"" Pueyrredón",7MLo24EMMZaRATfZKpLS2i,1990-04-09,0.441,0.789,...,0.0403,0.14,7e-06,0.248,0.231,78.056,4,"César ""Banana"" Pueyrredón Por siempre, mi buen...",[rock_nacional],1990
200,0XeH22LjRDcaQZ0SxEp81h,"Symphony No. 6 (""Pastoral""), Op. 68, II Andant...",30,385227,0,Leopold Stokowski,52sDxFX9DvIxUupTy8f1yx,1990-01-01,0.161,0.0377,...,0.0457,0.913,0.0957,0.0983,0.109,78.291,4,"Leopold Stokowski Symphony No. 6 (""Pastoral""),...",[orchestral_performance],1990
244,5O9hw26GJHlO3XPZpSSTJk,Drug Train,30,221840,0,Social Distortion,16nn7kCHPWIB6uK09GQCNI,1990-03-27,0.508,0.903,...,0.0762,0.00345,0.008,0.0838,0.504,133.266998,4,Social Distortion Drug Train,"[n, punk, skate_punk, socal_pop_punk]",1990
262,27JODWXo4VNa6s7HqDL9yQ,Beyond Belief,30,304013,0,Petra,1LmsXfZSt1nutb8OCvt00G,1990-07-04,0.665,0.805,...,0.0355,0.103,0.0,0.311,0.855,137.244003,4,Petra Beyond Belief,"[christian_music, worship, christian_metal, cc...",1990


In [99]:
#sorting values by index to simplify our operations ahead
track_refine.sort_values(by=['popularity'],inplace=True)
track_refine.reset_index(drop=True,inplace=True)
track_refine[track_refine['popularity']==18].head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song,consolidated_genre_list,year


In [76]:
track_refine['year'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 448092 entries, 0 to 448091
Series name: year
Non-Null Count   Dtype 
--------------   ----- 
448092 non-null  object
dtypes: object(1)
memory usage: 3.4+ MB


In [77]:
#as we see 'year' in track_refine is an object, therefore we'll convert it to an int
track_refine['year']=pd.to_numeric(track_refine['year']).astype(dtype=np.int32)

In [78]:
#applying certain filters on the contects of track_refined to keep keep the recommendations relevant
track_refine=track_refine[(track_refine['popularity']>=30)]
track_refine.shape

(222900, 24)

In [79]:
#removing all the tracks which have words like 'Live at', 'Remastered', 'ver', 'Radio Edit' as more often than not,
#these are duplicates of songs already present in data , just a different version
#i tried to insert arguements values in the same function like how it should be done 
#but it kept on giving me errors 
track_refine.drop(index=track_refine[track_refine['name'].str.contains('Radio Edit', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('Remaster', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('radio edit', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('live at', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('remaster', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('Live at', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('Live At', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('ver', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('Remix', regex=False)].index,inplace=True)
track_refine.drop(index=track_refine[track_refine['name'].str.contains('remix', regex=False)].index,inplace=True)



In [80]:
track_refine.shape

(205892, 24)

In [100]:
track_refine[track_refine['artists']=='ZUTOMAYO'][:5]

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song,consolidated_genre_list,year
137254,3fqBoBlZnFGj0ah0airwQ7,君がいて水になる,47,285280,0,ZUTOMAYO,38WbKH6oKAZskBhqDFA8Uj,2018-11-14,0.601,0.554,...,0.0289,0.162,0.000669,0.184,0.611,147.903,4,ZUTOMAYO 君がいて水になる,[j-pop],2018
137260,2zJ2h8Soes4Y0Z8Ke9t1tu,雲丹と栗,47,272547,0,ZUTOMAYO,38WbKH6oKAZskBhqDFA8Uj,2018-11-14,0.8,0.706,...,0.0325,0.463,0.000433,0.397,0.566,104.000999,4,ZUTOMAYO 雲丹と栗,[j-pop],2018
137315,3FYWuYpw0KUTn1fP523jvx,またね幻,47,242453,0,ZUTOMAYO,38WbKH6oKAZskBhqDFA8Uj,2019-06-12,0.413,0.897,...,0.147,0.142,5e-06,0.157,0.617,90.448997,4,ZUTOMAYO またね幻,[j-pop],2019
142950,5oACPbDeeL3fgWDyngag4p,正義,48,271000,0,ZUTOMAYO,38WbKH6oKAZskBhqDFA8Uj,2019-05-22,0.63,0.946,...,0.0764,0.05,0.0,0.248,0.633,121.888,4,ZUTOMAYO 正義,[j-pop],2019
143319,1LojHMUAEwkYewkIk9JPWD,眩しいDNAだけ,48,227173,0,ZUTOMAYO,38WbKH6oKAZskBhqDFA8Uj,2019-02-27,0.714,0.898,...,0.108,0.267,1.1e-05,0.165,0.807,107.975998,4,ZUTOMAYO 眩しいDNAだけ,[j-pop],2019


In [89]:
track_refine.reset_index(drop=True,inplace=True)
track_refine.head()

In [102]:
track_refine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205892 entries, 0 to 205891
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       205892 non-null  object 
 1   name                     205892 non-null  object 
 2   popularity               205892 non-null  int32  
 3   duration_ms              205892 non-null  int32  
 4   explicit                 205892 non-null  int32  
 5   artists                  205892 non-null  object 
 6   id_artists               205892 non-null  object 
 7   release_date             205892 non-null  object 
 8   danceability             205892 non-null  float32
 9   energy                   205892 non-null  float32
 10  key                      205892 non-null  int32  
 11  loudness                 205892 non-null  float32
 12  mode                     205892 non-null  int32  
 13  speechiness              205892 non-null  float32
 14  acou

# 2. feature engineering

In [103]:
float_cols=track_refine.dtypes[track_refine.dtypes=='float32'].index.values

In [104]:
ohe_cols='popularity'

In [105]:
track_refine['popularity'].describe()

count    205892.000000
mean         44.020506
std          10.667296
min          30.000000
25%          35.000000
50%          42.000000
75%          51.000000
max          98.000000
Name: popularity, dtype: float64

In [106]:
#create 5 points bucket for popularity
track_refine['popularity_red']=track_refine['popularity'].apply(lambda x: int(x/5)).astype(dtype=np.int32)

In [90]:
track_refine.to_csv('csv/track_final.csv',index=False)

In [107]:
track_refine[track_refine['popularity']>=90]

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_song,consolidated_genre_list,year,popularity_red
205866,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,90,240400,0,The Neighbourhood,77SW9BnxLY8rJ0RciFqkHh,2013-04-19,0.612,0.807,...,0.0495,0.0177,0.101,0.398,124.053001,4,The Neighbourhood Sweater Weather,"[modern_alternative_rock, pop, shimmer_pop, mo...",2013,18
205867,079Ey5uxL04AKPQgVQwx5h,Baila Conmigo (with Rauw Alejandro),90,186088,0,Selena Gomez,0C8ZW7ezQVs4URX5aX7Kqx,2021-03-12,0.823,0.544,...,0.0215,0.00315,0.0966,0.664,149.903,4,Selena Gomez Baila Conmigo (with Rauw Alejandro),"[post-teen_pop, pop, dance_pop]",2021,18
205868,61KpQadow081I2AsbeLcsb,deja vu,90,215508,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-04-01,0.439,0.61,...,0.593,1.1e-05,0.341,0.172,181.087997,4,Olivia Rodrigo deja vu,"[post-teen_pop, pop]",2021,18
205869,6fRxMU4LWwyaSSowV441IU,Beautiful Mistakes (feat. Megan Thee Stallion),90,227395,0,Maroon 5,04gDigrS5kc9YWfZHwBETP,2021-03-03,0.713,0.676,...,0.0377,0.0,0.154,0.721,99.047997,4,Maroon 5 Beautiful Mistakes (feat. Megan Thee ...,"[pop, pop_rock]",2021,18
205870,4Oun2ylbjFKMPTiaSbbCih,WAP (feat. Megan Thee Stallion),90,187541,1,Cardi B,4kYSro6naA4h99UJvo89HB,2020-08-07,0.935,0.454,...,0.0194,0.0,0.0824,0.357,133.072998,4,Cardi B WAP (feat. Megan Thee Stallion),"[rap, post-teen_pop, pop, pop_rap]",2020,18
205871,2etHQJxIbV0soyPhelVs9Y,Best Friend (feat. Doja Cat),90,155884,1,Saweetie,6cK3NBO6uP7hh0oyuVELFl,2021-01-07,0.84,0.766,...,0.00302,4e-06,0.0684,0.402,94.017998,4,Saweetie Best Friend (feat. Doja Cat),"[trap_queen, post-teen_pop, pop, cali_rap, pop...",2021,18
205872,4MzXwWMhyBbmu6hOcLVD49,DÁKITI,90,205090,1,Bad Bunny,4q3ewBCX7sLwd24euuV69X,2020-11-27,0.731,0.573,...,0.401,5.2e-05,0.113,0.145,109.928001,4,Bad Bunny DÁKITI,"[reggaeton, latin, trap_latino]",2020,18
205873,54bFM56PmE4YLRnqpW6Tha,Therefore I Am,90,174321,0,Billie Eilish,6qqNVTkY8uBg9cP3Jd7DAH,2020-11-12,0.889,0.34,...,0.218,0.13,0.055,0.716,94.009003,4,Billie Eilish Therefore I Am,"[pop, electropop]",2020,18
205874,45bE4HXI0AwGZXfZtMp8JR,you broke me first,91,169266,0,Tate McRae,45dkTj5sMRSjrmBSBeiHym,2020-04-17,0.667,0.373,...,0.785,0.0,0.0906,0.0823,124.148003,4,Tate McRae you broke me first,"[post-teen_pop, pop, electropop, alt_z]",2020,18
205875,3aQem4jVGdhtg116TmJnHz,What’s Next,91,178154,1,Drake,3TVXtAsR1Inumwj472S9r4,2021-03-05,0.781,0.594,...,0.0136,0.0,0.162,0.0628,129.895004,4,Drake What’s Next,"[rap, pop_rap, canadian_pop, toronto_rap, cana...",2021,18


In [108]:
#simple funciton to create one hot encoded features
def ohe_prep(df,column,new_name):
    tf_df=pd.get_dummies(df[column],dtype=np.float32)
    feature_names=tf_df.columns
    tf_df.columns=[new_name+"|"+str(i) for i in feature_names]
    tf_df.reset_index(drop=True,inplace=True)
    return tf_df

In [109]:
def create_feature_set(df,float_cols):
    #tfidf genre lists
    tfidf=TfidfVectorizer(dtype=np.float32)
    tfidf_matrix=tfidf.fit_transform(df['consolidated_genre_list'].apply(lambda x: " ".join(x)))
    genre_df=pd.DataFrame(tfidf_matrix.toarray(),dtype=np.float32)
    genre_df.columns=['genre'+"|"+ i for i in tfidf.get_feature_names_out()]
    genre_df.reset_index(drop = True , inplace = True)
    
    #explicity_ohe=ohe_prep(df,'explicit','exp')
    
    year_ohe=ohe_prep(df,'year','year') * 0.5
    popularity_ohe=ohe_prep(df,'popularity_red','pop') * 0.15
    
    #scale float columns
    floats=df[float_cols].reset_index(drop=True).astype(np.float32)
    scaler=MinMaxScaler()
    floats_scaled=pd.DataFrame(scaler.fit_transform(floats),columns=floats.columns) * 0.2
    
    #concanenate all features
    final=pd.concat([genre_df,floats_scaled,popularity_ohe,year_ohe],axis=1)
    
    #add song id
    final['id']=df['id'].values
    
    return final

In [110]:
#this function consumes a lot of memory but it also compiles very fast considering the size of result dataframe
complete_feature_set=create_feature_set(track_refine,float_cols=float_cols)

In [111]:
complete_feature_set['name']=track_refine['name']
complete_feature_set['artists']=track_refine['artists']
complete_feature_set['id_artists']=track_refine['id_artists']
complete_feature_set

Unnamed: 0,genre|432hz,genre|48g,genre|_brasileira,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_hip_hop,genre|abstract_idm,genre|accordeon,genre|accordion,...,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,id,name,artists,id_artists
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5PsP2lVnRTrwKS1uurWpr0,Gli anni (96),883,6bMul6rmRS03x38tWKYifO
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3oKOMRuLixq7S3IuYINJKW,El Pájaro Azul,La Arrolladora Banda El Limón De Rene Camacho,5bSfBBCxY8QAk4Pifveisz
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3sPczv57uOYeYrGCTDjha2,はての浜(久米島),ネイチャー・サウンド・ギャラリー(自然音),48XPDscUY8YE5cAuAp7Zvr
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4EYepYZhmBDQuINDqpf3sj,"換到千般恨 - 電視劇 ""天蠶變"" 插曲",柳影虹,7j02dSlxLwrHGPKvAWP5ag
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2t07asmUebiYnqkxJVAmbe,Sensitive To Light,Rainbow,6SLAMfhOi7UJI0fMztaK0m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,7Bk0uXKk1uPT0XuQbpFzvs,Fiel,Los Legendarios,0n6sKrG0xKAf8xmdqeNGke
205888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,27OeeYzk6klgBh83TSvGMA,WITHOUT YOU,The Kid LAROI,2tIP7SsRs7vjIcLrU85W8J
205889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,4cG7HUWYHBV6R6tHn1gxrl,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,Riton,7i9j813KFoSBMldGqlh2Z1
205890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ


In [112]:
complete_feature_set.isnull().sum()
#checking for accidental null values

genre|432hz          0
genre|48g            0
genre|_brasileira    0
genre|_hip_hop       0
genre|a_cappella     0
                    ..
year|2021            0
id                   0
name                 0
artists              0
id_artists           0
Length: 4178, dtype: int64

In [113]:
complete_feature_set[complete_feature_set['artists']=='Red Velvet']

Unnamed: 0,genre|432hz,genre|48g,genre|_brasileira,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_hip_hop,genre|abstract_idm,genre|accordeon,genre|accordion,...,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,id,name,artists,id_artists
148821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,41qLzxymjkp0R5vl3REb1S,Be Natural,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
152404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,1rL3u6JsjQT9Cxg1oHTJEK,Would U,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
168874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,6koMMQlsRWBwHZXdtWxgUk,7월 7일 One Of These Nights,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
179050,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6XP9L7di5JnOc9WaeAW8oe,행복 (Happiness),Red Velvet,1z4g3DjTBBZKhvAroFlhOM
180716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,78HD9IN4cKE1MMHWeVJPWh,"Bad Boy - English Version, Bonus Track",Red Velvet,1z4g3DjTBBZKhvAroFlhOM
187562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5vCuawHQ8Poch1odz9JDpB,Ice Cream Cake,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
188843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,2cvi3IRTibXzmAm4f8P2HP,Zimzalabim,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
192551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,381g0b6QZxC13SzA2HRMIc,Power Up,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
192858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,64iDjAuWDogVhuoWhKklF9,Dumb Dumb,Red Velvet,1z4g3DjTBBZKhvAroFlhOM
199057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,5HiSc2ZCGn8L3cH3qSwzBT,러시안 룰렛 Russian Roulette,Red Velvet,1z4g3DjTBBZKhvAroFlhOM


In [114]:
complete_feature_set.tail()

Unnamed: 0,genre|432hz,genre|48g,genre|_brasileira,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_hip_hop,genre|abstract_idm,genre|accordeon,genre|accordion,...,year|2016,year|2017,year|2018,year|2019,year|2020,year|2021,id,name,artists,id_artists
205887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,7Bk0uXKk1uPT0XuQbpFzvs,Fiel,Los Legendarios,0n6sKrG0xKAf8xmdqeNGke
205888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,27OeeYzk6klgBh83TSvGMA,WITHOUT YOU,The Kid LAROI,2tIP7SsRs7vjIcLrU85W8J
205889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,4cG7HUWYHBV6R6tHn1gxrl,Friday (feat. Mufasa & Hypeman) - Dopamine Re-...,Riton,7i9j813KFoSBMldGqlh2Z1
205890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ
205891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,Masked Wolf,1uU7g3DNSbsu0QjSEqZtEd


In [115]:
complete_feature_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205892 entries, 0 to 205891
Columns: 4178 entries, genre|432hz to id_artists
dtypes: float32(4055), float64(119), object(4)
memory usage: 3.3+ GB


In [106]:
#storing dataset in pkl format because its convenient and fast
complete_feature_set.to_pickle('complete_feature_set.pkl')