# Preprocessing - Part I

### Steps included:
* reading and cleaning songs data
* reading and cleaning artists data
* combining each song's information with its artist information

#### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

  set_matplotlib_formats('retina')


In [2]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

#### Loading Songs data

**Columns**

 Primary:
- id (Id of track generated by Spotify)

Numerical:
- acousticness (Ranges from 0 to 1)
- danceability (Ranges from 0 to 1)
- energy (Ranges from 0 to 1)
- duration_ms (Integer typically ranging from 200k to 300k)
- instrumentalness (Ranges from 0 to 1)
- valence (Ranges from 0 to 1)
- popularity (Ranges from 0 to 100)
- tempo (Float typically ranging from 50 to 150)
- liveness (Ranges from 0 to 1)
- loudness (Float typically ranging from -60 to 0)
- speechiness (Ranges from 0 to 1)
- year (Ranges from 1921 to 2020)

Dummy:
- mode (0 = Minor, 1 = Major)
- explicit (0 = No explicit content, 1 = Explicit content)

Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
- artists (List of artists mentioned)
- release_date (Date of release mostly in yyyy-mm-dd format, however precision of date may vary)
- name (Name of the song)

In [3]:
songs = pd.read_csv("data/songs.csv")
songs.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [4]:
songs.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
acousticness,169909.0,0.493214,0.376627,0.0,0.0945,0.492,0.888,0.996
danceability,169909.0,0.53815,0.175346,0.0,0.417,0.548,0.667,0.988
duration_ms,169909.0,231406.158973,121321.923219,5108.0,171040.0,208600.0,262960.0,5403500.0
energy,169909.0,0.488593,0.26739,0.0,0.263,0.481,0.71,1.0
explicit,169909.0,0.084863,0.278679,0.0,0.0,0.0,0.0,1.0
instrumentalness,169909.0,0.161937,0.309329,0.0,0.0,0.000204,0.0868,1.0
key,169909.0,5.200519,3.515257,0.0,2.0,5.0,8.0,11.0
liveness,169909.0,0.20669,0.176796,0.0,0.0984,0.135,0.263,1.0
loudness,169909.0,-11.370289,5.666765,-60.0,-14.47,-10.474,-7.118,3.855
mode,169909.0,0.708556,0.454429,0.0,0.0,1.0,1.0,1.0


#### Removing duplicate songs

In [5]:
songs.drop_duplicates(subset=["name", "artists"], keep='first')

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.9950,['Carl Woitschach'],0.708,158648,0.1950,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563000,10,0.1510,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.7790,1928
1,0.9940,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901000,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.6040,['Seweryn Goszczyński'],0.749,104300,0.2200,0,6L63VW0PibdM1HDSBoqnoM,0.000000,5,0.1190,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.9290,107.177,0.8800,1928
3,0.9950,['Francisco Canaro'],0.781,180760,0.1300,0,6M94FkXd15sOAOQYRnWPN8,0.887000,1,0.1110,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.7200,1928
4,0.9900,"['Frédéric Chopin', 'Vladimir Horowitz']",0.210,687733,0.2040,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908000,11,0.0980,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169903,0.2100,"['LEGADO 7', 'Junior H']",0.795,218501,0.5850,0,52Cpyvd2dKb6XRn313nH87,0.000001,8,0.1120,-4.451,1,Ojos De Maniaco,68,2020-02-28,0.0374,97.479,0.9340,2020
169904,0.1730,"['DripReport', 'Tyga']",0.875,163800,0.4430,1,4KppkflX7I3vJQk7urOJaS,0.000032,1,0.0891,-7.461,1,Skechers (feat. Tyga) - Remix,75,2020-05-15,0.1430,100.012,0.3060,2020
169905,0.0167,"['Leon Bridges', 'Terrace Martin']",0.719,167468,0.3850,0,1ehhGlTvjtHo2e4xJFB0SZ,0.031300,8,0.1110,-10.907,1,Sweeter (feat. Terrace Martin),64,2020-06-08,0.0403,128.000,0.2700,2020
169906,0.5380,"['Kygo', 'Oh Wonder']",0.514,180700,0.5390,0,52eycxprLhK3lPcRLbQiVk,0.002330,7,0.1080,-9.332,1,How Would I Know,70,2020-05-29,0.1050,123.700,0.1530,2020


#### Removing songs with popularity less than 30

In [6]:
songs.drop(songs[songs["popularity"] < 30].index, inplace=True, axis=0)
songs.sort_values(by="popularity", inplace=True, ascending=False)
songs.shape

(95867, 19)

In [7]:
songs["spotify_id"] = songs["id"]
songs["id"] = 0
songs.head(2)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,spotify_id
87942,0.00146,['The Weeknd'],0.514,200040,0.73,0,0,9.5e-05,1,0.0897,-5.934,1,Blinding Lights,100,2020-03-20,0.0598,171.005,0.334,2020,0VjIjW4GlUZAMYd2vXMi3b
87940,0.247,"['DaBaby', 'Roddy Ricch']",0.746,181733,0.69,1,0,0.0,11,0.101,-7.956,1,ROCKSTAR (feat. Roddy Ricch),99,2020-04-17,0.164,89.977,0.497,2020,7ytR5pFWmSjzHJIeQkgog4


#### Removing unnecessary characters from artist name

In [8]:
songs["artists"] = songs["artists"].map(lambda x: x.split(', ')[0])
songs["artists"] = songs["artists"].map(lambda x: x.replace(']', ""))
songs["artists"] = songs["artists"].map(lambda x: x.replace('[', ""))
songs["artists"] = songs["artists"].map(lambda x: x.replace('\'', ""))
songs["artists"] = songs["artists"].map(lambda x: x.replace('\(', ""))
songs["artists"] = songs["artists"].map(lambda x: x.replace('\)', ""))

In [9]:
cols_to_keep = ["id", "name", "artists", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness", "spotify_id"]
songs = songs[cols_to_keep]
songs.head()

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,spotify_id
87942,0,Blinding Lights,The Weeknd,0.00146,0.514,0.73,9.5e-05,0.0897,0.0598,0VjIjW4GlUZAMYd2vXMi3b
87940,0,ROCKSTAR (feat. Roddy Ricch),DaBaby,0.247,0.746,0.69,0.0,0.101,0.164,7ytR5pFWmSjzHJIeQkgog4
87949,0,death bed (coffee for your head) (feat. beabad...,Powfu,0.731,0.726,0.431,0.0,0.696,0.135,7eJMfftS33KTjuF7lTsMCx
87941,0,THE SCOTTS,THE SCOTTS,0.233,0.716,0.537,0.0,0.157,0.0514,39Yp9wwQiSRIDOvrVg7mbk
87844,0,Supalonely,BENEE,0.305,0.863,0.631,3e-05,0.123,0.0534,4nK5YrxbMGZstTLbvj6Gxw


In [10]:
for i in range(len(songs)):
    songs.iloc[i, 0]=i
songs.head()

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,spotify_id
87942,0,Blinding Lights,The Weeknd,0.00146,0.514,0.73,9.5e-05,0.0897,0.0598,0VjIjW4GlUZAMYd2vXMi3b
87940,1,ROCKSTAR (feat. Roddy Ricch),DaBaby,0.247,0.746,0.69,0.0,0.101,0.164,7ytR5pFWmSjzHJIeQkgog4
87949,2,death bed (coffee for your head) (feat. beabad...,Powfu,0.731,0.726,0.431,0.0,0.696,0.135,7eJMfftS33KTjuF7lTsMCx
87941,3,THE SCOTTS,THE SCOTTS,0.233,0.716,0.537,0.0,0.157,0.0514,39Yp9wwQiSRIDOvrVg7mbk
87844,4,Supalonely,BENEE,0.305,0.863,0.631,3e-05,0.123,0.0534,4nK5YrxbMGZstTLbvj6Gxw


### Loading Artist data

This file contains the audio features of each artist, resulted from the aggregation. The rows represent different artists, columns represent different audio features

In [11]:
artist = pd.read_csv("data/artist.csv")
artist.head()

Unnamed: 0,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,247260.0,0.386336,0.022717,0.287708,-14.205417,0.180675,115.9835,0.334433,38.0,5,1,12
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,33.076923,5,1,26
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.285714,0,1,7
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.444444,0,1,27
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,232428.111111,0.429333,0.037534,0.216111,-11.447222,0.086,120.329667,0.458667,42.555556,11,1,9


#### Keeping numerical columns & artist name

In [12]:
cols_to_keep = ["artists", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness"]
artist = artist[cols_to_keep]
artist.head()

Unnamed: 0,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness
0,"""Cats"" 1981 Original London Cast",0.575083,0.44275,0.386336,0.022717,0.287708,0.180675
1,"""Cats"" 1983 Broadway Cast",0.862538,0.441731,0.406808,0.081158,0.315215,0.176212
2,"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,0.286571,0.024593,0.325786,0.118514
3,"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,0.24577,0.073587,0.275481,0.1232
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.605444,0.437333,0.429333,0.037534,0.216111,0.086


In [13]:
artist.rename({"artists":"name"}, axis=1, inplace=True)
artist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27621 entries, 0 to 27620
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              27621 non-null  object 
 1   acousticness      27621 non-null  float64
 2   danceability      27621 non-null  float64
 3   energy            27621 non-null  float64
 4   instrumentalness  27621 non-null  float64
 5   liveness          27621 non-null  float64
 6   speechiness       27621 non-null  float64
dtypes: float64(6), object(1)
memory usage: 1.5+ MB


In [14]:
songs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95867 entries, 87942 to 45145
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                95867 non-null  int64  
 1   name              95867 non-null  object 
 2   artists           95867 non-null  object 
 3   acousticness      95867 non-null  float64
 4   danceability      95867 non-null  float64
 5   energy            95867 non-null  float64
 6   instrumentalness  95867 non-null  float64
 7   liveness          95867 non-null  float64
 8   speechiness       95867 non-null  float64
 9   spotify_id        95867 non-null  object 
dtypes: float64(6), int64(1), object(3)
memory usage: 8.0+ MB


In [15]:
songs["acousticness_artist"] = 0
songs["danceability_artist"] = 0
songs["energy_artist"] = 0
songs["instrumentalness_artist"] = 0
songs["liveness_artist"] = 0
songs["speechiness_artist"]= 0

### Combining artist information of each song and saving the data for Part II

In [16]:
for i in range(len(songs)):
    artist_name = songs.iloc[i, 2]
    mask = artist["name"].str.contains(artist_name, case=False, regex=False)
    if(len(artist[mask])):
        songs.iloc[i, 10] = artist[mask].iloc[0, 1]
        songs.iloc[i, 11] = artist[mask].iloc[0, 2]        
        songs.iloc[i, 12] = artist[mask].iloc[0, 3]        
        songs.iloc[i, 13] = artist[mask].iloc[0, 4]        
        songs.iloc[i, 14] = artist[mask].iloc[0, 5]        
        songs.iloc[i, 15] = artist[mask].iloc[0, 6]
        if (i % 1000 == 0):
            print(artist[mask].iloc[0, 0])

The Weeknd
Megan Thee Stallion
Demi Lovato
Post Malone
Jason Mraz
Iamjakehill
Robert Miles
Alfred Drake
The Notorious B.I.G.
George Michael
Richard White
The Spinners
Lil Tecca
Eminem
Kacey Musgraves
Killswitch Engage
Lionel Richie
Vulfpeck
Luther Vandross
East 17
dandelion hands
Big Gigantic
James Horner
Nightmares On Wax
Twenty One Pilots
Billy Crystal
Andrew Bird
Kip Moore
Alexis Korner's Blues Incorporated
Sabaton
Gregory Isaacs
Lonestar
Color Me Badd
Jungle
Roger Waters
Aerosmith
Dougie MacLean
Creedence Clearwater Revival
Wolfgang Amadeus Mozart
Luny Tunes
The Bolshoi
Marco Barrientos
Howard Wales
The Cranberries
Aretha Franklin
Leonard Cohen
Eddie Lovette
Johnny Cash
Depeche Mode
Cindy Bullens
South Park Mexican
Paul Carrack
JJ Heller
José Alfredo Jimenez
Caifanes
Modern Talking
PornoGraffitti
Jimi Hendrix
Gary Moore
Yolanda Del Rio
Pavement
Tribal Seeds
Chuck Riley
The String Cheese Incident
Jacques Dutronc
Dark Angel
A$AP NAST
Al Dexter & His Troopers
Death In June
A Perfect C

In [17]:
songs.to_csv("data/song_artist.csv", index=False)