# Playing Around with Kaggle Spotify Dataset

In this notebook, I explore the Spotify dataset of roughly 600k tracks collected by Yamac Eren Ay and hosted on Kaggle: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks. In particular, I will be looking at the tracks dataset (`tracks.csv`).

In [1]:
from ast import literal_eval
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the tracks table from tracks.csv.
# `release_date` is handed to the date parser, and every `artists` cell is run
# through literal_eval so it becomes a Python object rather than its text form.
tracks = pd.read_csv(
    'tracks.csv',
    parse_dates=['release_date'],
    converters={'artists': literal_eval},
)

In [3]:
# Keep only the tracks released on or after the cutoff date, ordered by
# release date.
# The rows are filtered first and sort_values builds the result, so the whole
# of `tracks` is never copied and `tracks` itself is left untouched.
START_DATE = datetime(2011, 1, 1)  # first release date kept in the analysis
tracks11 = (
    tracks
    .loc[tracks['release_date'] >= START_DATE]
    .sort_values(by=['release_date'])
)
tracks11.head(10)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
405273,5vq7gzERQyaD7hXPXojduk,Syahadah,37,371733,0,[Raihan],['0wWm1rOeaLQ4jkLpn8tmTt'],2011-01-01,0.332,0.119,4,-13.911,1,0.0337,0.865,0.0,0.155,0.374,99.641,4
156524,4uQjPCRQJF2QGTKh2vp45a,Go Go Wine,51,203461,0,[Vybz Kartel],['2NUz5P42WqkxilbI8ocN76'],2011-01-01,0.583,0.683,8,-5.45,1,0.131,0.182,0.0,0.385,0.496,99.805,4
156523,380hFJEcn7oP9LM3C5eCMN,Te Quiero,55,278147,0,[Ricardo Arjona],['0h1zs4CTlU9D2QtgPxptUD'],2011-01-01,0.717,0.635,2,-5.194,0,0.0298,0.0357,8e-06,0.0886,0.699,92.987,4
263499,3bQXzmklJTt1fUPxWfApPg,Nahy 2,32,298773,0,[Richard Muller],['2jldyAonkBDiBG0cNtLgZz'],2011-01-01,0.452,0.386,9,-8.728,1,0.0272,0.038,0.000302,0.188,0.227,75.233,4
351157,3m5WQZeRfYjm0KlvXqOOX8,Far l'amore,48,181693,0,"[Bob Sinclar, Raffaella Carrà]","['5YFS41yoX0YuFY39fq21oN', '6EVyI0S0b1Ld2nm37m...",2011-01-01,0.875,0.821,7,-5.029,1,0.227,0.0424,0.000247,0.291,0.679,127.995,4
351155,1CULOQmwFMft8fhXyg9VTQ,¿Y Ahora Que Hacemos?,41,184360,0,[Jarabe De Palo],['5B6H1Dq77AV1LZWrbNsuH5'],2011-01-01,0.822,0.641,1,-6.461,0,0.0293,0.653,0.00936,0.0996,0.401,114.024,4
568436,1F5u1Pgff0uoHOaL099i4b,Przyjdzie czas,31,219747,0,[Marek Barbasiewicz],['7pCkvkjUKty9IfxyTjFNR4'],2011-01-01,0.643,0.705,9,-9.745,0,0.1,0.319,0.0,0.0905,0.374,121.816,4
535567,0xvHuwgGbnUhLR6REbipOf,Sybil,29,277920,0,[Traktor],['01czh4LGSprbNEuornDtfu'],2011-01-01,0.63,0.938,2,-6.222,1,0.0754,0.0285,0.000169,0.512,0.0967,120.991,4
156529,6ihJx6CCY20eFi5SKsvfqs,Hechizo de Luna,40,292480,0,[Latin Sound],['5IXSOv8ECLeYpvP65Bk2Wf'],2011-01-01,0.791,0.519,8,-8.676,1,0.0468,0.65,0.000249,0.127,0.674,160.03,4
156512,4qan0qNCFAEu6A2hcwPETn,Wepa,58,242000,0,[Gloria Estefan],['5IFCkqu9J6xdWeYMk5I889'],2011-01-01,0.79,0.749,8,-6.504,1,0.297,0.0277,0.0,0.113,0.7,144.933,4


In [4]:
# Remove repeated tracks: rows that share both `name` and `id_artists`
# collapse to their first occurrence.
tracks11 = tracks11.drop_duplicates(
    subset=['name', 'id_artists'],
    keep='first',
)

# Print a summary of the de-duplicated frame
tracks11.info()

In [9]:
# Derive two convenience columns:
#   main_artist  - first entry of each track's `artists` value; missing (NaN)
#                  when that value has no first entry, instead of raising
#   duration_min - `duration_ms` converted from milliseconds to minutes
tracks11 = tracks11.assign(
    main_artist=tracks11['artists'].str.get(0),
    duration_min=tracks11['duration_ms'] / 60000,  # 60,000 ms per minute
)

# Preview the first rows only rather than rendering thousands of them
tracks11.head(10)

In [11]:
# Correlation matrix of the numeric columns.
# Non-numeric columns (ids, names, artist lists, release dates) are excluded
# explicitly: pandas 2.0+ no longer drops them silently, so a bare `corr()`
# raises on this frame.
# Note: duration_min is duration_ms / 60000, so that pair correlates at 1.0.
tracks11.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_min
popularity,1.0,-0.087383,0.187654,0.185874,-0.08521,-0.005272,0.152612,-0.012191,0.060636,0.075982,-0.380033,-0.116607,0.064486,-0.033676,0.012371,-0.087383
duration_ms,-0.087383,1.0,-0.053128,-0.093719,0.010715,0.01124,-0.00223,0.003521,-0.072998,-0.029087,0.064428,0.007311,-0.122416,0.010189,0.00537,1.0
explicit,0.187654,-0.053128,1.0,0.220619,0.024697,0.001424,0.052457,-0.064743,0.38081,-0.074656,-0.125353,-0.032691,-0.013181,-0.02012,0.046354,-0.053128
danceability,0.185874,-0.093719,0.220619,1.0,0.112954,0.016848,0.216085,-0.074756,0.198418,-0.162079,-0.172419,-0.17186,0.430494,-0.082643,0.183658,-0.093719
energy,-0.08521,0.010715,0.024697,0.112954,1.0,0.039432,0.702149,-0.087768,0.070786,-0.664034,-0.044028,0.182396,0.330386,0.183029,0.166912,0.010715
key,-0.005272,0.01124,0.001424,0.016848,0.039432,1.0,0.02717,-0.146386,0.023593,-0.021609,0.000187,-0.000798,0.027804,0.00775,0.009503,0.01124
loudness,0.152612,-0.00223,0.052457,0.216085,0.702149,0.02717,1.0,-0.046562,0.008379,-0.470585,-0.373114,0.048109,0.305283,0.154062,0.168401,-0.00223
mode,-0.012191,0.003521,-0.064743,-0.074756,-0.087768,-0.146386,-0.046562,1.0,-0.077089,0.080303,-0.016225,0.008792,-0.020742,0.004885,-0.027372,0.003521
speechiness,0.060636,-0.072998,0.38081,0.198418,0.070786,0.023593,0.008379,-0.077089,1.0,-0.048254,-0.098879,0.052972,0.070541,0.014084,0.037387,-0.072998
acousticness,0.075982,-0.029087,-0.074656,-0.162079,-0.664034,-0.021609,-0.470585,0.080303,-0.048254,1.0,-0.012642,-0.074889,-0.148802,-0.144357,-0.142685,-0.029087
