# Final Project: Spotify Top Songs Analysis

## Data Loading & Cleaning

### Import Dependencies

In [1]:
import pandas as pd
import os
import numpy as np
import psycopg2
import datetime
import requests

### Import Scraped Spotify Data

Spotify Top 200 Songs Chart Data

Country: United States

Date Range: 1/1/2020 - date of scraping (11/11/21)

In [2]:
# Load the scraped Spotify Top 200 chart rows from the Resources folder
scraped_csv_path = "./Resources/spotifytop200.csv"
scrape_df = pd.read_csv(scraped_csv_path)
scrape_df

Unnamed: 0,song_id,song_url,song,artist,date,position,streams
0,4iN16F8JtVxG2UTzp3avGl,https://open.spotify.com/track/4iN16F8JtVxG2UT...,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic",2020-01_01,1,1331303
1,0gplL1WMoJ6iYaPgMCL0gX,https://open.spotify.com/track/0gplL1WMoJ6iYaP...,Easy On Me,Adele,2020-01_01,2,1111983
2,5Z9KJZvQzH6PFmb8SNkxuk,https://open.spotify.com/track/5Z9KJZvQzH6PFmb...,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,2020-01_01,3,1073301
3,00Blm7zeNqgYLPtW6zg8cj,https://open.spotify.com/track/00Blm7zeNqgYLPt...,One Right Now (with The Weeknd),Post Malone,2020-01_01,4,966020
4,5PjdY0CKGZdEuoNab3yDmX,https://open.spotify.com/track/5PjdY0CKGZdEuoN...,STAY (with Justin Bieber),The Kid LAROI,2020-01_01,5,961443
...,...,...,...,...,...,...,...
136195,3ee8Jmje8o58CHK66QrVC2,https://open.spotify.com/track/3ee8Jmje8o58CHK...,SAD!,XXXTENTACION,2021-11_11,196,216178
136196,1dIWPXMX4kRHj6Dt2DStUQ,https://open.spotify.com/track/1dIWPXMX4kRHj6D...,Chosen (feat. Ty Dolla $ign),"Blxst, Tyga",2021-11_11,197,215608
136197,2SAqBLGA283SUiwJ3xOUVI,https://open.spotify.com/track/2SAqBLGA283SUiw...,Laugh Now Cry Later (feat. Lil Durk),Drake,2021-11_11,198,214908
136198,3GVkPk8mqxz0itaAriG1L7,https://open.spotify.com/track/3GVkPk8mqxz0ita...,Everybody Dies In Their Nightmares,XXXTENTACION,2021-11_11,199,214824


### Spotify API - Get Tracks' Audio Features

- acousticness (float): confidence measure from 0-1 whether track is acoustic (1.0 highest confidence is acoustic)
- analysis_url (string): url to access full audio analysis of track
- danceability (float): how suitable track is for dancing based on combination of musical elements (tempo, rhythm stability, beat strength, overall regularity), scale of 0-1 (1.0 most danceable) 
- duration_ms (integer): duration of track in milliseconds
- energy (float): perceptual measure from 0-1 of intensity and activity (based on dynamic range, perceived loudness, timbre, onset rate, general entropy)
- id (string): spotify ID for track
- instrumentalness (float): predicts whether track contains no vocals with 1.0 being greatest likelihood the track contains no vocals
- key (integer): key the track is in (uses integer notation, 0 = C, 1 = C♯/D♭, 2 = D, 3 = D♯/E♭, ...)
- liveness (float): detects presence of audience in recording, with higher values representing increased probability track was performed live
- loudness (float): overall loudness of track in decibels (dB), averaged across entire track, with values typically ranging between -60 and 0 dB
- mode (integer): modality (major=1 or minor=0) of track, type of scale from which melodic content is derived
- speechiness (float): detects presence of spoken words in track; the more exclusively speech-like the recording, the closer the value is to 1.0
- tempo (float): overall estimated tempo of track in beats per minute (BPM)
- time_signature (integer): estimated overall time signature (meter) of track
- track_href (string): link to web API endpoint for full details of track
- type (string): object type
- uri (string): Spotify URI for track
- valence (float): measure from 0-1 describing musical positiveness conveyed by track

#### Get list of song ids to use for API

In [3]:
# Pull the song_id column out of the scraped dataframe as its own series,
# then tally how many chart rows each id accounts for
id_column = scrape_df['song_id']
song_ids = pd.Series(id_column)
song_ids.value_counts()

2GiJYvgVaD2HtM8GqD9EgQ    681
6SRsiMl7w1USE4mFqrOhHC    681
2HbKqm4o0w5wEeEFXm2sD4    681
2xLMifQCjDGFmkHkpNLD9h    681
4eL3XeuGaEoVT8ttDh3hwY    681
                         ... 
6y6bUurXB0FfhddEU5Qch0    681
0wXuerDYiBnERgIpbb3JBR    681
6K4t31amVTZDgR3sKmwUJJ    681
7kDUspsoYfLkWnZR7qwHZl    681
4qu63nuBpdn0qHUHuObEj1    681
Name: song_id, Length: 200, dtype: int64

In [4]:
# Keep only the first occurrence of every song id
repeated_ids = song_ids.duplicated()
song_ids = song_ids[~repeated_ids]
song_ids.value_counts()

2iUmqdfGZcHIhS3b9E9EWq    1
3Vi5XqYrmQgOYBajMWSvCi    1
5wANPM4fQCJwkGd4rN57mH    1
6SRsiMl7w1USE4mFqrOhHC    1
6O5TrlFWTYvznd9fMC0VvU    1
                         ..
0y60itmpH0aPKsFiGxmtnh    1
4pvb0WLRcMtbPGmtejJJ6y    1
2HSmyk2qMN8WQjuGhaQgCk    1
2EjXfH91m7f8HiJN1yQg97    1
2gpWyfu7eZ01zzncHpxOtA    1
Name: song_id, Length: 200, dtype: int64

In [5]:
# Convert to list & split into two halves.
# list() accepts both a Series and a plain list, so this cell can be re-run safely
# (Series.tolist() fails on a second run because song_ids is already a list by then).
song_ids = list(song_ids)
midpoint = len(song_ids) // 2
song_ids1 = song_ids[:midpoint]
print(len(song_ids1))
song_ids2 = song_ids[midpoint:]
print(len(song_ids2))

100
100


In [6]:
# Build one comma-separated id string per half for the API request
song_ids1_str, song_ids2_str = (','.join(half) for half in (song_ids1, song_ids2))
print(song_ids1_str)
print(song_ids2_str)

4iN16F8JtVxG2UTzp3avGl,0gplL1WMoJ6iYaPgMCL0gX,5Z9KJZvQzH6PFmb8SNkxuk,00Blm7zeNqgYLPtW6zg8cj,5PjdY0CKGZdEuoNab3yDmX,02MWAaffLxlfxAUY7c5dvx,2BcMwX1MPV6ZHP4tUT9uq6,4R67rQNSbbsR4TdUVOIdez,04S1pkp1VaIqjg8zZqknR5,0k1WUmIRnG3xU6fvvDVfRG,6f5ExP43esnvdKPddwKXJH,3Vi5XqYrmQgOYBajMWSvCi,50nfwKoDiSYg8zOCREWAm5,4ZtFanR9U6ndgddUvNcjcG,3QFInJAm9eyaho5vBzxInN,07MDkzWARZaLEdKxo6yArG,3DarAbFujv6eYNliUTyqtz,08F16baYbciTva9P4BvpiI,4RVwu0g32PAqgUiJoXsdF8,7rglLriMNBPAyuJOMGwi39,6Uj1ctrBOjOas8xZXGqKk4,3rmo8F54jFF8OgYsqTxm5d,37y7iDayfwm3WXn5BiAoRk,08SB2OtZkaliju77WYEKxk,29TPjc8wxfz4XMn21O7VsZ,40iJIUlhi6renaREYGeIDS,2QjOHCTQ1Jl3zawyYOpxh6,5CZ40GBx1sQ9agT82CLQCT,0e8nrvls4Qqv5Rfa2UhqmO,1SC5rEoYDGUK4NfG82494W,3NqBxTOMCJ3zW9CIP51td4,4yNoUQkYf1QF8iYlEzNynH,4XvcHTUfIlWfyJTRG0aqlo,0WSEq9Ko4kFPt8yo3ICd6T,15HMh4yxdf4wyxSZSlOgGZ,7MAibcTli4IisCtbHKrGMh,5nujrmhLynf4yMoMtj8AQF,18vXApRmJSgQ6wG2ll9AOg,2B4GHvToeLTOBB4QLzW3Ni,2gpWyfu7eZ01zzncHpxOtA,4SqWKzw0CbA05TGszDgMlc,0k4d5YPDr1r7FX77VdqWez,3Wrjm47oTz2sjIgck11l5e,62vpWI1CHwF

#### Spotify API

In [7]:
# Import access tokens
from config import client_id, client_secret

In [8]:
# Request an access token from the Spotify accounts service (client_credentials grant)
auth_url = 'https://accounts.spotify.com/api/token'
auth_response = requests.post(
    auth_url,
    data={
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    },
    # Without a timeout a stalled connection would block the notebook indefinitely
    timeout=30,
)
# Fail here with the HTTP status instead of a KeyError on 'access_token' below
auth_response.raise_for_status()
auth_response_data = auth_response.json()
access_token = auth_response_data['access_token']

In [9]:
# Bearer-token Authorization header sent with every GET request
headers = {'Authorization': f'Bearer {access_token}'}

In [10]:
# Set up API request: audio-features endpoint; each request appends one song id to this URL
base_url = 'https://api.spotify.com/v1/audio-features/'

In [11]:
# GET request for list 1: one audio-features call per song id
data = []
for song_id in song_ids1:
    # timeout keeps a single stalled call from hanging the whole loop
    response = requests.get(base_url + song_id, headers=headers, timeout=30)
    # The parsed body is stored whether or not the call succeeded
    data.append(response.json())

In [12]:
# GET request for list 2: append to the results collected for list 1
for song_id in song_ids2:
    # timeout keeps a single stalled call from hanging the whole loop
    response = requests.get(base_url + song_id, headers=headers, timeout=30)
    # The parsed body is stored whether or not the call succeeded
    data.append(response.json())

In [13]:
# Check results: preview the first two responses rather than dumping the whole list
data[:2]

[{'danceability': 0.627,
  'energy': 0.618,
  'key': 2,
  'loudness': -8.529,
  'mode': 1,
  'speechiness': 0.0437,
  'acousticness': 0.0558,
  'instrumentalness': 0,
  'liveness': 0.351,
  'valence': 0.848,
  'tempo': 82.03,
  'type': 'audio_features',
  'id': '4iN16F8JtVxG2UTzp3avGl',
  'uri': 'spotify:track:4iN16F8JtVxG2UTzp3avGl',
  'track_href': 'https://api.spotify.com/v1/tracks/4iN16F8JtVxG2UTzp3avGl',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4iN16F8JtVxG2UTzp3avGl',
  'duration_ms': 197443,
  'time_signature': 4},
 {'danceability': 0.604,
  'energy': 0.366,
  'key': 5,
  'loudness': -7.519,
  'mode': 1,
  'speechiness': 0.0282,
  'acousticness': 0.578,
  'instrumentalness': 0,
  'liveness': 0.133,
  'valence': 0.13,
  'tempo': 141.981,
  'type': 'audio_features',
  'id': '0gplL1WMoJ6iYaPgMCL0gX',
  'uri': 'spotify:track:0gplL1WMoJ6iYaPgMCL0gX',
  'track_href': 'https://api.spotify.com/v1/tracks/0gplL1WMoJ6iYaPgMCL0gX',
  'analysis_url': 'https://api.spotify.

In [22]:
# Convert results to pandas dataframe
features_df = pd.DataFrame(data)

# Responses that carry no 'id' (e.g. failed API calls) would otherwise become
# NaN-indexed rows with no feature values, so report and drop them before indexing
missing_id = features_df['id'].isna()
if missing_id.any():
    print(f"Dropping {missing_id.sum()} response(s) without an 'id'")
features_df = features_df[~missing_id].set_index('id')
features_df

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature,error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4iN16F8JtVxG2UTzp3avGl,0.627,0.618,2.0,-8.529,1.0,0.0437,0.0558,0.000000,0.3510,0.8480,82.030,audio_features,spotify:track:4iN16F8JtVxG2UTzp3avGl,https://api.spotify.com/v1/tracks/4iN16F8JtVxG...,https://api.spotify.com/v1/audio-analysis/4iN1...,197443.0,4.0,
0gplL1WMoJ6iYaPgMCL0gX,0.604,0.366,5.0,-7.519,1.0,0.0282,0.5780,0.000000,0.1330,0.1300,141.981,audio_features,spotify:track:0gplL1WMoJ6iYaPgMCL0gX,https://api.spotify.com/v1/tracks/0gplL1WMoJ6i...,https://api.spotify.com/v1/audio-analysis/0gpl...,224695.0,4.0,
5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,10.0,-7.395,0.0,0.0672,0.0221,0.000000,0.0476,0.8920,150.087,audio_features,spotify:track:5Z9KJZvQzH6PFmb8SNkxuk,https://api.spotify.com/v1/tracks/5Z9KJZvQzH6P...,https://api.spotify.com/v1/audio-analysis/5Z9K...,212353.0,4.0,
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1.0,-4.806,1.0,0.0530,0.0361,0.000000,0.0755,0.6880,97.014,audio_features,spotify:track:00Blm7zeNqgYLPtW6zg8cj,https://api.spotify.com/v1/tracks/00Blm7zeNqgY...,https://api.spotify.com/v1/audio-analysis/00Bl...,193507.0,4.0,
5PjdY0CKGZdEuoNab3yDmX,0.591,0.764,1.0,-5.484,1.0,0.0483,0.0383,0.000000,0.1030,0.4780,169.928,audio_features,spotify:track:5PjdY0CKGZdEuoNab3yDmX,https://api.spotify.com/v1/tracks/5PjdY0CKGZdE...,https://api.spotify.com/v1/audio-analysis/5Pjd...,141806.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3ee8Jmje8o58CHK66QrVC2,0.740,0.613,8.0,-4.880,1.0,0.1450,0.2580,0.003720,0.1230,0.4730,75.023,audio_features,spotify:track:3ee8Jmje8o58CHK66QrVC2,https://api.spotify.com/v1/tracks/3ee8Jmje8o58...,https://api.spotify.com/v1/audio-analysis/3ee8...,166606.0,4.0,
1dIWPXMX4kRHj6Dt2DStUQ,0.571,0.767,2.0,-5.160,1.0,0.2870,0.3360,0.000000,0.0809,0.6050,93.421,audio_features,spotify:track:1dIWPXMX4kRHj6Dt2DStUQ,https://api.spotify.com/v1/tracks/1dIWPXMX4kRH...,https://api.spotify.com/v1/audio-analysis/1dIW...,161684.0,4.0,
2SAqBLGA283SUiwJ3xOUVI,0.761,0.518,0.0,-8.871,1.0,0.1340,0.2440,0.000035,0.1070,0.5220,133.976,audio_features,spotify:track:2SAqBLGA283SUiwJ3xOUVI,https://api.spotify.com/v1/tracks/2SAqBLGA283S...,https://api.spotify.com/v1/audio-analysis/2SAq...,261493.0,4.0,
3GVkPk8mqxz0itaAriG1L7,0.734,0.570,7.0,-7.066,0.0,0.1330,0.8470,0.000021,0.1120,0.6890,129.953,audio_features,spotify:track:3GVkPk8mqxz0itaAriG1L7,https://api.spotify.com/v1/tracks/3GVkPk8mqxz0...,https://api.spotify.com/v1/audio-analysis/3GVk...,95467.0,4.0,


### Clean datasets

#### Clean features_df

In [23]:
# Remove unnecessary columns.
# errors='ignore' because 'error' is only present when at least one API response
# contained that key; it also keeps the cell safe to re-run.
features_df = features_df.drop(
    columns=['type', 'uri', 'track_href', 'analysis_url', 'error'],
    errors='ignore',
)
features_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4iN16F8JtVxG2UTzp3avGl,0.627,0.618,2.0,-8.529,1.0,0.0437,0.0558,0.0,0.351,0.848,82.03,197443.0,4.0
0gplL1WMoJ6iYaPgMCL0gX,0.604,0.366,5.0,-7.519,1.0,0.0282,0.578,0.0,0.133,0.13,141.981,224695.0,4.0
5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,10.0,-7.395,0.0,0.0672,0.0221,0.0,0.0476,0.892,150.087,212353.0,4.0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1.0,-4.806,1.0,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0
5PjdY0CKGZdEuoNab3yDmX,0.591,0.764,1.0,-5.484,1.0,0.0483,0.0383,0.0,0.103,0.478,169.928,141806.0,4.0


In [25]:
# Replace the numeric values in the 'key' column with key names
key_names = [
    'C', 'C♯/D♭', 'D', 'D♯/E♭', 'E', 'F',
    'F♯/G♭', 'G', 'G♯/A♭', 'A', 'A♯/B♭', 'B',
]
key_lookup = {float(number): name for number, name in enumerate(key_names)}

# Only translate while the column is still numeric: mapping the already-translated
# names a second time (cell re-run) would silently turn every value into NaN
if pd.api.types.is_numeric_dtype(features_df['key']):
    features_df['key'] = features_df['key'].map(key_lookup)
features_df['key'].value_counts()

C♯/D♭    34
C        27
D        18
G♯/A♭    18
B        15
G        14
F♯/G♭    13
A        13
F        12
E        11
A♯/B♭     8
D♯/E♭     7
Name: key, dtype: int64

In [26]:
# Replace values in 'mode' column with 'major' or 'minor'
mode_labels = {1: 'major', 0: 'minor'}

# Only translate while the column is still numeric: mapping the already-translated
# labels a second time (cell re-run) would silently turn every value into NaN
if pd.api.types.is_numeric_dtype(features_df['mode']):
    features_df['mode'] = features_df['mode'].map(mode_labels)
features_df['mode'].value_counts()

major    125
minor     65
Name: mode, dtype: int64

In [27]:
# Check data types ('key' and 'mode' hold text labels after the mapping above, so they show as object)
features_df.dtypes

danceability        float64
energy              float64
key                  object
loudness            float64
mode                 object
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms         float64
time_signature      float64
dtype: object

In [28]:
# Check 'time_signature' values: count how many tracks have each value
features_df['time_signature'].value_counts()

4.0    177
3.0     11
5.0      1
1.0      1
Name: time_signature, dtype: int64

In [29]:
# Treat 'time_signature' as a categorical label instead of a continuous number
features_df = features_df.astype({'time_signature': 'category'})
features_df.dtypes

danceability         float64
energy               float64
key                   object
loudness             float64
mode                  object
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
duration_ms          float64
time_signature      category
dtype: object

In [30]:
# Cleaned features_df: preview the first rows
features_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4iN16F8JtVxG2UTzp3avGl,0.627,0.618,D,-8.529,major,0.0437,0.0558,0.0,0.351,0.848,82.03,197443.0,4.0
0gplL1WMoJ6iYaPgMCL0gX,0.604,0.366,F,-7.519,major,0.0282,0.578,0.0,0.133,0.13,141.981,224695.0,4.0
5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,A♯/B♭,-7.395,minor,0.0672,0.0221,0.0,0.0476,0.892,150.087,212353.0,4.0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0
5PjdY0CKGZdEuoNab3yDmX,0.591,0.764,C♯/D♭,-5.484,major,0.0483,0.0383,0.0,0.103,0.478,169.928,141806.0,4.0


#### Clean scrape_df

In [32]:
# Set index of scrape_df to 'song_id'
# NOTE: set_index moves 'song_id' out of the columns, so running this cell a second
# time raises KeyError
scrape_df = scrape_df.set_index('song_id')
scrape_df

Unnamed: 0_level_0,song_url,song,artist,date,position,streams
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4iN16F8JtVxG2UTzp3avGl,https://open.spotify.com/track/4iN16F8JtVxG2UT...,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic",2020-01_01,1,1331303
0gplL1WMoJ6iYaPgMCL0gX,https://open.spotify.com/track/0gplL1WMoJ6iYaP...,Easy On Me,Adele,2020-01_01,2,1111983
5Z9KJZvQzH6PFmb8SNkxuk,https://open.spotify.com/track/5Z9KJZvQzH6PFmb...,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,2020-01_01,3,1073301
00Blm7zeNqgYLPtW6zg8cj,https://open.spotify.com/track/00Blm7zeNqgYLPt...,One Right Now (with The Weeknd),Post Malone,2020-01_01,4,966020
5PjdY0CKGZdEuoNab3yDmX,https://open.spotify.com/track/5PjdY0CKGZdEuoN...,STAY (with Justin Bieber),The Kid LAROI,2020-01_01,5,961443
...,...,...,...,...,...,...
3ee8Jmje8o58CHK66QrVC2,https://open.spotify.com/track/3ee8Jmje8o58CHK...,SAD!,XXXTENTACION,2021-11_11,196,216178
1dIWPXMX4kRHj6Dt2DStUQ,https://open.spotify.com/track/1dIWPXMX4kRHj6D...,Chosen (feat. Ty Dolla $ign),"Blxst, Tyga",2021-11_11,197,215608
2SAqBLGA283SUiwJ3xOUVI,https://open.spotify.com/track/2SAqBLGA283SUiw...,Laugh Now Cry Later (feat. Lil Durk),Drake,2021-11_11,198,214908
3GVkPk8mqxz0itaAriG1L7,https://open.spotify.com/track/3GVkPk8mqxz0ita...,Everybody Dies In Their Nightmares,XXXTENTACION,2021-11_11,199,214824


In [33]:
# Check data types ('date' and 'streams' are still object here; they are converted in the next cell)
scrape_df.dtypes

song_url    object
song        object
artist      object
date        object
position     int64
streams     object
dtype: object

In [40]:
# Update 'date' column as datetime (the scraped values use the form YYYY-MM_DD)
scrape_df['date'] = pd.to_datetime(scrape_df['date'], format="%Y-%m_%d")

# Update 'streams' column to numerical: strip thousands separators, then convert the
# whole column in one vectorized call. The previous `.apply(pd.to_numeric, 1)` converted
# row by row and passed `1` positionally as Series.apply's deprecated `convert_dtype`.
scrape_df['streams'] = pd.to_numeric(scrape_df['streams'].replace({',': ''}, regex=True))

In [41]:
# Cleaned scrape_df: preview the first rows
scrape_df.head()

Unnamed: 0_level_0,song_url,song,artist,date,position,streams
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4iN16F8JtVxG2UTzp3avGl,https://open.spotify.com/track/4iN16F8JtVxG2UT...,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic",2020-01-01,1,1331303
0gplL1WMoJ6iYaPgMCL0gX,https://open.spotify.com/track/0gplL1WMoJ6iYaP...,Easy On Me,Adele,2020-01-01,2,1111983
5Z9KJZvQzH6PFmb8SNkxuk,https://open.spotify.com/track/5Z9KJZvQzH6PFmb...,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,2020-01-01,3,1073301
00Blm7zeNqgYLPtW6zg8cj,https://open.spotify.com/track/00Blm7zeNqgYLPt...,One Right Now (with The Weeknd),Post Malone,2020-01-01,4,966020
5PjdY0CKGZdEuoNab3yDmX,https://open.spotify.com/track/5PjdY0CKGZdEuoN...,STAY (with Justin Bieber),The Kid LAROI,2020-01-01,5,961443


#### Create new dataframe for compiled data
Consolidate the chart rows into one row per song (total streams, rankings, etc.), then merge the result with features_df.

In [None]:
# First-pass consolidation (the original cell was an incomplete assignment and a SyntaxError).
# One row per song, aggregated over all chart rows, then joined with the audio features.
# NOTE(review): the aggregates below are a starting point - adjust as the analysis requires.
song_df = (
    scrape_df
    .groupby(level='song_id')
    .agg(
        song=('song', 'first'),
        artist=('artist', 'first'),
        total_streams=('streams', 'sum'),
        highest_position=('position', 'min'),  # smallest position number = best rank
        days_charted=('date', 'nunique'),
    )
    # features_df is indexed by the same Spotify ids, so an index join lines the rows up
    .join(features_df, how='left')
)
song_df.head()

### Load Dataset

Spotify Top 200 Songs Charts data (Global, Weekly)

From Kaggle, Data = Jan. 1, 2021 through August 16, 2021
https://www.kaggle.com/sashankpillai/spotify-top-200-charts-20202021/version/2

In [2]:
# Load the Kaggle Spotify Top 200 dataset, using its 'Index' column as the row index
kaggle_csv_path = "./Resources/spotify_dataset.csv"
song_df = pd.read_csv(kaggle_csv_path, index_col='Index')
song_df

Unnamed: 0_level_0,Highest Charting Position,Number of Times Charted,Week of Highest Charting,Song Name,Streams,Artist,Artist Followers,Song ID,Genre,Release Date,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration (ms),Valence,Chord
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,8,2021-07-23--2021-07-30,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",2017-12-08,...,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
2,2,3,2021-07-23--2021-07-30,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],2021-07-09,...,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
3,1,11,2021-06-25--2021-07-02,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],2021-05-21,...,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A
4,3,5,2021-07-02--2021-07-09,Bad Habits,37799456,Ed Sheeran,83293380,6PQ88X9TkUIAUIZJHW2upE,"['pop', 'uk pop']",2021-06-25,...,0.808,0.897,-3.712,0.0348,0.0469,0.364,126.026,231041,0.591,B
5,5,1,2021-07-23--2021-07-30,INDUSTRY BABY (feat. Jack Harlow),33948454,Lil Nas X,5473565,27NovPIUIRrOZoCHxABJwK,"['lgbtq+ hip hop', 'pop rap']",2021-07-23,...,0.736,0.704,-7.409,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,195,1,2019-12-27--2020-01-03,New Rules,4630675,Dua Lipa,27167675,2ekn2ttSfGqwhhate0LSR0,"['dance pop', 'pop', 'uk pop']",2017-06-02,...,0.762,0.7,-6.021,0.0694,0.00261,0.153,116.073,209320,0.608,A
1553,196,1,2019-12-27--2020-01-03,Cheirosa - Ao Vivo,4623030,Jorge & Mateus,15019109,2PWjKmjyTZeDpmOUa3a5da,"['sertanejo', 'sertanejo universitario']",2019-10-11,...,0.528,0.87,-3.123,0.0851,0.24,0.333,152.37,181930,0.714,B
1554,197,1,2019-12-27--2020-01-03,Havana (feat. Young Thug),4620876,Camila Cabello,22698747,1rfofaqEpACxVEHIZBJe6W,"['dance pop', 'electropop', 'pop', 'post-teen ...",2018-01-12,...,0.765,0.523,-4.333,0.03,0.184,0.132,104.988,217307,0.394,D
1555,198,1,2019-12-27--2020-01-03,Surtada - Remix Brega Funk,4607385,"Dadá Boladão, Tati Zaqui, OIK",208630,5F8ffc8KWKNawllr5WsW0r,"['brega funk', 'funk carioca']",2019-09-25,...,0.832,0.55,-7.026,0.0587,0.249,0.182,154.064,152784,0.881,F


### Clean Dataset

#### Clean Column Names

In [3]:
# Helper to list all columns
def list_columns(df):
    """Print every column label of ``df``, one per line (prints nothing if there are none)."""
    labels = list(df.columns)
    if labels:
        print(*labels, sep='\n')
# Print the column names of song_df as loaded
list_columns(song_df)

Highest Charting Position
Number of Times Charted
Week of Highest Charting
Song Name
Streams
Artist
Artist Followers
Song ID
Genre
Release Date
Weeks Charted
Popularity
Danceability
Energy
Loudness
Speechiness
Acousticness
Liveness
Tempo
Duration (ms)
Valence
Chord


In [4]:
# Swap every space in the column names for an underscore
song_df = song_df.rename(columns=lambda label: label.replace(' ', '_'))
list_columns(song_df)

Highest_Charting_Position
Number_of_Times_Charted
Week_of_Highest_Charting
Song_Name
Streams
Artist
Artist_Followers
Song_ID
Genre
Release_Date
Weeks_Charted
Popularity
Danceability
Energy
Loudness
Speechiness
Acousticness
Liveness
Tempo
Duration_(ms)
Valence
Chord


#### Handle null/whitespace values

In [5]:
# Check for null data (counts only NaN/None per column; blank strings are handled in the next cell)
song_df.isnull().sum()

Highest_Charting_Position    0
Number_of_Times_Charted      0
Week_of_Highest_Charting     0
Song_Name                    0
Streams                      0
Artist                       0
Artist_Followers             0
Song_ID                      0
Genre                        0
Release_Date                 0
Weeks_Charted                0
Popularity                   0
Danceability                 0
Energy                       0
Loudness                     0
Speechiness                  0
Acousticness                 0
Liveness                     0
Tempo                        0
Duration_(ms)                0
Valence                      0
Chord                        0
dtype: int64

In [6]:
# Treat cells that are empty or contain only whitespace as missing values
blank_pattern = r'^\s*$'
song_df = song_df.replace(to_replace=blank_pattern, value=np.nan, regex=True)
song_df.isnull().sum()

Highest_Charting_Position     0
Number_of_Times_Charted       0
Week_of_Highest_Charting      0
Song_Name                     0
Streams                       0
Artist                        0
Artist_Followers             11
Song_ID                      11
Genre                        11
Release_Date                 11
Weeks_Charted                 0
Popularity                   11
Danceability                 11
Energy                       11
Loudness                     11
Speechiness                  11
Acousticness                 11
Liveness                     11
Tempo                        11
Duration_(ms)                11
Valence                      11
Chord                        11
dtype: int64

In [7]:
# Discard every row that has at least one missing value
song_df = song_df.dropna(axis=0, how='any')
song_df

Unnamed: 0_level_0,Highest_Charting_Position,Number_of_Times_Charted,Week_of_Highest_Charting,Song_Name,Streams,Artist,Artist_Followers,Song_ID,Genre,Release_Date,...,Danceability,Energy,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration_(ms),Valence,Chord
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,8,2021-07-23--2021-07-30,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",2017-12-08,...,0.714,0.8,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B
2,2,3,2021-07-23--2021-07-30,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],2021-07-09,...,0.591,0.764,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db
3,1,11,2021-06-25--2021-07-02,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],2021-05-21,...,0.563,0.664,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A
4,3,5,2021-07-02--2021-07-09,Bad Habits,37799456,Ed Sheeran,83293380,6PQ88X9TkUIAUIZJHW2upE,"['pop', 'uk pop']",2021-06-25,...,0.808,0.897,-3.712,0.0348,0.0469,0.364,126.026,231041,0.591,B
5,5,1,2021-07-23--2021-07-30,INDUSTRY BABY (feat. Jack Harlow),33948454,Lil Nas X,5473565,27NovPIUIRrOZoCHxABJwK,"['lgbtq+ hip hop', 'pop rap']",2021-07-23,...,0.736,0.704,-7.409,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,195,1,2019-12-27--2020-01-03,New Rules,4630675,Dua Lipa,27167675,2ekn2ttSfGqwhhate0LSR0,"['dance pop', 'pop', 'uk pop']",2017-06-02,...,0.762,0.7,-6.021,0.0694,0.00261,0.153,116.073,209320,0.608,A
1553,196,1,2019-12-27--2020-01-03,Cheirosa - Ao Vivo,4623030,Jorge & Mateus,15019109,2PWjKmjyTZeDpmOUa3a5da,"['sertanejo', 'sertanejo universitario']",2019-10-11,...,0.528,0.87,-3.123,0.0851,0.24,0.333,152.37,181930,0.714,B
1554,197,1,2019-12-27--2020-01-03,Havana (feat. Young Thug),4620876,Camila Cabello,22698747,1rfofaqEpACxVEHIZBJe6W,"['dance pop', 'electropop', 'pop', 'post-teen ...",2018-01-12,...,0.765,0.523,-4.333,0.03,0.184,0.132,104.988,217307,0.394,D
1555,198,1,2019-12-27--2020-01-03,Surtada - Remix Brega Funk,4607385,"Dadá Boladão, Tati Zaqui, OIK",208630,5F8ffc8KWKNawllr5WsW0r,"['brega funk', 'funk carioca']",2019-09-25,...,0.832,0.55,-7.026,0.0587,0.249,0.182,154.064,152784,0.881,F


#### Fix Data Types

In [8]:
# Break the "start--end" week range into separate start and end columns,
# then remove the combined column
week_column = 'Week_of_Highest_Charting'
boundary_columns = [week_column + '_start', week_column + '_end']
week_bounds = song_df[week_column].str.split(pat="--", expand=True)
song_df[boundary_columns] = week_bounds
song_df = song_df.drop(columns=week_column)
song_df.head()

Unnamed: 0_level_0,Highest_Charting_Position,Number_of_Times_Charted,Song_Name,Streams,Artist,Artist_Followers,Song_ID,Genre,Release_Date,Weeks_Charted,...,Loudness,Speechiness,Acousticness,Liveness,Tempo,Duration_(ms),Valence,Chord,Week_of_Highest_Charting_start,Week_of_Highest_Charting_end
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,8,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",2017-12-08,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,-4.808,0.0504,0.127,0.359,134.002,211560,0.589,B,2021-07-23,2021-07-30
2,2,3,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],2021-07-09,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,-5.484,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db,2021-07-23,2021-07-30
3,1,11,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],2021-05-21,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,-5.044,0.154,0.335,0.0849,166.928,178147,0.688,A,2021-06-25,2021-07-02
4,3,5,Bad Habits,37799456,Ed Sheeran,83293380,6PQ88X9TkUIAUIZJHW2upE,"['pop', 'uk pop']",2021-06-25,2021-07-23--2021-07-30\n2021-07-16--2021-07-23...,...,-3.712,0.0348,0.0469,0.364,126.026,231041,0.591,B,2021-07-02,2021-07-09
5,5,1,INDUSTRY BABY (feat. Jack Harlow),33948454,Lil Nas X,5473565,27NovPIUIRrOZoCHxABJwK,"['lgbtq+ hip hop', 'pop rap']",2021-07-23,2021-07-23--2021-07-30,...,-7.409,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb,2021-07-23,2021-07-30


In [9]:
# Check data types before conversion
song_df.dtypes

Highest_Charting_Position          int64
Number_of_Times_Charted            int64
Song_Name                         object
Streams                           object
Artist                            object
Artist_Followers                  object
Song_ID                           object
Genre                             object
Release_Date                      object
Weeks_Charted                     object
Popularity                        object
Danceability                      object
Energy                            object
Loudness                          object
Speechiness                       object
Acousticness                      object
Liveness                          object
Tempo                             object
Duration_(ms)                     object
Valence                           object
Chord                             object
Week_of_Highest_Charting_start    object
Week_of_Highest_Charting_end      object
dtype: object

In [10]:
# Update 'Week_of_Highest_Charting_start', 'Week_of_Highest_Charting_end', 'Release_Date' columns as datetime data type
song_df['Week_of_Highest_Charting_start'] = pd.to_datetime(song_df['Week_of_Highest_Charting_start'])
song_df['Week_of_Highest_Charting_end'] = pd.to_datetime(song_df['Week_of_Highest_Charting_end'])
song_df['Release_Date'] = pd.to_datetime(song_df['Release_Date'], format="%Y-%m-%d")

# Change 'Streams', 'Artist_Followers', 'Duration_(ms)' columns to numerical data type.
# pd.to_numeric converts a whole column in one vectorized call; the previous
# `.apply(pd.to_numeric, 1)` converted row by row and passed `1` positionally as
# Series.apply's deprecated `convert_dtype`.
song_df['Streams'] = pd.to_numeric(song_df['Streams'].replace({',': ''}, regex=True))
song_df['Artist_Followers'] = pd.to_numeric(song_df['Artist_Followers'])
song_df['Duration_(ms)'] = pd.to_numeric(song_df['Duration_(ms)'])

# Change song component columns to float data type
float_columns = [
    'Popularity',
    'Danceability',
    'Energy',
    'Loudness',
    'Speechiness',
    'Acousticness',
    'Liveness',
    'Tempo',
    'Valence',
]
song_df = song_df.astype({column: float for column in float_columns})

song_df.dtypes

Highest_Charting_Position                  int64
Number_of_Times_Charted                    int64
Song_Name                                 object
Streams                                    int64
Artist                                    object
Artist_Followers                           int64
Song_ID                                   object
Genre                                     object
Release_Date                      datetime64[ns]
Weeks_Charted                             object
Popularity                               float64
Danceability                             float64
Energy                                   float64
Loudness                                 float64
Speechiness                              float64
Acousticness                             float64
Liveness                                 float64
Tempo                                    float64
Duration_(ms)                              int64
Valence                                  float64
Chord               

#### Translate 'Weeks_Charted' column into numerical data (vs. list of dates)

In [11]:
# Check 'Weeks_Charted' column values (each value is a newline-separated list of week ranges)
song_df['Weeks_Charted'].value_counts()

2020-12-18--2020-12-25                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [12]:
# Build a helper dataframe from 'Weeks_Charted':
#   1. split each string on newlines so every piece lands in its own column
#   2. label those columns 'week_0', 'week_1', ...
#   3. append 'number_weeks_charted', the count of non-null week cells per row
#      (the callable receives the frame before the new column is attached,
#      so only the week_* columns are counted)
weeks_charted_df = (
    song_df['Weeks_Charted']
    .str.split(pat="\n", expand=True)
    .add_prefix('week_')
    .assign(number_weeks_charted=lambda weeks: weeks.count(axis=1))
)
weeks_charted_df

Unnamed: 0_level_0,week_0,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,week_9,...,week_133,week_134,week_135,week_136,week_137,week_138,week_139,week_140,week_141,number_weeks_charted
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2021-07-23--2021-07-30,2021-07-16--2021-07-23,2021-07-09--2021-07-16,2021-07-02--2021-07-09,2021-06-25--2021-07-02,2021-06-18--2021-06-25,2021-06-11--2021-06-18,2021-06-04--2021-06-11,,,...,,,,,,,,,,8
2,2021-07-23--2021-07-30,2021-07-16--2021-07-23,2021-07-09--2021-07-16,,,,,,,,...,,,,,,,,,,3
3,2021-07-23--2021-07-30,2021-07-16--2021-07-23,2021-07-09--2021-07-16,2021-07-02--2021-07-09,2021-06-25--2021-07-02,2021-06-18--2021-06-25,2021-06-11--2021-06-18,2021-06-04--2021-06-11,2021-05-28--2021-06-04,2021-05-21--2021-05-28,...,,,,,,,,,,11
4,2021-07-23--2021-07-30,2021-07-16--2021-07-23,2021-07-09--2021-07-16,2021-07-02--2021-07-09,2021-06-25--2021-07-02,,,,,,...,,,,,,,,,,5
5,2021-07-23--2021-07-30,,,,,,,,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,2019-12-27--2020-01-03,,,,,,,,,,...,,,,,,,,,,1
1553,2019-12-27--2020-01-03,,,,,,,,,,...,,,,,,,,,,1
1554,2019-12-27--2020-01-03,,,,,,,,,,...,,,,,,,,,,1
1555,2019-12-27--2020-01-03,,,,,,,,,,...,,,,,,,,,,1


In [13]:
# Add 'number_weeks_charted' column to song_df instead of 'Weeks_Charted' column.
# .assign() returns a new frame (values are aligned on the index) rather than
# writing into the existing one, and errors='ignore' lets the drop succeed when
# 'Weeks_Charted' has already been removed, so re-running this cell no longer
# raises a KeyError.
song_df = (
    song_df
    .assign(number_weeks_charted=weeks_charted_df['number_weeks_charted'])
    .drop(columns='Weeks_Charted', errors='ignore')
)
song_df.head()

Unnamed: 0_level_0,Highest_Charting_Position,Number_of_Times_Charted,Song_Name,Streams,Artist,Artist_Followers,Song_ID,Genre,Release_Date,Popularity,...,Speechiness,Acousticness,Liveness,Tempo,Duration_(ms),Valence,Chord,Week_of_Highest_Charting_start,Week_of_Highest_Charting_end,number_weeks_charted
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,8,Beggin',48633449,Måneskin,3377762,3Wrjm47oTz2sjIgck11l5e,"['indie rock italiano', 'italian pop']",2017-12-08,100.0,...,0.0504,0.127,0.359,134.002,211560,0.589,B,2021-07-23,2021-07-30,8
2,2,3,STAY (with Justin Bieber),47248719,The Kid LAROI,2230022,5HCyWlXZPP0y6Gqq8TgA20,['australian hip hop'],2021-07-09,99.0,...,0.0483,0.0383,0.103,169.928,141806,0.478,C#/Db,2021-07-23,2021-07-30,3
3,1,11,good 4 u,40162559,Olivia Rodrigo,6266514,4ZtFanR9U6ndgddUvNcjcG,['pop'],2021-05-21,99.0,...,0.154,0.335,0.0849,166.928,178147,0.688,A,2021-06-25,2021-07-02,11
4,3,5,Bad Habits,37799456,Ed Sheeran,83293380,6PQ88X9TkUIAUIZJHW2upE,"['pop', 'uk pop']",2021-06-25,98.0,...,0.0348,0.0469,0.364,126.026,231041,0.591,B,2021-07-02,2021-07-09,5
5,5,1,INDUSTRY BABY (feat. Jack Harlow),33948454,Lil Nas X,5473565,27NovPIUIRrOZoCHxABJwK,"['lgbtq+ hip hop', 'pop rap']",2021-07-23,96.0,...,0.0615,0.0203,0.0501,149.995,212000,0.894,D#/Eb,2021-07-23,2021-07-30,1


#### Final Checks/Cleaning of Dataframe

In [14]:
# Check the data type of every column in song_df as a final sanity check
song_df.dtypes

Highest_Charting_Position                  int64
Number_of_Times_Charted                    int64
Song_Name                                 object
Streams                                    int64
Artist                                    object
Artist_Followers                           int64
Song_ID                                   object
Genre                                     object
Release_Date                      datetime64[ns]
Popularity                               float64
Danceability                             float64
Energy                                   float64
Loudness                                 float64
Speechiness                              float64
Acousticness                             float64
Liveness                                 float64
Tempo                                    float64
Duration_(ms)                              int64
Valence                                  float64
Chord                                     object
Week_of_Highest_Char

In [15]:
# Target column layout: identifiers, chart statistics, artist/release details,
# then the audio features
ordered_columns = [
    'Song_ID', 'Song_Name', 'Artist',
    'Number_of_Times_Charted', 'Highest_Charting_Position',
    'Week_of_Highest_Charting_start', 'Week_of_Highest_Charting_end',
    'number_weeks_charted',
    'Streams', 'Artist_Followers', 'Genre', 'Release_Date', 'Popularity',
    'Acousticness', 'Danceability', 'Energy', 'Liveness', 'Loudness',
    'Speechiness', 'Valence', 'Duration_(ms)', 'Tempo', 'Chord',
]

# Select the columns in that order, strip the parentheses from the duration
# column's name, then lowercase every column name
song_df = (
    song_df[ordered_columns]
    .rename(columns={'Duration_(ms)': 'Duration_ms'})
    .rename(columns=str.lower)
)
song_df.head()

Unnamed: 0_level_0,song_id,song_name,artist,number_of_times_charted,highest_charting_position,week_of_highest_charting_start,week_of_highest_charting_end,number_weeks_charted,streams,artist_followers,...,acousticness,danceability,energy,liveness,loudness,speechiness,valence,duration_ms,tempo,chord
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3Wrjm47oTz2sjIgck11l5e,Beggin',Måneskin,8,1,2021-07-23,2021-07-30,8,48633449,3377762,...,0.127,0.714,0.8,0.359,-4.808,0.0504,0.589,211560,134.002,B
2,5HCyWlXZPP0y6Gqq8TgA20,STAY (with Justin Bieber),The Kid LAROI,3,2,2021-07-23,2021-07-30,3,47248719,2230022,...,0.0383,0.591,0.764,0.103,-5.484,0.0483,0.478,141806,169.928,C#/Db
3,4ZtFanR9U6ndgddUvNcjcG,good 4 u,Olivia Rodrigo,11,1,2021-06-25,2021-07-02,11,40162559,6266514,...,0.335,0.563,0.664,0.0849,-5.044,0.154,0.688,178147,166.928,A
4,6PQ88X9TkUIAUIZJHW2upE,Bad Habits,Ed Sheeran,5,3,2021-07-02,2021-07-09,5,37799456,83293380,...,0.0469,0.808,0.897,0.364,-3.712,0.0348,0.591,231041,126.026,B
5,27NovPIUIRrOZoCHxABJwK,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,1,5,2021-07-23,2021-07-30,1,33948454,5473565,...,0.0203,0.736,0.704,0.0501,-7.409,0.0615,0.894,212000,149.995,D#/Eb


### Upload to PostgreSQL database

In [16]:
from getpass import getpass
from urllib.parse import quote

from sqlalchemy import create_engine

# Set up connection to database.
# The password is prompted for at run time so it never appears in the notebook.
password = getpass('Enter database password.')

# Percent-encode the password before placing it in the URL: characters such as
# '@', '/', ':' or '%' would otherwise be read as URL delimiters and the
# connection string would be mis-parsed. safe='' encodes every reserved character.
escaped_password = quote(password, safe='')
engine = create_engine(f'postgresql://postgres:{escaped_password}@localhost:5432/project_spotify_db')

Enter database password.········


In [20]:
# Write song_df to the "top_songs" table through the engine created above.
# NOTE(review): if_exists='append' adds rows to an existing table, so running
# this cell more than once presumably inserts the same songs again — confirm
# the table is empty (or has a key constraint) before re-running.
song_df.to_sql(
    name='top_songs',
    con=engine,
    if_exists='append',
)