# Final Project: Spotify Top Songs Analysis

## Data Loading & Cleaning

### Import Dependencies

In [1]:
import pandas as pd
import os
import numpy as np
import psycopg2
import datetime
import requests

### Import Scraped Spotify Data

Spotify Top 200 Songs Chart Data

Country: United States

Date Range: 1/1/2020 - date of scraping (11/11/21)

In [2]:
# Read in scraped data
scrape_df = pd.read_csv("./Resources/spotifytop200.csv")
scrape_df

Unnamed: 0,song_id,song_url,song,artist,date,position,streams
0,4iN16F8JtVxG2UTzp3avGl,https://open.spotify.com/track/4iN16F8JtVxG2UT...,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic",2020-01_01,1,1331303
1,0gplL1WMoJ6iYaPgMCL0gX,https://open.spotify.com/track/0gplL1WMoJ6iYaP...,Easy On Me,Adele,2020-01_01,2,1111983
2,5Z9KJZvQzH6PFmb8SNkxuk,https://open.spotify.com/track/5Z9KJZvQzH6PFmb...,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,2020-01_01,3,1073301
3,00Blm7zeNqgYLPtW6zg8cj,https://open.spotify.com/track/00Blm7zeNqgYLPt...,One Right Now (with The Weeknd),Post Malone,2020-01_01,4,966020
4,5PjdY0CKGZdEuoNab3yDmX,https://open.spotify.com/track/5PjdY0CKGZdEuoN...,STAY (with Justin Bieber),The Kid LAROI,2020-01_01,5,961443
...,...,...,...,...,...,...,...
136195,3ee8Jmje8o58CHK66QrVC2,https://open.spotify.com/track/3ee8Jmje8o58CHK...,SAD!,XXXTENTACION,2021-11_11,196,216178
136196,1dIWPXMX4kRHj6Dt2DStUQ,https://open.spotify.com/track/1dIWPXMX4kRHj6D...,Chosen (feat. Ty Dolla $ign),"Blxst, Tyga",2021-11_11,197,215608
136197,2SAqBLGA283SUiwJ3xOUVI,https://open.spotify.com/track/2SAqBLGA283SUiw...,Laugh Now Cry Later (feat. Lil Durk),Drake,2021-11_11,198,214908
136198,3GVkPk8mqxz0itaAriG1L7,https://open.spotify.com/track/3GVkPk8mqxz0ita...,Everybody Dies In Their Nightmares,XXXTENTACION,2021-11_11,199,214824


### Spotify API - Get Tracks' Audio Features

- acousticness (float): confidence measure from 0-1 whether track is acoustic (1.0 highest confidence is acoustic)
- analysis_url (string): url to access full audio analysis of track
- danceability (float): how suitable track is for dancing based on combination of musical elements (tempo, rhythm stability, beat strength, overall regularity), scale of 0-1 (1.0 most danceable) 
- duration_ms (integer): duration of track in milliseconds
- energy (float): measure from 0-1 for perceptual measure of intensity and activity (based on dynamic range, perceived loudness, timbre, onset rate, general entropy)
- id (string): spotify ID for track
- instrumentalness (float): predicts whether track contains no vocals with 1.0 being greatest likelihood the track contains no vocals
- key (integer): key the track is in (uses integer notation, 0 = C, 1 = C♯/D♭, 2 = D, 3 = D♯/E♭, ...)
- liveness (float): detects presence of audience in recording, with higher values representing increased probability track was performed live
- loudness (float): overall loudness of track in decibels (dB), averaged across entire track, with values typically ranging between -60 and 0 dB
- mode (integer): modality (major=1 or minor=0) of track, type of scale from which melodic content is derived
- speechiness (float): detects presence of spoken words in track, with more exclusive speech-like recording with value closer to 1.0
- tempo (float): overall estimated tempo of track in beats per minute (BPM)
- time_signature (integer): estimated overall time signature (meter) of track
- track_href (string): link to web API endpoint for full details of track
- type (string): object type
- uri (string): Spotify URI for track
- valence (float): measure from 0-1 describing musical positiveness conveyed by track

#### Get list of song ids to use for API

In [3]:
# Create separate series of song_ids from scraped dataframe & count duplicates
song_ids = pd.Series(scrape_df['song_id'])
song_ids.value_counts()

6G2xXQRSZlEzYsUEb3uvuR    681
2xLMifQCjDGFmkHkpNLD9h    681
3NqBxTOMCJ3zW9CIP51td4    681
0jA4h2SD50Oak4C3Vn905a    681
003vvx7Niy0yvhvHt4a68B    681
                         ... 
2vmfvSoZBFAt9hhRoEByLi    681
7zjEyeBsaw9gV0jofJLfOM    681
1f3yAtsJtY87CTmM8RLnxf    681
0qXP5fMhxGzxALOkXYUxfP    681
0ZLuW8uOXdFNWcI40C0OC2    681
Name: song_id, Length: 200, dtype: int64

In [4]:
# Drop duplicates
song_ids = song_ids.drop_duplicates()
song_ids.value_counts()

0rKtyWc8bvkriBthvHKY8d    1
2vXKRlJBXyOcvZYTdNeckS    1
7AQim7LbvFVZJE3O8TYgf2    1
2vWBUC9djv6BtiGlmKiQaH    1
7sO5G9EABYOXQKNPNiE9NR    1
                         ..
3GVkPk8mqxz0itaAriG1L7    1
7KA4W4McWYRpgf0fWsJZWB    1
5QO79kh1waicV47BqGRL3g    1
2vmfvSoZBFAt9hhRoEByLi    1
7vQbuQcyTflfCIOu3Uzzya    1
Name: song_id, Length: 200, dtype: int64

In [5]:
# Convert to list & split into two
song_ids = song_ids.tolist()
song_ids1 = song_ids[:len(song_ids)//2]
print(len(song_ids1))
song_ids2 = song_ids[len(song_ids)//2:]
print(len(song_ids2))

100
100


In [6]:
# Convert two lists into single string for API request
song_ids1_str = ','.join(song_ids1)
print(song_ids1_str)
song_ids2_str = ','.join(song_ids2) 
print(song_ids2_str)

4iN16F8JtVxG2UTzp3avGl,0gplL1WMoJ6iYaPgMCL0gX,5Z9KJZvQzH6PFmb8SNkxuk,00Blm7zeNqgYLPtW6zg8cj,5PjdY0CKGZdEuoNab3yDmX,02MWAaffLxlfxAUY7c5dvx,2BcMwX1MPV6ZHP4tUT9uq6,4R67rQNSbbsR4TdUVOIdez,04S1pkp1VaIqjg8zZqknR5,0k1WUmIRnG3xU6fvvDVfRG,6f5ExP43esnvdKPddwKXJH,3Vi5XqYrmQgOYBajMWSvCi,50nfwKoDiSYg8zOCREWAm5,4ZtFanR9U6ndgddUvNcjcG,3QFInJAm9eyaho5vBzxInN,07MDkzWARZaLEdKxo6yArG,3DarAbFujv6eYNliUTyqtz,08F16baYbciTva9P4BvpiI,4RVwu0g32PAqgUiJoXsdF8,7rglLriMNBPAyuJOMGwi39,6Uj1ctrBOjOas8xZXGqKk4,3rmo8F54jFF8OgYsqTxm5d,37y7iDayfwm3WXn5BiAoRk,08SB2OtZkaliju77WYEKxk,29TPjc8wxfz4XMn21O7VsZ,40iJIUlhi6renaREYGeIDS,2QjOHCTQ1Jl3zawyYOpxh6,5CZ40GBx1sQ9agT82CLQCT,0e8nrvls4Qqv5Rfa2UhqmO,1SC5rEoYDGUK4NfG82494W,3NqBxTOMCJ3zW9CIP51td4,4yNoUQkYf1QF8iYlEzNynH,4XvcHTUfIlWfyJTRG0aqlo,0WSEq9Ko4kFPt8yo3ICd6T,15HMh4yxdf4wyxSZSlOgGZ,7MAibcTli4IisCtbHKrGMh,5nujrmhLynf4yMoMtj8AQF,18vXApRmJSgQ6wG2ll9AOg,2B4GHvToeLTOBB4QLzW3Ni,2gpWyfu7eZ01zzncHpxOtA,4SqWKzw0CbA05TGszDgMlc,0k4d5YPDr1r7FX77VdqWez,3Wrjm47oTz2sjIgck11l5e,62vpWI1CHwF

#### Spotify API

In [7]:
# Import access tokens
from config import client_id, client_secret

In [8]:
# Pass the access token
auth_url = 'https://accounts.spotify.com/api/token'
auth_response = requests.post(auth_url, {
    'grant_type': 'client_credentials', 
    'client_id': client_id, 
    'client_secret': client_secret
})
auth_response_data = auth_response.json()
access_token = auth_response_data['access_token']

In [9]:
# Set up access token in header for GET request
headers = {'Authorization': 'Bearer {token}'.format(token=access_token)}

In [10]:
# Set up API request
base_url = 'https://api.spotify.com/v1/audio-features/'

In [11]:
# GET request for list 1
data = []
for i in song_ids1:
    req = requests.get(base_url + i, headers=headers)
    req = req.json()
    data.append(req)

In [12]:
# GET request for list 2
for i in song_ids2: 
    req = requests.get(base_url + i, headers=headers)
    req = req.json()
    data.append(req)

In [13]:
# Check results
data

[{'danceability': 0.627,
  'energy': 0.618,
  'key': 2,
  'loudness': -8.529,
  'mode': 1,
  'speechiness': 0.0437,
  'acousticness': 0.0558,
  'instrumentalness': 0,
  'liveness': 0.351,
  'valence': 0.848,
  'tempo': 82.03,
  'type': 'audio_features',
  'id': '4iN16F8JtVxG2UTzp3avGl',
  'uri': 'spotify:track:4iN16F8JtVxG2UTzp3avGl',
  'track_href': 'https://api.spotify.com/v1/tracks/4iN16F8JtVxG2UTzp3avGl',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4iN16F8JtVxG2UTzp3avGl',
  'duration_ms': 197443,
  'time_signature': 4},
 {'danceability': 0.604,
  'energy': 0.366,
  'key': 5,
  'loudness': -7.519,
  'mode': 1,
  'speechiness': 0.0282,
  'acousticness': 0.578,
  'instrumentalness': 0,
  'liveness': 0.133,
  'valence': 0.13,
  'tempo': 141.981,
  'type': 'audio_features',
  'id': '0gplL1WMoJ6iYaPgMCL0gX',
  'uri': 'spotify:track:0gplL1WMoJ6iYaPgMCL0gX',
  'track_href': 'https://api.spotify.com/v1/tracks/0gplL1WMoJ6iYaPgMCL0gX',
  'analysis_url': 'https://api.spotify.

In [14]:
# Convert results to pandas dataframe
features_df = pd.DataFrame.from_dict(data)
features_df = features_df.set_index('id')
features_df

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature,error
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4iN16F8JtVxG2UTzp3avGl,0.627,0.618,2.0,-8.529,1.0,0.0437,0.0558,0.000000,0.3510,0.8480,82.030,audio_features,spotify:track:4iN16F8JtVxG2UTzp3avGl,https://api.spotify.com/v1/tracks/4iN16F8JtVxG...,https://api.spotify.com/v1/audio-analysis/4iN1...,197443.0,4.0,
0gplL1WMoJ6iYaPgMCL0gX,0.604,0.366,5.0,-7.519,1.0,0.0282,0.5780,0.000000,0.1330,0.1300,141.981,audio_features,spotify:track:0gplL1WMoJ6iYaPgMCL0gX,https://api.spotify.com/v1/tracks/0gplL1WMoJ6i...,https://api.spotify.com/v1/audio-analysis/0gpl...,224695.0,4.0,
5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,10.0,-7.395,0.0,0.0672,0.0221,0.000000,0.0476,0.8920,150.087,audio_features,spotify:track:5Z9KJZvQzH6PFmb8SNkxuk,https://api.spotify.com/v1/tracks/5Z9KJZvQzH6P...,https://api.spotify.com/v1/audio-analysis/5Z9K...,212353.0,4.0,
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1.0,-4.806,1.0,0.0530,0.0361,0.000000,0.0755,0.6880,97.014,audio_features,spotify:track:00Blm7zeNqgYLPtW6zg8cj,https://api.spotify.com/v1/tracks/00Blm7zeNqgY...,https://api.spotify.com/v1/audio-analysis/00Bl...,193507.0,4.0,
5PjdY0CKGZdEuoNab3yDmX,0.591,0.764,1.0,-5.484,1.0,0.0483,0.0383,0.000000,0.1030,0.4780,169.928,audio_features,spotify:track:5PjdY0CKGZdEuoNab3yDmX,https://api.spotify.com/v1/tracks/5PjdY0CKGZdE...,https://api.spotify.com/v1/audio-analysis/5Pjd...,141806.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3ee8Jmje8o58CHK66QrVC2,0.740,0.613,8.0,-4.880,1.0,0.1450,0.2580,0.003720,0.1230,0.4730,75.023,audio_features,spotify:track:3ee8Jmje8o58CHK66QrVC2,https://api.spotify.com/v1/tracks/3ee8Jmje8o58...,https://api.spotify.com/v1/audio-analysis/3ee8...,166606.0,4.0,
1dIWPXMX4kRHj6Dt2DStUQ,0.571,0.767,2.0,-5.160,1.0,0.2870,0.3360,0.000000,0.0809,0.6050,93.421,audio_features,spotify:track:1dIWPXMX4kRHj6Dt2DStUQ,https://api.spotify.com/v1/tracks/1dIWPXMX4kRH...,https://api.spotify.com/v1/audio-analysis/1dIW...,161684.0,4.0,
2SAqBLGA283SUiwJ3xOUVI,0.761,0.518,0.0,-8.871,1.0,0.1340,0.2440,0.000035,0.1070,0.5220,133.976,audio_features,spotify:track:2SAqBLGA283SUiwJ3xOUVI,https://api.spotify.com/v1/tracks/2SAqBLGA283S...,https://api.spotify.com/v1/audio-analysis/2SAq...,261493.0,4.0,
3GVkPk8mqxz0itaAriG1L7,0.734,0.570,7.0,-7.066,0.0,0.1330,0.8470,0.000021,0.1120,0.6890,129.953,audio_features,spotify:track:3GVkPk8mqxz0itaAriG1L7,https://api.spotify.com/v1/tracks/3GVkPk8mqxz0...,https://api.spotify.com/v1/audio-analysis/3GVk...,95467.0,4.0,


### Clean datasets

#### Clean features_df

In [15]:
# Remove unnecessary columns
features_df = features_df.drop(columns=['type', 'uri', 'track_href', 'analysis_url', 'error'])
features_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4iN16F8JtVxG2UTzp3avGl,0.627,0.618,2.0,-8.529,1.0,0.0437,0.0558,0.0,0.351,0.848,82.03,197443.0,4.0
0gplL1WMoJ6iYaPgMCL0gX,0.604,0.366,5.0,-7.519,1.0,0.0282,0.578,0.0,0.133,0.13,141.981,224695.0,4.0
5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,10.0,-7.395,0.0,0.0672,0.0221,0.0,0.0476,0.892,150.087,212353.0,4.0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,1.0,-4.806,1.0,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0
5PjdY0CKGZdEuoNab3yDmX,0.591,0.764,1.0,-5.484,1.0,0.0483,0.0383,0.0,0.103,0.478,169.928,141806.0,4.0


In [16]:
# Replace values in 'key' column with chord names
features_df['key'] = features_df['key'].map({
    0.0:'C',
    1.0:'C♯/D♭',
    2.0:'D',
    3.0:'D♯/E♭',
    4.0:'E',
    5.0:'F',
    6.0:'F♯/G♭',
    7.0:'G',
    8.0:'G♯/A♭',
    9.0:'A',
    10.0:'A♯/B♭',
    11.0:'B'
})
features_df['key'].value_counts()

C♯/D♭    34
C        26
G♯/A♭    18
B        16
D        16
A        14
F♯/G♭    14
G        13
E        12
F        12
A♯/B♭     8
D♯/E♭     7
Name: key, dtype: int64

In [17]:
# Replace values in 'mode' column with 'major' or 'minor'
features_df['mode'] = features_df['mode'].map({
    1:'major',
    0:'minor'
})
features_df['mode'].value_counts()

major    122
minor     68
Name: mode, dtype: int64

In [18]:
# Check data types
features_df.dtypes

danceability        float64
energy              float64
key                  object
loudness            float64
mode                 object
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms         float64
time_signature      float64
dtype: object

In [19]:
# Check 'time_signature' values
features_df['time_signature'].value_counts()

4.0    178
3.0     11
1.0      1
Name: time_signature, dtype: int64

In [20]:
# Fix 'time_signature' data type as categorical
features_df['time_signature'] = features_df['time_signature'].astype('category')
features_df.dtypes

danceability         float64
energy               float64
key                   object
loudness             float64
mode                  object
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
duration_ms          float64
time_signature      category
dtype: object

In [21]:
# Reset features_df index name
features_df.index.name = 'song_id'
features_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4iN16F8JtVxG2UTzp3avGl,0.627,0.618,D,-8.529,major,0.0437,0.0558,0.0,0.351,0.848,82.03,197443.0,4.0
0gplL1WMoJ6iYaPgMCL0gX,0.604,0.366,F,-7.519,major,0.0282,0.578,0.0,0.133,0.13,141.981,224695.0,4.0
5Z9KJZvQzH6PFmb8SNkxuk,0.741,0.691,A♯/B♭,-7.395,minor,0.0672,0.0221,0.0,0.0476,0.892,150.087,212353.0,4.0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0
5PjdY0CKGZdEuoNab3yDmX,0.591,0.764,C♯/D♭,-5.484,major,0.0483,0.0383,0.0,0.103,0.478,169.928,141806.0,4.0


#### Clean scrape_df

In [22]:
# Set index of scrape_df
scrape_df = scrape_df.set_index('song_id')
scrape_df

Unnamed: 0_level_0,song_url,song,artist,date,position,streams
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4iN16F8JtVxG2UTzp3avGl,https://open.spotify.com/track/4iN16F8JtVxG2UT...,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic",2020-01_01,1,1331303
0gplL1WMoJ6iYaPgMCL0gX,https://open.spotify.com/track/0gplL1WMoJ6iYaP...,Easy On Me,Adele,2020-01_01,2,1111983
5Z9KJZvQzH6PFmb8SNkxuk,https://open.spotify.com/track/5Z9KJZvQzH6PFmb...,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,2020-01_01,3,1073301
00Blm7zeNqgYLPtW6zg8cj,https://open.spotify.com/track/00Blm7zeNqgYLPt...,One Right Now (with The Weeknd),Post Malone,2020-01_01,4,966020
5PjdY0CKGZdEuoNab3yDmX,https://open.spotify.com/track/5PjdY0CKGZdEuoN...,STAY (with Justin Bieber),The Kid LAROI,2020-01_01,5,961443
...,...,...,...,...,...,...
3ee8Jmje8o58CHK66QrVC2,https://open.spotify.com/track/3ee8Jmje8o58CHK...,SAD!,XXXTENTACION,2021-11_11,196,216178
1dIWPXMX4kRHj6Dt2DStUQ,https://open.spotify.com/track/1dIWPXMX4kRHj6D...,Chosen (feat. Ty Dolla $ign),"Blxst, Tyga",2021-11_11,197,215608
2SAqBLGA283SUiwJ3xOUVI,https://open.spotify.com/track/2SAqBLGA283SUiw...,Laugh Now Cry Later (feat. Lil Durk),Drake,2021-11_11,198,214908
3GVkPk8mqxz0itaAriG1L7,https://open.spotify.com/track/3GVkPk8mqxz0ita...,Everybody Dies In Their Nightmares,XXXTENTACION,2021-11_11,199,214824


In [23]:
# Check data types
scrape_df.dtypes

song_url    object
song        object
artist      object
date        object
position     int64
streams     object
dtype: object

In [24]:
# Update 'date' column as datetime
scrape_df['date'] = pd.to_datetime(scrape_df['date'], format="%Y-%m_%d")

# Update 'streams' column to numerical
scrape_df['streams'] = scrape_df['streams'].replace({',':''}, regex=True).apply(pd.to_numeric, 1)

In [25]:
# Remove unnecessary columns
scrape_df = scrape_df.drop(columns=['song_url'])
scrape_df.head()

Unnamed: 0_level_0,song,artist,date,position,streams
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4iN16F8JtVxG2UTzp3avGl,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic",2020-01-01,1,1331303
0gplL1WMoJ6iYaPgMCL0gX,Easy On Me,Adele,2020-01-01,2,1111983
5Z9KJZvQzH6PFmb8SNkxuk,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,2020-01-01,3,1073301
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,2020-01-01,4,966020
5PjdY0CKGZdEuoNab3yDmX,STAY (with Justin Bieber),The Kid LAROI,2020-01-01,5,961443


#### Create new dataframe song_df for compiled data
Consolidating into list of songs & total streams, rankings, etc.
Merging with features_df

In [26]:
# Check number of unique songs
len(scrape_df['song'].unique())

200

In [27]:
# Dataframe of total streams for each song
total_streams_df = scrape_df.groupby('song_id')['streams'].sum()
total_streams_df = pd.DataFrame(total_streams_df)
total_streams_df

Unnamed: 0_level_0,streams
song_id,Unnamed: 1_level_1
003vvx7Niy0yvhvHt4a68B,198736911
00Blm7zeNqgYLPtW6zg8cj,657859620
02MWAaffLxlfxAUY7c5dvx,617101770
04S1pkp1VaIqjg8zZqknR5,520777044
04sN26COy28wTXYj3dMoiZ,164300103
...,...
7qEHsqek33rTcFNT9PFqLf,168997641
7rglLriMNBPAyuJOMGwi39,374349786
7sO5G9EABYOXQKNPNiE9NR,150734583
7vQbuQcyTflfCIOu3Uzzya,149358282


In [28]:
# Dataframe of first highest position in chart for each song
highest_position_df = scrape_df.groupby('song_id')['position'].min()
highest_position_df = pd.DataFrame(highest_position_df)
highest_position_df

Unnamed: 0_level_0,position
song_id,Unnamed: 1_level_1
003vvx7Niy0yvhvHt4a68B,82
00Blm7zeNqgYLPtW6zg8cj,4
02MWAaffLxlfxAUY7c5dvx,6
04S1pkp1VaIqjg8zZqknR5,9
04sN26COy28wTXYj3dMoiZ,138
...,...
7qEHsqek33rTcFNT9PFqLf,128
7rglLriMNBPAyuJOMGwi39,20
7sO5G9EABYOXQKNPNiE9NR,178
7vQbuQcyTflfCIOu3Uzzya,185


In [29]:
# Dataframe of song_id, song, artist data
track_artist_df = scrape_df.copy()
track_artist_df = track_artist_df.drop(columns=['date', 'position', 'streams'])
track_artist_df = track_artist_df.drop_duplicates()
track_artist_df

Unnamed: 0_level_0,song,artist
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4iN16F8JtVxG2UTzp3avGl,Smokin Out The Window,"Bruno Mars, Anderson .Paak, Silk Sonic"
0gplL1WMoJ6iYaPgMCL0gX,Easy On Me,Adele
5Z9KJZvQzH6PFmb8SNkxuk,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone
5PjdY0CKGZdEuoNab3yDmX,STAY (with Justin Bieber),The Kid LAROI
...,...,...
3ee8Jmje8o58CHK66QrVC2,SAD!,XXXTENTACION
1dIWPXMX4kRHj6Dt2DStUQ,Chosen (feat. Ty Dolla $ign),"Blxst, Tyga"
2SAqBLGA283SUiwJ3xOUVI,Laugh Now Cry Later (feat. Lil Durk),Drake
3GVkPk8mqxz0itaAriG1L7,Everybody Dies In Their Nightmares,XXXTENTACION


In [30]:
# Merge dataframes with song and artist info
song_df = track_artist_df.join(total_streams_df, how='outer')
song_df = song_df.join(highest_position_df, how='outer')
song_df

Unnamed: 0_level_0,song,artist,streams,position
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,198736911,82
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,657859620,4
02MWAaffLxlfxAUY7c5dvx,Heat Waves,Glass Animals,617101770,6
04S1pkp1VaIqjg8zZqknR5,Enchanted,Taylor Swift,520777044,9
04sN26COy28wTXYj3dMoiZ,Bored,Billie Eilish,164300103,138
...,...,...,...,...
7qEHsqek33rTcFNT9PFqLf,Someone You Loved,Lewis Capaldi,168997641,128
7rglLriMNBPAyuJOMGwi39,Cold Heart - PNAU Remix,"Elton John, Dua Lipa",374349786,20
7sO5G9EABYOXQKNPNiE9NR,Ric Flair Drip (& Metro Boomin),Offset,150734583,178
7vQbuQcyTflfCIOu3Uzzya,Jingle Bell Rock,BobHelms,149358282,185


In [31]:
# Merge dataframe with features_df
song_df = song_df.join(features_df, how='outer')
song_df

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,198736911.0,82.0,0.352,0.911,C♯/D♭,-5.230,major,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973.0,4.0
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,657859620.0,4.0,0.687,0.781,C♯/D♭,-4.806,major,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507.0,4.0
02MWAaffLxlfxAUY7c5dvx,Heat Waves,Glass Animals,617101770.0,6.0,0.761,0.525,B,-6.900,major,0.0944,0.44000,0.000007,0.0921,0.531,80.870,238805.0,4.0
04S1pkp1VaIqjg8zZqknR5,Enchanted,Taylor Swift,520777044.0,9.0,0.535,0.618,G♯/A♭,-3.913,major,0.0273,0.07160,0.000388,0.1690,0.228,81.975,352200.0,4.0
04sN26COy28wTXYj3dMoiZ,Bored,Billie Eilish,164300103.0,138.0,0.614,0.318,G,-12.695,major,0.0478,0.89600,0.002390,0.0795,0.112,119.959,180933.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,


In [32]:
# Check for null rows
song_df.isnull().sum()

song                10
artist              10
streams             10
position            10
danceability        20
energy              20
key                 20
loudness            20
mode                20
speechiness         20
acousticness        20
instrumentalness    20
liveness            20
valence             20
tempo               20
duration_ms         20
time_signature      20
dtype: int64

In [33]:
# Print out NaN rows to check data
na_df = song_df[song_df.isna().any(axis=1)]
na_df

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0Fs9cdPDhptWEDJmiCbkEW,She Knows (feat. Amber Coffman & Cults),J. Cole,170907846.0,125.0,,,,,,,,,,,,,
0ofHAoxe9vBkTCp2UQIavz,Dreams - 2004 Remaster,Fleetwood Mac,168433773.0,129.0,,,,,,,,,,,,,
1BYZxKSf0aTxp8ZFoeyM3d,we fell in love in october,girl in red,166697904.0,134.0,,,,,,,,,,,,,
2HbKqm4o0w5wEeEFXm2sD4,Money Trees,"Kendrick Lamar, Jay Rock",170257491.0,126.0,,,,,,,,,,,,,
2eAZfqOm4EnOF9VvN50Tyc,The Way Life Goes (feat. Oh Wonder),Lil Uzi Vert,169492728.0,127.0,,,,,,,,,,,,,
2wAJTrFhCnQyNSD3oUgTZO,Work Out,J. Cole,167299227.0,133.0,,,,,,,,,,,,,
4kTLpAbhuEGHAAdDjOIWaa,Pursuit Of Happiness (Nightmare),"Kid Cudi, MGMT, Ratatat",167349621.0,131.0,,,,,,,,,,,,,
6Hj9jySrnFppAI0sEMCZpJ,Robbery,Juice WRLD,167325105.0,132.0,,,,,,,,,,,,,
6M3PsepEj5gyJoIi7Xvr7u,Take My Breath,The Weeknd,167514423.0,130.0,,,,,,,,,,,,,
7qEHsqek33rTcFNT9PFqLf,Someone You Loved,Lewis Capaldi,168997641.0,128.0,,,,,,,,,,,,,


In [34]:
# Drop duplicates
song_df = song_df.dropna()
song_df

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,198736911.0,82.0,0.352,0.911,C♯/D♭,-5.230,major,0.0747,0.00121,0.000000,0.0995,0.236,148.033,222973.0,4.0
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,657859620.0,4.0,0.687,0.781,C♯/D♭,-4.806,major,0.0530,0.03610,0.000000,0.0755,0.688,97.014,193507.0,4.0
02MWAaffLxlfxAUY7c5dvx,Heat Waves,Glass Animals,617101770.0,6.0,0.761,0.525,B,-6.900,major,0.0944,0.44000,0.000007,0.0921,0.531,80.870,238805.0,4.0
04S1pkp1VaIqjg8zZqknR5,Enchanted,Taylor Swift,520777044.0,9.0,0.535,0.618,G♯/A♭,-3.913,major,0.0273,0.07160,0.000388,0.1690,0.228,81.975,352200.0,4.0
04sN26COy28wTXYj3dMoiZ,Bored,Billie Eilish,164300103.0,138.0,0.614,0.318,G,-12.695,major,0.0478,0.89600,0.002390,0.0795,0.112,119.959,180933.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7nc7mlSdWYeFom84zZ8Wr8,Tell Em,"Cochise, $NOT",225173331.0,63.0,0.672,0.717,F,-7.476,major,0.2260,0.10300,0.000000,0.3980,0.473,157.905,180380.0,4.0
7rglLriMNBPAyuJOMGwi39,Cold Heart - PNAU Remix,"Elton John, Dua Lipa",374349786.0,20.0,0.795,0.800,C♯/D♭,-6.320,major,0.0309,0.03540,0.000073,0.0915,0.934,116.032,202735.0,4.0
7sO5G9EABYOXQKNPNiE9NR,Ric Flair Drip (& Metro Boomin),Offset,150734583.0,178.0,0.880,0.428,A,-8.280,major,0.2060,0.14900,0.000051,0.1140,0.333,100.007,172800.0,4.0
7vQbuQcyTflfCIOu3Uzzya,Jingle Bell Rock,BobHelms,149358282.0,185.0,0.754,0.424,D,-8.463,major,0.0363,0.64300,0.000000,0.0652,0.806,119.705,130973.0,4.0


### Upload to PostgreSQL database

In [16]:
from sqlalchemy import create_engine
from getpass import getpass

# Set up connection to database
password = getpass('Enter database password.')
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/project_spotify_db')

Enter database password.········


In [20]:
# Upload song_df to database ("top_songs" table)
song_df.to_sql(name='top_songs', con=engine, if_exists='append')

In [None]:
# Upload scrape_df to database ("raw_scrape" table)


In [None]:
# Upload features_df to database ("features" table)


In [37]:
# Upload total_streams_df to database ("total_streams" table)


In [None]:
# Upload highest_position_df to database ("highest_position" table)


In [None]:
# Upload track_artist_df to database ("track_artist" table)
