# Final Project: Spotify Top Songs Analysis

## Machine Learning Model: 

### Can we predict whether a song has ranked within top 20 positions, based on its musical attributes (such as key, energy, mode, loudness, etc.)

In [1]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Load in data from database

In [2]:
from sqlalchemy import create_engine
from config import db_pswd

# Set up connection to database
engine = create_engine(f'postgresql://postgres:{db_pswd}@localhost:5432/project_spotify_db')

In [3]:
# Read in Spotify song data from database
song_df = pd.read_sql('SELECT * FROM songs;', engine, index_col='song_id')
song_df.head()

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,82395453,69,0.352,0.911,C♯/D♭,-5.23,major,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973.0,4.0
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,8118535,1,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0
00selpxxljfn9n5Pf4K3VR,Show U Off,Brent Faiyaz,260432,138,0.583,0.405,C,-11.295,major,0.0534,0.643,0.00391,0.108,0.549,84.997,251133.0,4.0
01FvQEvHETjWqcDpQDJdTb,Your Bartender,Morgan Wallen,6031746,28,0.555,0.771,E,-5.237,major,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093.0,4.0
02MWAaffLxlfxAUY7c5dvx,Heat Waves,Glass Animals,234172425,4,0.761,0.525,B,-6.9,major,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805.0,4.0


### Check dataframe for data types & Add column for whether song has been in top 20

In [4]:
# Add column for Yes/No encoding of whether song hit top 20 positions
# Save as new df for machine learning
song_ml_df = song_df.copy()

bins = [0, 20, 200]
group_names = [1, 0]

song_ml_df['top_twenty'] = pd.cut(song_ml_df['position'], bins, labels=group_names)
song_ml_df.sample(10)

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,top_twenty
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
13HsOwrwTA5HgLuoaDHncP,RUNITUP (feat. Teezo Touchdown),"Tyler, The Creator",6129691,13,0.665,0.469,A♯/B♭,-7.778,minor,0.13,0.565,0.000317,0.219,0.142,115.852,229565.0,4.0,1
2BcMwX1MPV6ZHP4tUT9uq6,Knife Talk (with 21 Savage ft. Project Pat),Drake,87046906,2,0.849,0.424,F,-9.579,minor,0.324,0.0635,0.0,0.0834,0.153,145.887,242966.0,4.0,1
2QqJTIlGKRLJC3onkavYEz,Flex (feat. Juice WRLD),Polo G,780897,189,0.752,0.673,A♯/B♭,-5.792,minor,0.153,0.0538,0.0,0.133,0.35,164.009,163902.0,4.0,0
2Nc1v8I86FUGorwjXKo0in,"Belly, The Weeknd - Die For It (ft. Nas)",Belly,2012875,34,0.657,0.492,F,-9.612,minor,0.136,0.392,0.0,0.425,0.193,97.107,199867.0,4.0,0
249gnXrbfmV8NG6jTEMSwD,Life Goes On,BTS,243374,173,0.566,0.716,C♯/D♭,-5.733,major,0.0424,0.00691,0.0,0.37,0.45,81.068,207481.0,4.0,0
1zzxoZVylsna2BQB65Ppcb,X Gon' Give It To Ya,DMX,3724103,6,0.761,0.899,A♯/B♭,-3.09,minor,0.183,0.0135,0.0,0.0719,0.673,95.027,217587.0,4.0,1
15hJmqqEtASVXl6sM7i4UF,trademark usa,BaKeem,2019573,36,0.615,0.6,A♯/B♭,-5.62,major,0.27,0.107,2e-06,0.283,0.0661,130.027,270671.0,4.0,0
2AziR5bWTpniyTm2GQNer7,Comedy,Bo Burnham,1929816,108,0.47,0.565,F,-8.43,major,0.173,0.465,0.0,0.225,0.351,180.061,319714.0,4.0,0
4ak9GGe6afmi2HbxEjvhIC,Male Fantasy,Billie Eilish,4036176,37,0.687,0.288,D,-10.415,major,0.045,0.863,0.0,0.0986,0.308,110.927,194887.0,4.0,0
6DtlH2Ax805sJmaty5mtfK,Forget It,$uicideboy$,248540,143,0.711,0.505,G♯/A♭,-5.798,major,0.101,0.16,0.000289,0.121,0.363,140.006,198983.0,4.0,0


In [5]:
# Check data types
song_ml_df.dtypes

song                  object
artist                object
streams                int64
position               int64
danceability         float64
energy               float64
key                   object
loudness             float64
mode                  object
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
duration_ms          float64
time_signature        object
top_twenty          category
dtype: object

In [6]:
# Change 'top_twenty' data type to integer
song_ml_df['top_twenty'] = song_ml_df['top_twenty'].astype('int64')
song_ml_df.dtypes

song                 object
artist               object
streams               int64
position              int64
danceability        float64
energy              float64
key                  object
loudness            float64
mode                 object
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms         float64
time_signature       object
top_twenty            int64
dtype: object

### Preprocess dataframe for machine learning model

In [7]:
# Remove index of dataframe
song_ml_df = song_ml_df.reset_index()
song_ml_df.head()

Unnamed: 0,song_id,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,top_twenty
0,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,82395453,69,0.352,0.911,C♯/D♭,-5.23,major,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973.0,4.0,0
1,00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,8118535,1,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0,1
2,00selpxxljfn9n5Pf4K3VR,Show U Off,Brent Faiyaz,260432,138,0.583,0.405,C,-11.295,major,0.0534,0.643,0.00391,0.108,0.549,84.997,251133.0,4.0,0
3,01FvQEvHETjWqcDpQDJdTb,Your Bartender,Morgan Wallen,6031746,28,0.555,0.771,E,-5.237,major,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093.0,4.0,0
4,02MWAaffLxlfxAUY7c5dvx,Heat Waves,Glass Animals,234172425,4,0.761,0.525,B,-6.9,major,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805.0,4.0,1


In [8]:
# Drop the non-beneficial ID columns('song_id', 'song', 'artist') & 'position' (redundant with 'number_one' column)
song_ml_df = song_ml_df.drop(['song_id', 'song', 'artist', 'position'], 1)
song_ml_df.head()

Unnamed: 0,streams,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,top_twenty
0,82395453,0.352,0.911,C♯/D♭,-5.23,major,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973.0,4.0,0
1,8118535,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0,1
2,260432,0.583,0.405,C,-11.295,major,0.0534,0.643,0.00391,0.108,0.549,84.997,251133.0,4.0,0
3,6031746,0.555,0.771,E,-5.237,major,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093.0,4.0,0
4,234172425,0.761,0.525,B,-6.9,major,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805.0,4.0,1


In [9]:
# Drop 'streams' column, so only song attributes/features & results columns remain
song_ml_df = song_ml_df.drop(['streams'], 1)
song_ml_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,top_twenty
0,0.352,0.911,C♯/D♭,-5.23,major,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973.0,4.0,0
1,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0,1
2,0.583,0.405,C,-11.295,major,0.0534,0.643,0.00391,0.108,0.549,84.997,251133.0,4.0,0
3,0.555,0.771,E,-5.237,major,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093.0,4.0,0
4,0.761,0.525,B,-6.9,major,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805.0,4.0,1


In [10]:
# Generate categorical variable list
song_cat = song_ml_df.dtypes[song_ml_df.dtypes == "object"].index.tolist()

# Check number of unique values in each column
song_ml_df[song_cat].nunique()

key               12
mode               2
time_signature     4
dtype: int64

#### Encode Dataframe's non-numerical data

In [11]:
# Create instance for label encoder
labelencoder = LabelEncoder()

In [12]:
# Encode categorical columns and store as another column
song_ml_rf_df = song_ml_df.copy()
song_ml_rf_df['key_type'] = labelencoder.fit_transform(song_ml_rf_df['key'])
song_ml_rf_df['mode_type'] = labelencoder.fit_transform(song_ml_rf_df['mode'])
song_ml_rf_df['time_sig_type'] = labelencoder.fit_transform(song_ml_rf_df['time_signature'])

song_ml_rf_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,top_twenty,key_type,mode_type,time_sig_type
0,0.352,0.911,C♯/D♭,-5.23,major,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973.0,4.0,0,4,0,2
1,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,4.0,1,4,0,2
2,0.583,0.405,C,-11.295,major,0.0534,0.643,0.00391,0.108,0.549,84.997,251133.0,4.0,0,3,0,2
3,0.555,0.771,E,-5.237,major,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093.0,4.0,0,7,0,2
4,0.761,0.525,B,-6.9,major,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805.0,4.0,1,2,0,2


#### Text/category columns encoded with numerics: 

key
- A: 0
- A♯/B♭: 1
- B: 2
- C: 3
- C♯/D♭: 4
- D: 5
- D♯/E♭: 6
- E: 7
- F: 8
- F♯/G♭: 9
- G: 10
- G♯/A♭: 11

mode
- major: 0
- minor: 1

time_signature
- 2.0: 0
- 3.0: 1
- 4.0: 2
- 5.0: 3

In [13]:
# Drop the original columns
song_ml_rf_df = song_ml_rf_df.drop(["key", "mode", "time_signature"], 1)
song_ml_rf_df.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,top_twenty,key_type,mode_type,time_sig_type
0,0.352,0.911,-5.23,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973.0,0,4,0,2
1,0.687,0.781,-4.806,0.053,0.0361,0.0,0.0755,0.688,97.014,193507.0,1,4,0,2
2,0.583,0.405,-11.295,0.0534,0.643,0.00391,0.108,0.549,84.997,251133.0,0,3,0,2
3,0.555,0.771,-5.237,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093.0,0,7,0,2
4,0.761,0.525,-6.9,0.0944,0.44,7e-06,0.0921,0.531,80.87,238805.0,1,2,0,2


In [14]:
# Upload song_df to database ("song_ml" table)
song_ml_rf_df.to_sql(name='song_ml', con=engine, if_exists='append')

#### Split and scale data for model

In [15]:
# Assign preprocessed data into features and target arrays
y = song_ml_rf_df["top_twenty"].ravel()
X = song_ml_rf_df.drop(["top_twenty"], 1)

In [16]:
# Split preprocessed data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
# Create StandardScaler instances
scaler = StandardScaler()

In [18]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

In [19]:
# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Random Forest Model

In [20]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

In [21]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

#### Evaluate Model

In [22]:
# Make predictions using testing data
y_pred = rf_model.predict(X_test_scaled)

In [23]:
# Calculate the model's accuracy score
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')

Accuracy Score: 0.8320610687022901


In [24]:
# Training score & Testing score
print(f'Training Data Score: {rf_model.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {rf_model.score(X_test_scaled, y_test)}')

Training Data Score: 1.0
Testing Data Score: 0.8320610687022901


In [25]:
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,326,3
Actual 1,63,1


In [26]:
# Classification report
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       329
           1       0.25      0.02      0.03        64

    accuracy                           0.83       393
   macro avg       0.54      0.50      0.47       393
weighted avg       0.74      0.83      0.76       393



In [27]:
# Calculate feature importance in Random Forest model, sorted.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.11104948746185679, 'loudness'),
 (0.10640431169768423, 'valence'),
 (0.10137944837572448, 'tempo'),
 (0.09812816620737758, 'duration_ms'),
 (0.0969214636940149, 'acousticness'),
 (0.09664611882647652, 'speechiness'),
 (0.09226758301610166, 'liveness'),
 (0.09152283260674737, 'danceability'),
 (0.09150957349496623, 'energy'),
 (0.05198547522184088, 'key_type'),
 (0.04259416429470098, 'instrumentalness'),
 (0.012601606613076228, 'mode_type'),
 (0.006989768489432388, 'time_sig_type')]

### Make Predictions

In [28]:
song_ml_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'time_signature', 'top_twenty'],
      dtype='object')

#### Generate random numbers for each input

In [29]:
# Import dependency
import random

In [30]:
# Generate random number (float) between 0 and 1 for song attributes
acousticness_input = random.random()
danceability_input = random.random()
energy_input = random.random()
liveness_input = random.random()
instrumentalness_input = random.random()
loudness_input = random.random()
speechiness_input = random.random()
valence_input = random.random()

print(f'Acousticness: {acousticness_input}')
print(f'Danceability: {danceability_input}')
print(f'Energy: {energy_input}')
print(f'Instrumentalness: {instrumentalness_input}')
print(f'Liveness: {liveness_input}')
print(f'Loudness: {loudness_input}')
print(f'Speechiness: {speechiness_input}')
print(f'Valence: {valence_input}')

Acousticness: 0.5517875304037524
Danceability: 0.6269968194744252
Energy: 0.4271944880224733
Instrumentalness: 0.8370405093693107
Liveness: 0.33250670096962853
Loudness: 0.3566803576583796
Speechiness: 0.7970677718674362
Valence: 0.23590052889358093


In [31]:
# Generate random number (int) for song duration (max. 10min)
duration_ms_input = random.randint(0, 600001)
print(f'Duration(ms): {duration_ms_input}')

Duration(ms): 557997


In [32]:
# Generate random number (int) for tempo/bpm (max. 250)
tempo_input = random.randint(0, 251)
print(f'Tempo(bpm): {tempo_input}')

Tempo(bpm): 104


In [33]:
# Generate random binary series for chord columns (only 1 column has "1" to mark it as the song's chord)
list_of_chords = [
    "chord_A",
    "chord_A#/Bb",
    "chord_B",
    "chord_C",
    "chord_C#/Db",
    "chord_D",
    "chord_D#/Eb",
    "chord_E",
    "chord_F",
    "chord_F#/Gb",
    "chord_G",
    "chord_G#/Ab"
]
chord_index = random.randrange(len(list_of_chords))
chord_input = list_of_chords[chord_index]
print(f'Chord: {chord_input}')

Chord: chord_B
