# Final Project: Spotify Top Songs Analysis

## Machine Learning Model: 

### Can we predict whether a song has ranked within the top 20 positions, based on its musical attributes (such as key, energy, mode, loudness, etc.)?

In [1]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from collections import Counter
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Load in data from database

In [2]:
from sqlalchemy import create_engine
from config import db_pswd

# Set up connection to database.
# The password is percent-encoded before being placed in the URL: characters such
# as '@', ':' or '/' would otherwise be read as URL delimiters and corrupt the
# connection string. SQLAlchemy decodes the escapes when it parses the URL.
from urllib.parse import quote

db_pswd_encoded = quote(str(db_pswd), safe="")
engine = create_engine(f'postgresql://postgres:{db_pswd_encoded}@localhost:5432/project_spotify_db')

In [3]:
# Read every row and column of the `songs` table into a DataFrame,
# using the `song_id` column as the row index
song_df = pd.read_sql('SELECT * FROM songs;', engine, index_col='song_id')
# Preview the first rows to confirm the load worked
song_df.head()

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
003vvx7Niy0yvhvHt4a68B,Mr. Brightside,The Killers,84064119,69,0.352,0.911,C♯/D♭,-5.23,major,0.0747,0.00121,0.0,0.0995,0.236,148.033,222973,4
00Blm7zeNqgYLPtW6zg8cj,One Right Now (with The Weeknd),Post Malone,12262323,1,0.687,0.781,C♯/D♭,-4.806,major,0.053,0.0361,0.0,0.0755,0.688,97.014,193507,4
00selpxxljfn9n5Pf4K3VR,Show U Off,Brent Faiyaz,260432,138,0.583,0.405,C,-11.295,major,0.0534,0.643,0.00391,0.108,0.549,84.997,251133,4
01FvQEvHETjWqcDpQDJdTb,Your Bartender,Morgan Wallen,6031746,28,0.555,0.771,E,-5.237,major,0.0282,0.247,7.3e-05,0.149,0.442,139.971,185093,4
01K4zKU104LyJ8gMb7227B,Nothing New (feat. Phoebe Bridgers) (Taylor’s ...,Taylor Swift,7050525,4,0.606,0.377,C,-9.455,major,0.0275,0.817,0.0,0.154,0.446,101.96,258813,4


### Check dataframe for data types & Add column for whether song has been in top 20

In [4]:
# Build the machine-learning frame: song_df plus a binary `top_twenty` target.
# Positions in (0, 20] are labelled 1 and positions in (20, 200] are labelled 0.
bins = [0, 20, 200]
group_names = [1, 0]

# assign() returns a new DataFrame, so song_df itself is left unmodified
song_ml_df = song_df.assign(
    top_twenty=pd.cut(song_df['position'], bins, labels=group_names)
)
song_ml_df.sample(10)

Unnamed: 0_level_0,song,artist,streams,position,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,top_twenty
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1QL7nSDZCwZMnbisV4KOXt,t r a n s p a r e n t s o u l feat. Travis Barker,WILLOW,30176534,19,0.39,0.83,B,-5.201,minor,0.0563,0.0021,0.0,0.156,0.191,90.021,168030,4,1
3WIifw9lqdgSZpt9renBAg,Yosemite,Lana Del Rey,243996,163,0.584,0.283,A,-12.893,minor,0.0309,0.901,0.0332,0.127,0.278,106.068,306205,4,0
2GPTiPR5db727wu1tFvFt4,New Again,Kanye West,6891489,21,0.679,0.673,F♯/G♭,-5.17,minor,0.189,0.066,0.0,0.479,0.295,91.112,183297,4,0
0y60itmpH0aPKsFiGxmtnh,Wait a Minute!,WILLOW,29001684,101,0.764,0.705,D♯/E♭,-5.279,minor,0.0278,0.0371,1.9e-05,0.0943,0.672,101.003,196520,4,0
3AmqJsyZDFA8EHC461R7bY,Born 2 Be Great,Lil Tjay,346642,73,0.593,0.512,G♯/A♭,-7.837,minor,0.0737,0.58,0.0,0.0996,0.192,95.733,170000,4,0
5dOcS75jq0kLKbvCY9P2Ex,Chi-Raq (with G Herbo),Nicki Minaj,429625,61,0.762,0.439,A♯/B♭,-14.007,minor,0.419,0.328,4e-05,0.107,0.354,145.818,231984,4,0
3MkXV52jXtsG4pvKp4cGE8,Chasing After You,"Ryan Hurd, Maren Morris",2066148,103,0.601,0.675,E,-6.358,major,0.0399,0.698,0.0,0.125,0.446,131.933,207667,4,0
4zm8xZiV5FxJu62EHEvZaT,777,"Bruno Mars, Anderson .Paak, Silk Sonic",2755200,46,0.836,0.622,C♯/D♭,-11.003,major,0.0946,0.00269,0.000566,0.335,0.892,102.015,165001,4,0
081Shn2hU3YxvQmGaCPegr,Unloyal (with Ari Lennox),Summer Walker,1662111,29,0.805,0.51,E,-6.179,minor,0.0569,0.0357,0.000469,0.108,0.36,127.983,207295,4,0
6MDdceLYec4AxohmorE4vH,Face Of My City (feat. Lil Baby),Jack Harlow,1277899,158,0.74,0.566,C♯/D♭,-8.389,minor,0.32,0.0034,0.0,0.589,0.342,156.197,123447,4,0


In [5]:
# Check the dtype of every column; object-dtype (text) columns will need
# encoding before they can be passed to a model
song_ml_df.dtypes

song                  object
artist                object
streams                int64
position               int64
danceability         float64
energy               float64
key                   object
loudness             float64
mode                  object
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
duration_ms            int64
time_signature        object
top_twenty          category
dtype: object

### Preprocess dataframe for machine learning model

In [6]:
# Drop the non-beneficial ID columns ('song', 'artist') & 'position' (redundant with 'top_twenty' column).
# The `columns=` keyword replaces the positional axis argument (`drop(labels, 1)`),
# which pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
song_ml_df = song_ml_df.drop(columns=['song', 'artist', 'position'])

# Drop 'streams', 'duration_ms', 'time_signature', 'loudness', 'liveness' columns
song_ml_df = song_ml_df.drop(columns=['streams', 'duration_ms', 'time_signature', 'loudness', 'liveness'])

song_ml_df.head()

Unnamed: 0_level_0,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,valence,tempo,top_twenty
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
003vvx7Niy0yvhvHt4a68B,0.352,0.911,C♯/D♭,major,0.0747,0.00121,0.0,0.236,148.033,0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,C♯/D♭,major,0.053,0.0361,0.0,0.688,97.014,1
00selpxxljfn9n5Pf4K3VR,0.583,0.405,C,major,0.0534,0.643,0.00391,0.549,84.997,0
01FvQEvHETjWqcDpQDJdTb,0.555,0.771,E,major,0.0282,0.247,7.3e-05,0.442,139.971,0
01K4zKU104LyJ8gMb7227B,0.606,0.377,C,major,0.0275,0.817,0.0,0.446,101.96,1


In [7]:
# Collect the names of all object-dtype (text) columns, in column order
song_cat = [
    column
    for column, dtype in song_ml_df.dtypes.items()
    if dtype == "object"
]

# Count the distinct values in each of those categorical columns
song_ml_df[song_cat].nunique()

key     12
mode     2
dtype: int64

#### Encode Dataframe's non-numerical data

In [8]:
# Create one LabelEncoder instance; the encoding cells call fit_transform on it
# once per categorical column, so it is re-fitted each time it is used
labelencoder = LabelEncoder()

In [9]:
# Encode categorical columns as integer codes and store them as new columns.
# Work on a copy so song_ml_df keeps the original text columns for later cells.
song_ml_rf_df = song_ml_df.copy()
# The same encoder is re-fitted for each column, so after this cell it only
# retains the class mapping of the last column fitted ('mode').
song_ml_rf_df['key_type'] = labelencoder.fit_transform(song_ml_rf_df['key'])
song_ml_rf_df['mode_type'] = labelencoder.fit_transform(song_ml_rf_df['mode'])

# Drop the original text columns.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
song_ml_rf_df = song_ml_rf_df.drop(columns=["key", "mode"])

song_ml_rf_df.head()

Unnamed: 0_level_0,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,top_twenty,key_type,mode_type
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
003vvx7Niy0yvhvHt4a68B,0.352,0.911,0.0747,0.00121,0.0,0.236,148.033,0,4,0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,0.053,0.0361,0.0,0.688,97.014,1,4,0
00selpxxljfn9n5Pf4K3VR,0.583,0.405,0.0534,0.643,0.00391,0.549,84.997,0,3,0
01FvQEvHETjWqcDpQDJdTb,0.555,0.771,0.0282,0.247,7.3e-05,0.442,139.971,0,7,0
01K4zKU104LyJ8gMb7227B,0.606,0.377,0.0275,0.817,0.0,0.446,101.96,1,3,0


#### Text/category columns encoded with numerics: 

key
- A: 0
- A♯/B♭: 1
- B: 2
- C: 3
- C♯/D♭: 4
- D: 5
- D♯/E♭: 6
- E: 7
- F: 8
- F♯/G♭: 9
- G: 10
- G♯/A♭: 11

mode
- major: 0
- minor: 1

In [10]:
# Upload the label-encoded ML dataframe (song_ml_rf_df) to the database as table "song_ml".
# NOTE(review): if_exists='append' adds the rows again every time this cell runs, so
# re-running the notebook will presumably duplicate rows (or fail on a key constraint,
# if the table defines one) — confirm whether 'replace' is the intended behaviour.
song_ml_rf_df.to_sql(name='song_ml', con=engine, if_exists='append')

#### Assign features and target variables

In [11]:
# Assign preprocessed data into features and target arrays.
# The categorical target is converted to a plain integer NumPy array;
# Series.ravel() is deprecated (pandas 2.2) in favour of to_numpy().
y = song_ml_rf_df["top_twenty"].astype(int).to_numpy()
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = song_ml_rf_df.drop(columns=["top_twenty"])

In [12]:
# Count the songs in each target class (1 = reached top 20, 0 = did not)
# to see how imbalanced the dataset is
Counter(y)

Counter({0: 1445, 1: 298})

### Split, resample, and scale data for model
Using SMOTEENN combination sampling to address disparate class sizes

In [13]:
# Split preprocessed data into training and testing datasets.
# No test_size or stratify argument is passed, so the library defaults apply;
# random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Count the target classes within the training split only
Counter(y_train)

Counter({0: 1087, 1: 220})

In [15]:
# Resample the training split only with SMOTEENN; the test split is left untouched.
# NOTE(review): X_train is still unscaled at this point (scaling happens in a later
# cell), so the resampler presumably measures neighbour distances on raw feature
# ranges (e.g. tempo vs. 0-1 features) — confirm whether scaling should come first.
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
# Class counts after resampling
Counter(y_resampled)

Counter({0: 375, 1: 699})

In [16]:
# Create the StandardScaler instance used to standardise the feature columns
scaler = StandardScaler()

In [17]:
# Fit the StandardScaler on the resampled training features only,
# so no information from the test split is used for fitting
X_scaler = scaler.fit(X_resampled)

In [18]:
# Scale data: apply the fitted scaler to the resampled training features
# and to the (un-resampled) test features
X_train_scaled = X_scaler.transform(X_resampled)
X_test_scaled = X_scaler.transform(X_test)

### Random Forest Model

In [19]:
# Create a random forest classifier with 100 trees; random_state is fixed
# so repeated runs build the same forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

In [20]:
# Fit the model on the scaled, resampled training features and the resampled labels
rf_model = rf_model.fit(X_train_scaled, y_resampled)

#### Evaluate Model

In [21]:
# Predict the top_twenty class for every song in the scaled test set
y_pred = rf_model.predict(X_test_scaled)
# Display the predicted labels
y_pred

array([0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,

In [22]:
# Calculate the model's accuracy score on the test set (true labels vs. predictions)
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')

Accuracy Score: 0.5711009174311926


In [23]:
# Training score & Testing score.
# The training score is measured on the resampled training data the model was
# fitted on; the testing score is measured on the original, un-resampled test split.
print(f'Training Data Score: {rf_model.score(X_train_scaled, y_resampled)}')
print(f'Testing Data Score: {rf_model.score(X_test_scaled, y_test)}')

Training Data Score: 1.0
Testing Data Score: 0.5711009174311926


In [24]:
# Confusion matrix as a labelled DataFrame: rows are actual classes, columns are predicted classes
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,216,142
Actual 1,45,33


In [25]:
# Classification report: per-class precision, recall and F1 on the test set
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.60      0.70       358
           1       0.19      0.42      0.26        78

    accuracy                           0.57       436
   macro avg       0.51      0.51      0.48       436
weighted avg       0.71      0.57      0.62       436



In [26]:
# Pair each feature importance from the fitted random forest with its column name
# and list the pairs from most to least important
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.16832789889585317, 'tempo'),
 (0.1421131763853942, 'speechiness'),
 (0.14143978525840228, 'energy'),
 (0.12504549740255289, 'danceability'),
 (0.12391007180208717, 'valence'),
 (0.11002648871876022, 'acousticness'),
 (0.1033703405062424, 'key_type'),
 (0.05441335846414095, 'instrumentalness'),
 (0.03135338256656677, 'mode_type')]

### Remove a couple low-contributing features to see if it improves model
Removing 'mode_type' & 'instrumentalness'

In [27]:
# Reprocess dataframe with reduced features: start again from song_ml_df
# (which still holds the original text columns) and remove the two
# lowest-importance inputs.
song_ml_rf2_df = song_ml_df.copy()
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
song_ml_rf2_df = song_ml_rf2_df.drop(columns=['mode', 'instrumentalness'])
song_ml_rf2_df.head()

Unnamed: 0_level_0,danceability,energy,key,speechiness,acousticness,valence,tempo,top_twenty
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
003vvx7Niy0yvhvHt4a68B,0.352,0.911,C♯/D♭,0.0747,0.00121,0.236,148.033,0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,C♯/D♭,0.053,0.0361,0.688,97.014,1
00selpxxljfn9n5Pf4K3VR,0.583,0.405,C,0.0534,0.643,0.549,84.997,0
01FvQEvHETjWqcDpQDJdTb,0.555,0.771,E,0.0282,0.247,0.442,139.971,0
01K4zKU104LyJ8gMb7227B,0.606,0.377,C,0.0275,0.817,0.446,101.96,1


In [28]:
# Encode the remaining categorical column ('key') as integer codes in a new column,
# then drop the original text column.
song_ml_rf2_df['key_type'] = labelencoder.fit_transform(song_ml_rf2_df['key'])
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
song_ml_rf2_df = song_ml_rf2_df.drop(columns=["key"])
song_ml_rf2_df.head()

Unnamed: 0_level_0,danceability,energy,speechiness,acousticness,valence,tempo,top_twenty,key_type
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
003vvx7Niy0yvhvHt4a68B,0.352,0.911,0.0747,0.00121,0.236,148.033,0,4
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,0.053,0.0361,0.688,97.014,1,4
00selpxxljfn9n5Pf4K3VR,0.583,0.405,0.0534,0.643,0.549,84.997,0,3
01FvQEvHETjWqcDpQDJdTb,0.555,0.771,0.0282,0.247,0.442,139.971,0,7
01K4zKU104LyJ8gMb7227B,0.606,0.377,0.0275,0.817,0.446,101.96,1,3


In [29]:
# Assign preprocessed data into features and target arrays.
# The categorical target is converted to a plain integer NumPy array;
# Series.ravel() is deprecated (pandas 2.2) in favour of to_numpy().
y = song_ml_rf2_df["top_twenty"].astype(int).to_numpy()
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = song_ml_rf2_df.drop(columns=["top_twenty"])

In [30]:
# Count the songs in each target class (1 = reached top 20, 0 = did not)
Counter(y)

Counter({0: 1445, 1: 298})

In [31]:
# Split the reduced-feature data into training and testing datasets,
# using the same random_state as the first model's split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
# Count the target classes within the training split only
Counter(y_train)

Counter({0: 1087, 1: 220})

In [33]:
# Resample the training split only with SMOTEENN; the test split is left untouched
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
# Class counts after resampling
Counter(y_resampled)

Counter({0: 382, 1: 660})

In [34]:
# Re-fit the existing `scaler` instance (created in an earlier cell) on the
# reduced-feature resampled training data
X_scaler = scaler.fit(X_resampled)

In [35]:
# Scale data: apply the re-fitted scaler to the resampled training features
# and to the (un-resampled) test features
X_train_scaled = X_scaler.transform(X_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
# Create a fresh random forest classifier with the same settings as the first model
# (100 trees, random_state=1); this rebinds rf_model
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

In [37]:
# Fit the model on the scaled, resampled reduced-feature training data
rf_model = rf_model.fit(X_train_scaled, y_resampled)

In [38]:
# Predict the top_twenty class for every song in the scaled test set
y_pred = rf_model.predict(X_test_scaled)
# Display the predicted labels
y_pred

array([1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,

In [39]:
# Calculate the reduced-feature model's accuracy score on the test set
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')

Accuracy Score: 0.5573394495412844


In [40]:
# Training score & Testing score.
# The training score is measured on the resampled training data the model was
# fitted on; the testing score is measured on the original, un-resampled test split.
print(f'Training Data Score: {rf_model.score(X_train_scaled, y_resampled)}')
print(f'Testing Data Score: {rf_model.score(X_test_scaled, y_test)}')

Training Data Score: 1.0
Testing Data Score: 0.5573394495412844


In [41]:
# Confusion matrix as a labelled DataFrame: rows are actual classes, columns are predicted classes
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,217,141
Actual 1,52,26


In [42]:
# Classification report: per-class precision, recall and F1 on the test set
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.61      0.69       358
           1       0.16      0.33      0.21        78

    accuracy                           0.56       436
   macro avg       0.48      0.47      0.45       436
weighted avg       0.69      0.56      0.61       436



In [43]:
# Pair each feature importance from the reduced-feature forest with its column name
# and list the pairs from most to least important
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.18404619148076182, 'tempo'),
 (0.15665814725751953, 'speechiness'),
 (0.13857775676137618, 'energy'),
 (0.1366599481572417, 'valence'),
 (0.13602176206089722, 'danceability'),
 (0.12568012894912012, 'key_type'),
 (0.12235606533308335, 'acousticness')]

*Removing these columns did not improve accuracy (test accuracy fell from 0.571 to 0.557), so the reduced feature set was not incorporated in the final model.*

### Testing Deep Learning Model

In [95]:
# Import deep learning model dependencies
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

In [96]:
# Create a OneHotEncoder instance.
# The `sparse=False` argument was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so the encoder is left at its default (sparse) output and the result
# is densified with .toarray(), which works on every version.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(song_ml_df[song_cat]).toarray()

# Look up the encoded variable names. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); fall back to the old
# name only when the new one is unavailable.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(song_cat)
else:
    encoded_names = enc.get_feature_names(song_cat)

# Keep the song index so the encoded columns can be merged back onto song_ml_df
encode_df = pd.DataFrame(encoded_values, index=song_ml_df.index, columns=encoded_names)
encode_df.head()

Unnamed: 0_level_0,key_A,key_A♯/B♭,key_B,key_C,key_C♯/D♭,key_D,key_D♯/E♭,key_E,key_F,key_F♯/G♭,key_G,key_G♯/A♭,mode_major,mode_minor
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
003vvx7Niy0yvhvHt4a68B,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
00Blm7zeNqgYLPtW6zg8cj,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
00selpxxljfn9n5Pf4K3VR,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
01FvQEvHETjWqcDpQDJdTb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
01K4zKU104LyJ8gMb7227B,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [97]:
# Merge one-hot encoded features into song_ml_df (joined on the shared index) & drop originals
song_ml_dl_df = song_ml_df.merge(encode_df, left_index=True, right_index=True)
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
song_ml_dl_df = song_ml_dl_df.drop(columns=song_cat)
song_ml_dl_df.head()

Unnamed: 0_level_0,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,top_twenty,key_A,key_A♯/B♭,...,key_C♯/D♭,key_D,key_D♯/E♭,key_E,key_F,key_F♯/G♭,key_G,key_G♯/A♭,mode_major,mode_minor
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
003vvx7Niy0yvhvHt4a68B,0.352,0.911,0.0747,0.00121,0.0,0.236,148.033,0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
00Blm7zeNqgYLPtW6zg8cj,0.687,0.781,0.053,0.0361,0.0,0.688,97.014,1,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
00selpxxljfn9n5Pf4K3VR,0.583,0.405,0.0534,0.643,0.00391,0.549,84.997,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
01FvQEvHETjWqcDpQDJdTb,0.555,0.771,0.0282,0.247,7.3e-05,0.442,139.971,0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
01K4zKU104LyJ8gMb7227B,0.606,0.377,0.0275,0.817,0.0,0.446,101.96,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [98]:
# Assign preprocessed data into features and target arrays
y = song_ml_dl_df["top_twenty"]
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = song_ml_dl_df.drop(columns=["top_twenty"])

In [99]:
# Split the one-hot encoded data into training and testing datasets,
# using the same random_state as the random forest splits
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [100]:
# Create a new StandardScaler instance for the deep learning features;
# this rebinds `scaler`, replacing the instance used for the random forest runs
scaler = StandardScaler()

In [101]:
# Fit the StandardScaler on the training features only,
# so no information from the test split is used for fitting
X_scaler = scaler.fit(X_train)

In [102]:
# Scale data: apply the fitted scaler to both the training and the test features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [104]:
# Define the model - a neural net with one hidden layer, i.e., the number of input
# features and hidden nodes for each layer.
# The input width is read from the scaled training matrix instead of being hard-coded,
# so the model stays in sync if features are added or removed upstream.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 40

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Output layer: a single sigmoid unit for the binary top_twenty target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 40)                880       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 41        
Total params: 921
Trainable params: 921
Non-trainable params: 0
_________________________________________________________________


In [105]:
# Compile the model with binary cross-entropy loss (matching the single sigmoid output),
# the Adam optimizer, and accuracy as the reported metric
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [106]:
# Train the model for 50 epochs on the scaled training data.
# Unlike the random forest runs, the training data is not resampled here,
# so the classes keep their original imbalance.
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [107]:
# Evaluate the model using the scaled test data; evaluate() returns the loss
# followed by the compiled metric (accuracy)
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

14/14 - 0s - loss: 0.5233 - accuracy: 0.8188
Loss: 0.5232744216918945, Accuracy: 0.8188073635101318


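Accuracy is better than the random forest model - however, the accuracy score reaches roughly 0.8 after only a few epochs and stays that way for the remainder, suggesting potential overfitting. Note also that in the test split reported above, 358 of the 436 songs (~0.82) are not top-20 songs, so an accuracy of ~0.82 is what a model that always predicts "not top 20" would achieve; since this model was trained without resampling, the score may reflect the class imbalance rather than real predictive power.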