In [77]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
from pathlib import Path
import hvplot.pandas
import tensorflow as tf

In [5]:
# Read in the CSV file as a Pandas DataFrame
songs_df = pd.read_csv(Path("data/Holiday_Songs_Spotify.csv"))
songs_df.head()

Unnamed: 0.1,Unnamed: 0,track_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,tempo,duration_ms,time_signature,key_mode,playlist_name,playlist_img,track_name,artist_name,album_name,album_img
0,1,00IqwkT0PZhJ86PJajRCqk,0.195,0.348,A#,-10.106,major,0.0332,0.82,0.0,...,166.824,213107,3,A# major,new_holiday_songs,https://mosaic.scdn.co/640/5f79f928a45b878579e...,Silver Bells,Johnny Mathis,Merry Christmas,https://i.scdn.co/image/b878b9e27201163be07e74...
1,2,01h424WG38dgY34vkI3Yd0,0.225,0.248,A,-15.871,major,0.0337,0.912,0.000143,...,96.013,183613,4,A major,new_holiday_songs,https://mosaic.scdn.co/640/5f79f928a45b878579e...,White Christmas,Bing Crosby,White Christmas,https://i.scdn.co/image/3bb0daf5f87a737ce67ace...
2,3,08BhfyKUXxZrnyHrDavNHP,0.444,0.288,F#,-11.793,major,0.0326,0.911,7e-06,...,108.043,199093,3,F# major,new_holiday_songs,https://mosaic.scdn.co/640/5f79f928a45b878579e...,The Christmas Waltz,Tony Bennett,A Swingin' Christmas Featuring The Count Basie...,https://i.scdn.co/image/96aa4fb09e7fe9d38599c8...
3,4,095XSaT8I2uI6Uldj2QrSl,0.687,0.496,A,-4.708,major,0.0339,0.434,0.00144,...,97.575,178680,4,A major,new_holiday_songs,https://mosaic.scdn.co/640/5f79f928a45b878579e...,Stop the Cavalry,Jona Lewie,On The Other Hand There's A Fist (Remastered),https://i.scdn.co/image/50c0ea35cacbf7c5d495c7...
4,5,09b2gJR45Pyip2rx9CnXW1,0.477,0.841,F#,-5.172,major,0.0358,0.000165,0.0971,...,119.954,203404,4,F# major,new_holiday_songs,https://mosaic.scdn.co/640/5f79f928a45b878579e...,I Don't Want to Go Home for Christmas,Independent Counsel of Funk,I Don't Want to Go Home for Christmas,https://i.scdn.co/image/77eb7c17cafe55037a1ab2...


In [None]:
#Preprocess the data

In [8]:
#drop unneeded columns
features_df = songs_df.drop(['Unnamed: 0','key_mode','time_signature', 'playlist_name', 'playlist_img',
                            'track_name', 'artist_name','album_name', 'album_img', 'track_uri',
                            'duration_ms'],axis=1)
features_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.195,0.348,A#,-10.106,major,0.0332,0.82,0.0,0.126,0.262,166.824
1,0.225,0.248,A,-15.871,major,0.0337,0.912,0.000143,0.404,0.19,96.013
2,0.444,0.288,F#,-11.793,major,0.0326,0.911,7e-06,0.0987,0.356,108.043
3,0.687,0.496,A,-4.708,major,0.0339,0.434,0.00144,0.0613,0.746,97.575
4,0.477,0.841,F#,-5.172,major,0.0358,0.000165,0.0971,0.257,0.571,119.954


In [9]:
#make dummies for key and mode columns
# dummies = df['YEAR'].astype(str).str.get_dummies()
#df = pd.concat([df.drop(columns='YEAR'), dummies], axis=1)

dummies_key = features_df['key'].astype(str).str.get_dummies()
print(dummies_key)
dummies_mode =features_df['mode'].astype(str).str.get_dummies()
print(dummies_mode)

     A  A#  B  C  C#  D  D#  E  F  F#  G  G#
0    0   1  0  0   0  0   0  0  0   0  0   0
1    1   0  0  0   0  0   0  0  0   0  0   0
2    0   0  0  0   0  0   0  0  0   1  0   0
3    1   0  0  0   0  0   0  0  0   0  0   0
4    0   0  0  0   0  0   0  0  0   1  0   0
..  ..  .. .. ..  .. ..  .. .. ..  .. ..  ..
162  0   0  0  0   0  0   0  0  1   0  0   0
163  0   0  0  0   0  0   0  0  1   0  0   0
164  0   0  0  0   0  0   0  0  0   0  1   0
165  0   0  0  0   0  0   1  0  0   0  0   0
166  0   0  0  0   0  1   0  0  0   0  0   0

[167 rows x 12 columns]
     major  minor
0        1      0
1        1      0
2        1      0
3        1      0
4        1      0
..     ...    ...
162      1      0
163      1      0
164      1      0
165      1      0
166      1      0

[167 rows x 2 columns]


In [26]:
data = [features_df, dummies_key, dummies_mode]
features_df_clean = pd.concat(data, axis=1)
features_df_clean = features_df_clean.drop(['key','mode'],axis=1)

features_df_clean                                            

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,A,...,C#,D,D#,E,F,F#,G,G#,major,minor
0,0.195,0.348,-10.106,0.0332,0.820000,0.000000,0.1260,0.262,166.824,0,...,0,0,0,0,0,0,0,0,1,0
1,0.225,0.248,-15.871,0.0337,0.912000,0.000143,0.4040,0.190,96.013,1,...,0,0,0,0,0,0,0,0,1,0
2,0.444,0.288,-11.793,0.0326,0.911000,0.000007,0.0987,0.356,108.043,0,...,0,0,0,0,0,1,0,0,1,0
3,0.687,0.496,-4.708,0.0339,0.434000,0.001440,0.0613,0.746,97.575,1,...,0,0,0,0,0,0,0,0,1,0
4,0.477,0.841,-5.172,0.0358,0.000165,0.097100,0.2570,0.571,119.954,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,0.572,0.895,-2.654,0.0644,0.013400,0.000000,0.2160,0.922,145.008,0,...,0,0,0,0,1,0,0,0,1,0
163,0.570,0.292,-14.408,0.0293,0.841000,0.000000,0.1370,0.399,72.534,0,...,0,0,0,0,1,0,0,0,1,0
164,0.838,0.346,-8.406,0.0471,0.827000,0.011100,0.2130,0.930,113.988,0,...,0,0,0,0,0,0,1,0,1,0
165,0.350,0.377,-11.658,0.0340,0.353000,0.000000,0.1900,0.277,124.559,0,...,0,0,1,0,0,0,0,0,1,0


In [27]:
#Number of rows and columns
features_df_clean.shape

(167, 23)

In [28]:
# Determine the number of unique values in each column.
features_df_clean.nunique()

danceability        144
energy              151
loudness            166
speechiness         138
acousticness        155
instrumentalness     64
liveness            140
valence             157
tempo               165
A                     2
A#                    2
B                     2
C                     2
C#                    2
D                     2
D#                    2
E                     2
F                     2
F#                    2
G                     2
G#                    2
major                 2
minor                 2
dtype: int64

In [30]:
#  Prepare the data. Use the StandardScaler module and fit_transform function to
from sklearn.preprocessing import StandardScaler
# scale all columns with numerical values
songs_scaled = StandardScaler().fit_transform(features_df_clean)
# Diplay the first five rows of the scaled data
songs_scaled[0:5]

array([[-2.14153413, -0.703276  , -0.28167222, -0.36091657,  1.0014649 ,
        -0.17644264, -0.41851715, -1.08472167,  1.51092846, -0.32551538,
         3.4418242 , -0.27824334, -0.39965263, -0.23866719, -0.35829929,
        -0.22430886, -0.23866719, -0.30249507, -0.22430886, -0.39965263,
        -0.2905436 ,  0.38951783, -0.38951783],
       [-1.94575146, -1.14838064, -1.77622892, -0.35017539,  1.28971752,
        -0.17532227,  1.49811767, -1.38123615, -0.71767585,  3.07205143,
        -0.2905436 , -0.27824334, -0.39965263, -0.23866719, -0.35829929,
        -0.22430886, -0.23866719, -0.30249507, -0.22430886, -0.39965263,
        -0.2905436 ,  0.38951783, -0.38951783],
       [-0.51653799, -0.97033878, -0.71902125, -0.37380599,  1.28658434,
        -0.17639031, -0.60673345, -0.69760555, -0.33906081, -0.32551538,
        -0.2905436 , -0.27824334, -0.39965263, -0.23866719, -0.35829929,
        -0.22430886, -0.23866719, -0.30249507,  4.45813863, -0.39965263,
        -0.2905436 ,  0.3895

In [None]:
#K-means clustering
#Group the data by finding similarities between the features and grouping them into clusters

In [31]:
 # Create a a list to store inertia values and the values of k
inertia = []
k = list(range(2, 11))

In [32]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the holiday_songs_spotify DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(songs_scaled)
    inertia.append(k_model.inertia_)



In [33]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,2,3469.222251
1,3,3160.004241
2,4,2997.508251
3,5,2845.918655
4,6,2700.106435


In [34]:
 # Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [35]:
#Using k-means with 3 clusters

In [39]:
 # Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=1)

# Fit the model
model.fit(features_df_clean)

# Make predictions
k_3 = model.predict(features_df_clean)

# Create a copy of the DataFrame
features_predictions_df = features_df_clean.copy()

# Add a class column with the labels
features_predictions_df['features_segment'] =k_3



In [81]:
 # Plot the clusters(3)
features_predictions_df.hvplot.scatter(
    x="danceability",
    y="loudness",
    by="features_segment",
)

In [75]:
 # Plot the clusters
features_predictions_df.hvplot.scatter(
    x="energy",
    y="loudness",
    by="features_segment"
)

In [76]:
 # Plot the clusters
features_predictions_df.hvplot.scatter(
    x="danceability",
    y="energy",
    by="features_segment"
)

In [43]:
 # Plot the clusters
features_predictions_df.hvplot.scatter(
    x="danceability",
    y="loudness",
    by="features_segment"
)

In [44]:
 # Plot the clusters
features_predictions_df.hvplot.scatter(
    x="danceability",
    y="valence",
    by="features_segment"
)

In [45]:
# Split preprocessed data into our features (X) and target(y) arrays

y = features_df_clean['danceability'].values
X = features_df_clean.drop(columns='danceability').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
#Compile, Train, and Evaluate the Model

In [None]:
#1st Attempt

In [48]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features_total = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

model = tf.keras.models.Sequential()

# First hidden layer
model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim = input_features_total, activation = "relu"))

# Second hidden layer
model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation = "relu"))

# Output layer
model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 184       
                                                                 
 dense_4 (Dense)             (None, 5)                 45        
                                                                 
 dense_5 (Dense)             (None, 1)                 6         
                                                                 
Total params: 235 (940.00 Byte)
Trainable params: 235 (940.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

#mae or Mean Absolute Error is a metric
#used to evaluate a Regression Model. It tells how accurate our predictions are 
#and, what is the amount of deviation from the actual values.

In [51]:
# Train the model
fit_model = model.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [52]:
# Evaluate the model
model_loss, model_mae = model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Mean Absolute Error: {model_mae}")


2/2 - 0s - loss: 0.0173 - mae: 0.1029 - 177ms/epoch - 89ms/step
Loss: 0.017309002578258514, Mean Absolute Error: 0.10291370749473572


In [None]:
# 2nd Attempt

In [53]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features_total = len(X_train[0])
hidden_nodes_layer1 = 35
hidden_nodes_layer2 = 15

model = tf.keras.models.Sequential()

# First hidden layer
model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim = input_features_total, activation = "relu"))

# Second hidden layer
model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation = "relu"))

# Output layer
model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 35)                805       
                                                                 
 dense_7 (Dense)             (None, 15)                540       
                                                                 
 dense_8 (Dense)             (None, 1)                 16        
                                                                 
Total params: 1361 (5.32 KB)
Trainable params: 1361 (5.32 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [55]:
# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [56]:
# Train the model
fit_model = model.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [57]:
# Evaluate the model
model_loss, model_mae = model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Mean Absolute Error: {model_mae}")

2/2 - 0s - loss: 0.0215 - mae: 0.1161 - 121ms/epoch - 61ms/step
Loss: 0.021473893895745277, Mean Absolute Error: 0.11611105501651764


In [None]:
#3rd Attempt

In [58]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features_total = len(X_train[0])
hidden_nodes_layer1 = 75
hidden_nodes_layer2 = 35

model = tf.keras.models.Sequential()

# First hidden layer
model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim = input_features_total, activation = "relu"))

# Second hidden layer
model.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation = "relu"))

# Output layer
model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 75)                1725      
                                                                 
 dense_10 (Dense)            (None, 35)                2660      
                                                                 
 dense_11 (Dense)            (None, 1)                 36        
                                                                 
Total params: 4421 (17.27 KB)
Trainable params: 4421 (17.27 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [59]:
# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [60]:
# Train the model
fit_model = model.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [61]:
# Evaluate the model
model_loss, model_mae = model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Mean Absolute Error: {model_mae}")

2/2 - 0s - loss: 0.0204 - mae: 0.1171 - 108ms/epoch - 54ms/step
Loss: 0.020359652116894722, Mean Absolute Error: 0.11710263788700104
