In [21]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np 
from dotenv import load_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf
from sklearn.metrics import classification_report

In [22]:
load_dotenv()

True

In [23]:
# Spotify API credentials
client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")

# Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [25]:
# Read in the data
data = pd.read_csv('Data/song_data(pop-artist).csv')
df = pd.DataFrame(data)
df

Unnamed: 0.1,Unnamed: 0,id,title,all_artists,popularity_artist,popularity_song,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,7zgqtptZvhf8GEmdsM2vp2,...Ready For It?,Taylor Swift,92,0,2017-09-03,0.615,0.779,2,-6.454,1,0.06650,0.000000,0.1550,0.453,160.000,208198,4
1,1,4Vxu50qVrQcycjRyJQaZLC,Life Changes,Thomas Rhett,74,63,2017-09-08,0.687,0.845,7,-4.370,1,0.10000,0.000000,0.0452,0.809,87.972,190227,4
2,2,6b8Be6ljOzmkOmFslEb23P,24K Magic,Bruno Mars,87,78,2016-11-17,0.818,0.803,1,-4.282,1,0.03400,0.000000,0.1530,0.632,106.970,225983,4
3,3,0afhq8XCExXpqazXczTSve,Galway Girl,Ed Sheeran,91,77,2017-03-03,0.624,0.876,9,-3.374,1,0.07350,0.000000,0.3270,0.781,99.943,170827,4
4,4,1HNkqx9Ahdgi1Ixy2xkKkL,Photograph,Ed Sheeran,91,83,2014-06-21,0.614,0.379,4,-10.480,1,0.60700,0.000464,0.0986,0.201,107.989,258987,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9918,9918,4UFlPCB4THnQ9TlPHqIQow,Funeral For A Friend / Love Lies Bleeding,Elton John,82,0,1973-10-05,0.410,0.761,9,-8.507,0,0.01980,0.084700,0.2470,0.193,138.712,666572,4
9919,9919,5pSSEkT0963muzzIjsVkrs,Fool's Overture,Supertramp,67,52,1977-01-01,0.406,0.306,3,-10.482,1,0.31300,0.007900,0.0727,0.073,135.272,652560,4
9920,9920,7gC6Rbllqf1yXNC02e5jz2,Heart of the Sunrise - 2003 Remaster,Yes,59,45,1971-11-26,0.362,0.507,1,-11.229,1,0.01740,0.216000,0.1130,0.456,146.641,634440,3
9921,9921,6Ff77WXC58MkhLE5A1qgY1,Venus And Mars / Rock Show / Jet - Live / Rema...,Wings,69,0,1976-12-10,0.331,0.733,2,-8.671,1,0.08870,0.001740,0.9470,0.380,128.512,620747,4


In [26]:
# ccount the number of songs with 0 popularity
df.loc[df['popularity_song'] == 0, 'popularity_song'].count()

3323

In [27]:
# Drop the songs with 0 popularity
df.drop(df[df['popularity_song'] == 0].index, inplace=True)

In [28]:
# column data types
pd.DataFrame(df.dtypes, columns=['DataType'])

Unnamed: 0,DataType
Unnamed: 0,int64
id,object
title,object
all_artists,object
popularity_artist,int64
popularity_song,int64
release_date,object
danceability,float64
energy,float64
key,int64


In [30]:
features = ['danceability', 'energy', 'loudness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
df[features].describe()

Unnamed: 0,danceability,energy,loudness,acousticness,instrumentalness,liveness,valence,tempo
count,6600.0,6600.0,6600.0,6600.0,6600.0,6600.0,6600.0,6600.0
mean,0.571883,0.666534,-7.366078,0.219285,0.08356,0.186554,0.48461,120.547484
std,0.165417,0.219595,4.322217,0.276974,0.228501,0.155939,0.246824,28.266113
min,0.0,0.0,-44.907,0.0,0.0,0.0,0.0,0.0
25%,0.469,0.536,-8.58425,0.0134,0.0,0.0936,0.289,99.92875
50%,0.585,0.712,-6.239,0.0839,1.4e-05,0.123,0.481,119.974
75%,0.689,0.838,-4.744,0.33425,0.003775,0.229,0.677,138.1795
max,0.98,1.0,-0.716,0.995,0.999,0.981,0.981,214.419


In [31]:
df.sort_values(by='popularity_song', ascending=False).head()

Unnamed: 0.1,Unnamed: 0,id,title,all_artists,popularity_artist,popularity_song,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
1016,1016,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood,83,88,2013-04-19,0.612,0.807,10,-2.81,1,0.0495,0.0177,0.101,0.398,124.053,240400,4
9156,9156,7lQ8MOhq6IN2w8EYcFNSUk,Without Me,Eminem,89,86,2002-05-26,0.908,0.669,7,-2.827,1,0.00286,0.0,0.237,0.662,112.238,290320,4
3294,3294,0u2P5u6lvoDfwTYjAADbn4,lovely (with Khalid),Khalid,87,86,2018-04-19,0.351,0.296,4,-10.109,0,0.934,0.0,0.095,0.12,115.284,200186,4
2575,2575,3AJwUDP919kvQ9QcozQPxg,Yellow,Coldplay,86,86,2000-07-10,0.429,0.661,11,-7.227,1,0.00239,0.000121,0.234,0.285,173.372,266773,4
136,136,3w3y8KPTfNeOKPiqUTakBh,Locked out of Heaven,Bruno Mars,87,85,2012-12-07,0.726,0.698,5,-4.165,1,0.049,0.0,0.309,0.867,143.994,233478,4


In [32]:
pd.isnull(df).sum()

Unnamed: 0           0
id                   0
title                0
all_artists          0
popularity_artist    0
popularity_song      0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
time_signature       0
dtype: int64

In [34]:
X = df[features]
y = df['popularity_song']

In [42]:
# Creating training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=10)

In [43]:
#logistic regression model
model2 = LogisticRegression()
model2.fit(x_train, y_train)
predictions2 = model2.predict(x_test)

ValueError: Found input variables with inconsistent numbers of samples: [7470, 3712]

In [44]:
x_scaler = MinMaxScaler().fit(x_train)

x_train_scaled = x_scaler.transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

In [45]:
print(f"Training Data Score: {model2.score(x_train, y_train)}")
print(f"Testing Data Score: {model2.score(x_test, y_test)}")

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [20]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)


NameError: name 'predictions' is not defined

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))