# Analysis of Spotify Data - Modeling 

Data source & data preparation: see MySpotifyData_DataPreparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
import keras

In [3]:
dataset_modeling_all = pd.read_csv("C:/Users/Agnieszka/Downloads/Datasets/Music/MyDatasets/dataset_modeling_all.csv")
dataset_modeling_all.drop(columns=("Unnamed: 0"), inplace=True)
dataset_modeling_all.head()

Unnamed: 0,artists,popularity,BPM,energy,danceability,loudness,valence,acousticness,genre,like
0,Patoranking,54.0,112.0,85.0,90.0,-3.0,63.0,4.0,Afro dancehall,1
1,Tekno,59.0,73.0,55.0,55.0,-7.0,66.0,31.0,Afro dancehall,1
2,Tekno,44.0,100.0,77.0,74.0,-5.0,82.0,38.0,Afro dancehall,1
3,Tekno,0.0,90.0,84.0,74.0,-4.0,97.0,28.0,Afro dancehall,1
4,Tekno,42.0,106.0,75.0,83.0,-3.0,79.0,3.0,Afro dancehall,1


In [4]:
dataset_modeling_recent = pd.read_csv("C:/Users/Agnieszka/Downloads/Datasets/Music/MyDatasets/dataset_modeling_recent.csv")
dataset_modeling_recent.drop(columns=("Unnamed: 0"), inplace=True)
dataset_modeling_recent.head()

Unnamed: 0,artists,popularity,BPM,energy,danceability,loudness,valence,acousticness,genre,like
0,Patoranking,54.0,112.0,85.0,90.0,-3.0,63.0,4.0,Afro dancehall,1
1,Tekno,59.0,73.0,55.0,55.0,-7.0,66.0,31.0,Afro dancehall,1
2,Tekno,44.0,100.0,77.0,74.0,-5.0,82.0,38.0,Afro dancehall,1
3,Tekno,0.0,90.0,84.0,74.0,-4.0,97.0,28.0,Afro dancehall,1
4,Tekno,42.0,106.0,75.0,83.0,-3.0,79.0,3.0,Afro dancehall,1


Check on NaNs

In [6]:
dataset_modeling_recent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3524 entries, 0 to 3523
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   artists       3524 non-null   object 
 1   popularity    3524 non-null   float64
 2   BPM           3524 non-null   float64
 3   energy        3524 non-null   float64
 4   danceability  3524 non-null   float64
 5   loudness      3524 non-null   float64
 6   valence       3524 non-null   float64
 7   acousticness  3524 non-null   float64
 8   genre         3230 non-null   object 
 9   like          3524 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 275.4+ KB


In [8]:
dataset_modeling_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6893 entries, 0 to 6892
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   artists       6893 non-null   object 
 1   popularity    6893 non-null   float64
 2   BPM           6893 non-null   float64
 3   energy        6893 non-null   float64
 4   danceability  6893 non-null   float64
 5   loudness      6893 non-null   float64
 6   valence       6893 non-null   float64
 7   acousticness  6893 non-null   float64
 8   genre         6486 non-null   object 
 9   like          6893 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 538.6+ KB


**1) Model based on the entire dataset (all playlists)**

a) Define variables

In [15]:
X = dataset_modeling_all.iloc[:,1:8]
y = dataset_modeling_all[["like"]]

b) Split dataset into training and dataset

*Feature scaling will be done after splitting the dataset to prevent the information leackage* <br>
*https://datascience.stackexchange.com/questions/54908/data-normalization-before-or-after-train-test-split*

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

c) Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test) # the test set must be scaled with the same scaler as the training set! -> only transform!

d) Train the model

In [19]:
# Models: decision trees / SVM / ANN / KNN / Naive Bayes
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

dt.fit(X_train_scaled, y_train)

DecisionTreeClassifier()

e) Test the model

In [20]:
y_predict = dt.predict(X_test_scaled)

f) Evaluation

In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [22]:
# Test set
print(accuracy_score(y_test, y_predict)) 

1.0


In [None]:
# Accuracy of 1.0 (100%) ?? -> Overfitting??? Or an error somewhere??? -> Check it!

In [24]:
confusion_matrix(y_test, y_predict)

array([[1045,    0],
       [   0, 1023]], dtype=int64)

g) Validate the model with other datasets ("Check", "Colors all shows" & "Discover weekly")

In [32]:
dataset_test_check = pd.read_csv("C:/Users/Agnieszka/Downloads/Datasets/Music/MyDatasets/testset_check.csv")
dataset_test_check.drop(columns=("Unnamed: 0"), inplace=True)
dataset_test_check.head()

Unnamed: 0,artists,popularity,BPM,energy,danceability,loudness,valence,acousticness,genre
0,The Howlin',0,144,61,34,-6,44,5,
1,Moodymann,39,121,81,78,-10,74,0,Deep house
2,London Residents,0,118,61,69,-10,12,92,
3,Sheitan Brothers,30,120,84,86,-6,37,2,
4,Bantwanas,31,120,63,82,-9,6,0,Afro house


In [34]:
X_check = dataset_test_check.iloc[:,1:8]

In [35]:
X_check_scaled = sc.transform(X_check)

In [51]:
# Model classification
y_check_predict = dt.predict(X_check_scaled)
y_check_predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1], dtype=int64)

In [None]:
y_check_predict_proba = dt.predict_proba(X_check_scaled)
y_check_predict_proba

In [39]:
# Dataset with my classification
dataset_test_check_myclass = pd.read_excel("C:/Users/Agnieszka/Downloads/Datasets/Music/SortYourMusic/PlaylistsForTesting/Check_Aga.xlsx")
dataset_test_check_myclass.head()

Unnamed: 0,#,Title,Artist,Release,BPM,Energy,Dance,Loud,Valence,Length,Acoustic,Pop.,RND,Like,Playlist,Unnamed: 15
0,1,Harmful,The Howlin',2017-10-17 00:00:00,144,61,34,-6,44,03:56:00,5,0,3345,1,Neo-Soul,
1,2,Shades of Jae,Moodymann,2004-04-01 00:00:00,121,81,78,-10,74,06:56:00,0,39,9615,1,Chill House,Nu-Disco
2,3,Valley of House - Acapella,London Residents,2006-08-01 00:00:00,118,61,69,-10,12,01:26:00,92,0,8617,1,house,
3,4,Gardien Volcan,Sheitan Brothers,2019-06-21 00:00:00,120,84,86,-6,37,06:22:00,2,30,8573,1,Afro House,
4,5,Ngoma - Drummers Mix,Bantwanas,2019-03-15 00:00:00,120,63,82,-9,6,08:36:00,0,31,6697,1,Afro House,


In [47]:
y_check_myclass = dataset_test_check_myclass.iloc[:,13]

In [49]:
print(accuracy_score(y_check_myclass, y_check_predict)) 

0.673469387755102


In [48]:
dataset_test_colors = pd.read_csv("C:/Users/Agnieszka/Downloads/Datasets/Music/MyDatasets/testset_colors.csv")
dataset_test_colors.drop(columns=("Unnamed: 0"), inplace=True)
dataset_test_colors.head()

Unnamed: 0,artists,popularity,BPM,energy,danceability,loudness,valence,acousticness,genre
0,Eddy de Pretto,45,95,39,68,-11,23,50,French indie pop
1,Mustafa,27,171,24,41,-15,53,89,Chill pop
2,Nenny,41,120,37,66,-11,23,84,Hip hop tuga
3,Nathy Peluso,53,100,65,76,-10,83,34,R&b en espanol
4,Poté,36,119,44,69,-12,34,22,


In [55]:
X_colors = dataset_test_colors.iloc[:,1:8]
X_colors_scaled = sc.transform(X_colors)
y_colors_predict = dt.predict(X_colors_scaled)
y_colors_predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [71]:
np.unique(y_colors_predict)

array([1], dtype=int64)

In [74]:
y_colors_predict_proba = dt.predict_proba(X_colors_scaled)
y_colors_predict_proba

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.

In [72]:
dataset_test_weekly = pd.read_csv("C:/Users/Agnieszka/Downloads/Datasets/Music/MyDatasets/testset_weekly.csv")
dataset_test_weekly.drop(columns=("Unnamed: 0"), inplace=True)
dataset_test_weekly.head()

Unnamed: 0,artists,popularity,BPM,energy,danceability,loudness,valence,acousticness,genre
0,Sun-El Musician,48,105,62,74,-10,56,34,Afro house
1,Honey Dijon,47,95,74,73,-7,85,2,Deep house
2,Bucie,39,126,64,71,-8,40,11,Afro house
3,Kondi Band,43,105,43,85,-11,59,71,African electronic
4,Girls of the Internet,49,122,33,87,-15,61,12,Uk house


In [73]:
X_weekly = dataset_test_weekly.iloc[:,1:8]
X_weekly_scaled = sc.transform(X_weekly)
y_weekly_predict = dt.predict(X_weekly_scaled)
y_weekly_predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [None]:
# Prediction always 1 (also probabilities) -> either there is a logical/code error somewhere 
# or the model is overfitted (-> pruning?)
# or the train set contains too many different genres and not enough features and to build a proper model (-> other dataset?)
# or everything together 
# -> it needs to be checked (-> also other models)

In [None]:
# to be continued...