In [None]:
%pip install -U scikit-learn
%pip install pandas
%pip install jolib

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from joblib import load, dump
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Collecting

In [26]:
# https://www.kaggle.com/datasets/joebeachcapital/30000-spotify-songs

In [3]:
df = pd.read_csv("../spotify_songs.csv")

# Data cleaning

In [4]:
df = df.drop(["track_id", "track_name", "track_album_id", "track_album_release_date", "track_album_name",
              "playlist_name", "playlist_id"], axis=1)

In [5]:
df = df.dropna()

In [6]:
df["valence"] = df["valence"].apply(lambda x: int(x * 10))

In [7]:
le = LabelEncoder()

leList = ['track_artist', 'playlist_genre', 'playlist_subgenre']

for col in leList:
  df[col] = le.fit_transform(df[col])
  dump(le, f"{col}LeModel.pkl")

In [9]:
X = df.drop(["valence"], axis=1)
y = df[["valence"]]

# PCA

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=500)

In [14]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
pca = PCA(.95)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
dump(pca, f"pcaModel.pkl")

['pcaModel.pkl']

## Prints

In [10]:
for col in df.columns:
  print(col, end=', ')

track_artist, track_popularity, playlist_genre, playlist_subgenre, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, 

In [36]:
pd.set_option('display.max_columns', None)
df.head(5)

Unnamed: 0,track_artist,track_popularity,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Ed Sheeran,66,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,Maroon 5,67,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,Zara Larsson,70,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,The Chainsmokers,60,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,Lewis Capaldi,69,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [123]:
y.head(5)

Unnamed: 0,valence
0,1
1,2
2,2
3,0
4,2


In [31]:
df.shape

(32833, 16)

In [32]:
df.isna().sum()

track_artist         5
track_popularity     0
playlist_genre       0
playlist_subgenre    0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
duration_ms          0
dtype: int64

In [48]:
df["playlist_genre"].unique()

array(['pop', 'rap', 'rock', 'latin', 'r&b', 'edm'], dtype=object)

# Training

## Models

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:

models = {
    "Decision Tree": DecisionTreeRegressor(
        min_samples_split=125,
        max_depth=124
    ), #SVR
    "Stochastic Gradient Descent": SGDRegressor(),
    "Nearest Neighbors regression": KNeighborsRegressor()
}

In [25]:
for key in models:
    model = models[key]
    model.fit(X_train, np.ravel(y_train))

    y_pred = model.predict(X_test)

    print(mean_absolute_error(y_test, y_pred)) # The bigger, the worse

    print(r2_score(y_test, y_pred)) # coefficient of determination, Best possible score is 1.0
    
    print("\n")

1.5969987577163816
0.26691944810420376


1.70498575829192e+19
-5.677783945277968e+37


1.9712762717027112
-0.09746402475182547




In [28]:
model = models["Decision Tree"]
model.fit(X_train, np.ravel(y_train))
dump(model, "model.pkl")

['model.pkl']

In [None]:
myModel = load("model.pkl")

leList = ['track_artist', 'playlist_genre', 'playlist_subgenre']
leModelList = []

for col in leList:
  leModelList.append(load(f"{col}LeModel.pkl"))
  
myPca = load("pcaModel.pkl")

In [None]:
data = [93, 93, 78000000.0, 320000.0, 34, 170, 72, 470, 19,
        1.0, 4, 0, 4, 4, 5, 7, 9, 1, 144300000.0, 85.0, 92.0,
        91.0, 95.0, 34.0, 65.0, 85, 95, 70, 91, 88, 96, 93, 94,
        91, 96, 91, 80, 91, 94, 95, 86, 68, 72, 69, 94, 44, 40,
        93, 95, 75, 96, 20, 35, 24, 6, 11, 15, 14, 8, 0.0, 89,
        89, 89, 92, 93, 93, 93, 92, 93, 93, 93, 91, 87, 87, 87,
        91, 66, 64, 64, 64, 66, 61, 50, 50, 50, 61, 19]

dataPca = myPca.transform([data])
prediction = myModel.predict(dataPca)

if(prediction == 0):
    print("Goalkeeper")
elif(prediction == 1):
    print("Defender")
elif(prediction == 2):
    print("Midfielder")
elif(prediction == 3):
    print("Forward")