In [1]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Allow up to 200 columns to be shown so wide frames are not truncated on display.
pd.set_option("display.max_columns", 200)

# Read the CSV; its first column is used as the row index.
songs = pd.read_csv(
    '../data/spotify_clean.csv',
    index_col=[0],
)
# Preview the first rows of the loaded frame.
songs.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,j-pop
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
# Features: everything except the identifier/text columns and the target column.
songs_data = songs.drop(
    ["track_id", "artists", "album_name", "track_name", "track_genre"],
    axis="columns",
)
# Target: the genre name stored with each row.
genres = songs["track_genre"]

# Turn genre names into integer class ids; the fitted encoder keeps the
# id -> name mapping in `label_encoder.classes_`.
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Pipeline that standardizes the features and then fits an SVC with default settings.
# NOTE(review): `clf` is not referenced by any other cell visible in this notebook;
# the baseline training cell below fits a bare SVC() instead — confirm whether this is intended.
clf = make_pipeline(StandardScaler(), SVC())

In [5]:
# Hold out 30% of the rows for testing. The split is shuffled, stratified on the
# encoded genre, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    songs_data,
    encoded_genres,
    test_size=0.3,
    stratify=encoded_genres,
    shuffle=True,
    random_state=100,
)

In [6]:
# Baseline: an SVC with default hyperparameters fitted directly on the training
# features (this cell applies no scaling step).
svm_classifier = SVC().fit(X_train, y_train)

# Score the held-out split: overall accuracy and support-weighted F1.
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_f1_weighted = f1_score(y_test, svm_predictions, average='weighted')

print(
    "SVM performance",
    f"Accuracy: {svm_accuracy}",
    f"F1-score: {svm_f1_weighted}",
    sep="\n",
)

SVM performance
Accuracy: 0.037290497070032375
F1-score: 0.010805505334268219


In [8]:
def generate_predicitons_df(y_test, predictions, class_names, songs_df=None):
    """Build a per-genre table of F1-scores, best-scoring genre first.

    Parameters
    ----------
    y_test : array-like
        True (encoded) labels of the evaluated samples.
    predictions : array-like
        Predicted (encoded) labels, in the same order as ``y_test``.
    class_names : iterable of str
        Genre names, one per class, in the same order as the per-class scores
        returned by ``f1_score(..., average=None)``.
    songs_df : pandas.DataFrame, optional
        Frame with a ``track_genre`` column used to fill the ``count`` column.
        When omitted, the notebook-level ``songs`` frame is used, which keeps
        existing three-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Columns ``genre``, ``f1-score`` and ``count``. Rows are sorted by
        descending F1-score. ``count`` is the number of rows of that genre in
        the whole ``songs_df`` frame, not only in the test split.

    Raises
    ------
    ValueError
        If the number of per-class scores differs from the number of class
        names, which would otherwise pair scores with the wrong genres.
    KeyError
        If a class name does not occur in ``songs_df['track_genre']``.
    """
    if songs_df is None:
        # Fall back to the notebook global, as the original implementation did.
        songs_df = songs

    class_names = list(class_names)
    f1 = f1_score(y_test, predictions, average=None)

    # zip() would silently truncate and shift the name/score pairing if a class
    # were missing from both y_test and predictions; fail loudly instead.
    if len(f1) != len(class_names):
        raise ValueError(
            f"Got {len(f1)} per-class F1-scores for {len(class_names)} class names; "
            "cannot align scores with genres."
        )

    # Stable sort: genres with equal scores keep their original relative order.
    ranked = sorted(zip(class_names, f1), key=lambda pair: pair[1], reverse=True)

    # Rows per genre; looked up by label, so no ordering of the counts is needed.
    genre_count = songs_df.groupby(['track_genre']).size()

    return pd.DataFrame(
        {
            'genre': [genre for genre, _ in ranked],
            'f1-score': [score for _, score in ranked],
            'count': [genre_count[genre] for genre, _ in ranked],
        },
        columns=['genre', 'f1-score', 'count'],
    )

# Per-genre F1 table for the baseline SVM predictions, rendered with rich display.
predictions_df = generate_predicitons_df(
    y_test=y_test,
    predictions=svm_predictions,
    class_names=label_encoder.classes_,
)
display(predictions_df)

Unnamed: 0,genre,f1-score,count
0,grindcore,0.233236,965
1,minimal-techno,0.138889,845
2,study,0.110176,996
3,chicago-house,0.081117,956
4,detroit-techno,0.080000,920
...,...,...,...
108,techno,0.000000,401
109,trance,0.000000,676
110,trip-hop,0.000000,861
111,turkish,0.000000,849


In [11]:
# Per-genre F1 table for the SVM predictions (same inputs as the earlier
# `predictions_df` cell, stored under an SVM-specific name).
svm_predictions_df = generate_predicitons_df(
    y_test=y_test,
    predictions=svm_predictions,
    class_names=label_encoder.classes_,
)
display(svm_predictions_df)

Unnamed: 0,genre,f1-score,count
0,grindcore,0.233236,965
1,minimal-techno,0.138889,845
2,study,0.110176,996
3,chicago-house,0.081117,956
4,detroit-techno,0.080000,920
...,...,...,...
108,techno,0.000000,401
109,trance,0.000000,676
110,trip-hop,0.000000,861
111,turkish,0.000000,849


In [6]:
import pandas as pd
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from data_prep import add_artists_as_features

# Allow up to 200 columns to be shown when displaying frames.
pd.set_option("display.max_columns", 200)

# Read the CSV; its first column is used as the row index.
songs = pd.read_csv('../data/spotify_clean.csv', index_col=[0])

# Features: the frame returned by add_artists_as_features, minus the
# identifier/text columns and the target column.
# NOTE(review): the meaning of the second argument (1000) and whether the helper
# keeps the row order of `songs` are defined in data_prep — confirm there, since
# the labels below are taken from `songs` directly.
songs_data = add_artists_as_features(songs, 1000).drop(
    ["track_id", "artists", "album_name", "track_name", "track_genre"],
    axis="columns",
)
genres = songs["track_genre"]

# Turn genre names into integer class ids.
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

# Hold out 30% of the rows; shuffled, stratified on genre, and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    songs_data,
    encoded_genres,
    test_size=0.3,
    stratify=encoded_genres,
    shuffle=True,
    random_state=100,
)

# Standardize the features, then fit an SVC with default hyperparameters.
svm_pipeline = make_pipeline(StandardScaler(), SVC())
svm_pipeline.fit(X_train, y_train)

# Score the held-out split: overall accuracy and support-weighted F1.
svm_predictions = svm_pipeline.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_f1_weighted = f1_score(y_test, svm_predictions, average='weighted')

print(
    "SVM performance with StandardScaler",
    f"Accuracy: {svm_accuracy}",
    f"F1-score: {svm_f1_weighted}",
    sep="\n",
)

# Function to generate predictions DataFrame
def generate_predictions_df(y_test, predictions, class_names, songs):
    """Build a per-genre table of F1-scores, best-scoring genre first.

    Parameters
    ----------
    y_test : array-like
        True (encoded) labels of the evaluated samples.
    predictions : array-like
        Predicted (encoded) labels, in the same order as ``y_test``.
    class_names : iterable of str
        Genre names, one per class, in the same order as the per-class scores
        returned by ``f1_score(..., average=None)``.
    songs : pandas.DataFrame
        Frame with a ``track_genre`` column used to fill the ``count`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``genre``, ``f1-score`` and ``count``. Rows are sorted by
        descending F1-score. ``count`` is the number of rows of that genre in
        the whole ``songs`` frame, not only in the test split.

    Raises
    ------
    ValueError
        If the number of per-class scores differs from the number of class
        names, which would otherwise pair scores with the wrong genres.
    KeyError
        If a class name does not occur in ``songs['track_genre']``.
    """
    class_names = list(class_names)
    f1 = f1_score(y_test, predictions, average=None)

    # zip() would silently truncate and shift the name/score pairing if a class
    # were missing from both y_test and predictions; fail loudly instead.
    if len(f1) != len(class_names):
        raise ValueError(
            f"Got {len(f1)} per-class F1-scores for {len(class_names)} class names; "
            "cannot align scores with genres."
        )

    # Stable sort: genres with equal scores keep their original relative order.
    ranked = sorted(zip(class_names, f1), key=lambda pair: pair[1], reverse=True)

    # Rows per genre; looked up by label, so no ordering of the counts is needed.
    genre_count = songs.groupby(['track_genre']).size()

    return pd.DataFrame(
        {
            'genre': [genre for genre, _ in ranked],
            'f1-score': [score for _, score in ranked],
            'count': [genre_count[genre] for genre, _ in ranked],
        },
        columns=['genre', 'f1-score', 'count'],
    )

# Per-genre F1 table for the StandardScaler + SVM pipeline, rendered with rich display.
svm_predictions_df = generate_predictions_df(
    y_test=y_test,
    predictions=svm_predictions,
    class_names=label_encoder.classes_,
    songs=songs,
)
display(svm_predictions_df)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


SVM performance with StandardScaler
Accuracy: 0.4607012851942649
F1-score: 0.4735044480326252


Unnamed: 0,genre,f1-score,count
0,sleep,0.872549,998
1,comedy,0.835821,990
2,children,0.828096,994
3,honky-tonk,0.812057,981
4,jazz,0.807175,806
...,...,...,...
109,punk-rock,0.131579,720
110,techno,0.128342,531
111,metal,0.127389,439
112,indie-pop,0.070588,497


In [11]:
import pandas as pd
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from scipy.stats import uniform, randint

# Allow up to 200 columns to be shown when displaying frames.
pd.set_option("display.max_columns", 200)

# Read the CSV; its first column is used as the row index.
songs = pd.read_csv('../data/spotify_simplified.csv', index_col=[0])

# Features: everything except the identifier/text columns and the target column.
songs_data = songs.drop(
    ["track_id", "artists", "album_name", "track_name", "track_genre"],
    axis="columns",
)
genres = songs["track_genre"]

# Turn genre names into integer class ids.
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

# Hold out 30% of the rows; shuffled, stratified on genre, and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    songs_data,
    encoded_genres,
    test_size=0.3,
    stratify=encoded_genres,
    shuffle=True,
    random_state=100,
)

# Standardize the features, then classify with an SVC.
svm_pipeline = make_pipeline(StandardScaler(), SVC())

# Search space for the SVC step (the `svc__` prefix addresses that pipeline step).
# scipy's uniform(loc, scale) spans [loc, loc + scale], so C is drawn from [0.1, 10.1].
param_dist = {
    'svc__C': uniform(0.1, 10),
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale'],
}

# 10 sampled configurations, each scored by 5-fold CV on weighted F1, using all cores.
random_search = RandomizedSearchCV(
    svm_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='f1_weighted',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Winning configuration and the pipeline refitted with it.
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Score the tuned pipeline on the held-out split.
best_predictions = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, best_predictions)
best_f1_weighted = f1_score(y_test, best_predictions, average='weighted')

print(
    "Best SVM performance after hyperparameter tuning",
    f"Best Parameters: {best_params}",
    f"Accuracy: {best_accuracy}",
    f"F1-score: {best_f1_weighted}",
    sep="\n",
)

# Function to generate predictions DataFrame
def generate_predictions_df(y_test, predictions, class_names, songs):
    """Build a per-genre table of F1-scores, best-scoring genre first.

    Parameters
    ----------
    y_test : array-like
        True (encoded) labels of the evaluated samples.
    predictions : array-like
        Predicted (encoded) labels, in the same order as ``y_test``.
    class_names : iterable of str
        Genre names, one per class, in the same order as the per-class scores
        returned by ``f1_score(..., average=None)``.
    songs : pandas.DataFrame
        Frame with a ``track_genre`` column used to fill the ``count`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``genre``, ``f1-score`` and ``count``. Rows are sorted by
        descending F1-score. ``count`` is the number of rows of that genre in
        the whole ``songs`` frame, not only in the test split.

    Raises
    ------
    ValueError
        If the number of per-class scores differs from the number of class
        names, which would otherwise pair scores with the wrong genres.
    KeyError
        If a class name does not occur in ``songs['track_genre']``.
    """
    class_names = list(class_names)
    f1 = f1_score(y_test, predictions, average=None)

    # zip() would silently truncate and shift the name/score pairing if a class
    # were missing from both y_test and predictions; fail loudly instead.
    if len(f1) != len(class_names):
        raise ValueError(
            f"Got {len(f1)} per-class F1-scores for {len(class_names)} class names; "
            "cannot align scores with genres."
        )

    # Stable sort: genres with equal scores keep their original relative order.
    ranked = sorted(zip(class_names, f1), key=lambda pair: pair[1], reverse=True)

    # Rows per genre; looked up by label, so no ordering of the counts is needed.
    genre_count = songs.groupby(['track_genre']).size()

    return pd.DataFrame(
        {
            'genre': [genre for genre, _ in ranked],
            'f1-score': [score for _, score in ranked],
            'count': [genre_count[genre] for genre, _ in ranked],
        },
        columns=['genre', 'f1-score', 'count'],
    )

# Per-genre F1 table for the tuned SVM pipeline, rendered with rich display.
best_predictions_df = generate_predictions_df(
    y_test=y_test,
    predictions=best_predictions,
    class_names=label_encoder.classes_,
    songs=songs,
)
display(best_predictions_df)


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
