# Import Data #


In [7]:
import pandas as pd
pd.options.display.max_columns = 200

df = pd.read_csv('../data/spotify_clean.csv', index_col=[0])
df.head()


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,j-pop
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


# Train Model #

In [8]:
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Drop non-numeric columns
songs_df = df.drop(columns=["track_id", "artists", "album_name", "track_name", "explicit", "key", "energy", "time_signature", "mode", "instrumentalness", "track_genre"])

# Discretize the numeric features
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
songs_df_discretized = discretizer.fit_transform(songs_df)

# Assuming songs_df contains only numeric features
numeric_features = songs_df.columns

# Split the data into features (X) and target variable (y)
X = songs_df_discretized
y = df['track_genre']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
}

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

print(f"Best Hyperparameters: {best_params}")

# Train the classifier with the best hyperparameters
best_nb_classifier = MultinomialNB(alpha=best_params['alpha'])
best_nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = best_nb_classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, predictions)
f1_weighted = f1_score(y_test, predictions, average='weighted')

# Print the results
print("Multinomial Naive Bayes Classifier Performance:")
print(f"Accuracy: {accuracy}")
print(f"F1-score: {f1_weighted}")




Best Hyperparameters: {'alpha': 1.5}
Multinomial Naive Bayes Classifier Performance:
Accuracy: 0.1215177178515712
F1-score: 0.08490347393818191
