# Load Data and Libraries

In [None]:

import pandas as pd
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np
from scipy import stats
from umap import UMAP
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import LabelEncoder, StandardScaler
import plotly.express as px
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture


# Seed every global RNG the notebook depends on. `random.seed` only covers
# Python's built-in generator; scikit-learn estimators and splitters draw from
# NumPy's global RNG whenever `random_state` is left unset, so NumPy has to be
# seeded too for a Restart & Run All to reproduce the same results.
SEED = 12975910
random.seed(SEED)
np.random.seed(SEED)

# Load the raw table and discard rows in which every column is empty.
df = pd.read_csv("musicData.csv").dropna(how='all')

df.head()

# Visualize Data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt




# Count how many rows carry each value of `key`, then order the tally by key
# label so the bars appear alphabetically.
key_counts = df['key'].value_counts()
sorted_key_counts = key_counts.sort_index()

sns.set(style="whitegrid")

# Draw on an explicit Axes instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=sorted_key_counts.index,
    y=sorted_key_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Counts of Each Unique Key (Alphabetical)')
ax.set_xlabel('Unique Keys')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:

# Turn the sentinel markers into real NaN values first, so that both the
# per-column percentages and the `na_rows` table below see them as missing.
df['duration_ms'] = df['duration_ms'].replace(-1, np.nan)
# NOTE(review): this replaces -1 in *every* column, not only duration_ms —
# confirm no other column can legitimately contain -1.
df.replace(to_replace=['?', -1], value=np.nan, inplace=True)

# Rows with at least one missing field (collected after the replacement so the
# sentinel-marked rows are included).
na_rows = df[df.isna().any(axis=1)]

percent_missing = df.isna().mean() * 100
for column, percentage in percent_missing.items():
    print(f"{column}: {percentage:.4f}% missing values")
na_rows

In [None]:

# Show the distinct labels present in the target column.
unique_values = pd.unique(df['music_genre'])
print("Unique values in column music_genre:", unique_values)

In [None]:

# Work on a copy so the exploratory coercion below does not touch `df`.
plottable_df = df.copy()
for numeric_col in ('tempo', 'duration_ms'):
    # Non-numeric entries become NaN and are dropped just below.
    plottable_df[numeric_col] = pd.to_numeric(plottable_df[numeric_col], errors='coerce')

plottable_df = plottable_df.dropna(subset=['tempo', 'duration_ms'])

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column, human-readable label, number of histogram bins) for each panel.
hist_panels = [
    ('tempo', 'Tempo', 30),
    ('duration_ms', 'Duration (ms)', 100),
]
for panel_ax, (panel_col, panel_label, panel_bins) in zip(axes, hist_panels):
    sns.histplot(ax=panel_ax, data=plottable_df, x=panel_col, kde=True,
                 bins=panel_bins, color='darkblue', edgecolor='black')
    panel_ax.set_title(f'Histogram and Density Plot of {panel_label}')
    panel_ax.set_xlabel(panel_label)
    # histplot's default statistic is the per-bin count (the KDE curve is
    # scaled to match), so the axis is labelled as a count, not a density.
    panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:

def report_shapiro(values, name, alpha=0.05):
    """Run a Shapiro-Wilk normality test on `values` and print the outcome.

    Parameters
    ----------
    values : array-like
        Sample to test.
    name : str
        Label used in the printed report, so the output always names the
        column that was actually tested.
    alpha : float, default 0.05
        Significance level for rejecting the null hypothesis of normality.

    Returns
    -------
    The result object returned by `scipy.stats.shapiro`
    (index 0 is the statistic, index 1 is the p-value).
    """
    result = stats.shapiro(values)
    print(f"Shapiro-Wilk test statistic for {name}:", result[0])
    print("p-value:", result[1])
    if result[1] < alpha:
        print("Reject null hypothesis, data is not normally distributed.")
    else:
        print("Fail to reject null hypothesis, data may be normally distributed.")
    return result


# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be inaccurate
# for samples larger than 5000 — confirm the size of `plottable_df`.
shapiro_test = report_shapiro(plottable_df['duration_ms'], 'duration_ms')

shapiro_test = report_shapiro(plottable_df['tempo'], 'tempo')

# Data Preprocessing

In [None]:



# Force the two columns to numeric dtype; anything unparsable becomes NaN and
# the affected rows are removed.
for numeric_col in ('tempo', 'duration_ms'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Drop incomplete rows, then the identifier / free-text columns that are not
# used as model features.
df = (
    df
    .dropna(subset=['duration_ms', 'tempo'])
    .drop(columns=['artist_name', 'track_name', 'instance_id', 'obtained_date'])
)

# Encode the target as integer codes; `label_encoder` is kept so the codes can
# be mapped back to genre names later.
label_encoder = LabelEncoder()
df['music_genre'] = label_encoder.fit_transform(df['music_genre'])

# Binary-encode `mode` (values other than 'Major'/'Minor' map to NaN) and
# one-hot encode `key`.
df['mode'] = df['mode'].map({'Major': 0, 'Minor': 1})
df = pd.get_dummies(df, columns=['key'])

df.head()

In [None]:

# Hold out exactly 500 tracks per genre for testing; the rest is training data.
genres = df['music_genre'].unique()
print(genres)
train_frames = []
test_frames = []

for genre in genres:
    genre_data = df[df['music_genre'] == genre]
    # A fixed random_state makes the held-out set identical on every run
    # (same value as the seed passed to random.seed at the top of the notebook).
    genre_train, genre_test = train_test_split(
        genre_data, test_size=500, shuffle=True, random_state=12975910
    )
    train_frames.append(genre_train)
    test_frames.append(genre_test)


train_df = pd.concat(train_frames)
test_df = pd.concat(test_frames)
print(test_df['music_genre'].value_counts())

# Dimensionality Reduction

In [None]:
# LDA is fitted on the continuous features only: drop the one-hot key columns
# and the binary mode column.
columns_to_drop = [col for col in train_df.columns if 'key' in col or col == 'mode']
df_lda = train_df.drop(columns=columns_to_drop)

X = df_lda.drop('music_genre', axis=1)
y = df_lda['music_genre']

label_encoder_lda = LabelEncoder()
y_encoded = label_encoder_lda.fit_transform(y)

# Standardise the features; `scaler` is reused by later cells.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lda = LDA(n_components=3)
X_lda = lda.fit_transform(X_scaled, y_encoded)

df_lda_result = pd.DataFrame(X_lda, columns=[f'LDA{i+1}' for i in range(X_lda.shape[1])])
# `music_genre` was already integer-encoded by `label_encoder` during
# preprocessing, so `label_encoder_lda.inverse_transform` would only give those
# integer codes back. Decode with the original encoder to get genre names,
# which plotly then treats as discrete categories.
df_lda_result['Music Genre'] = label_encoder.inverse_transform(y)

fig = px.scatter_3d(df_lda_result, x='LDA1', y='LDA2', z='LDA3',
                    color='Music Genre',
                    title="3D LDA Projection of Music Tracks",
                    opacity=1.0)

# Small markers keep the dense point cloud readable.
fig.update_traces(marker=dict(size=1))

fig.show()

In [None]:

# Cluster the LDA embedding with a 10-component Gaussian mixture.
gmm = GaussianMixture(n_components=10, random_state=42)
clusters = gmm.fit_predict(X_lda)

df_lda_gmm = pd.DataFrame(X_lda, columns=['LDA1', 'LDA2', 'LDA3'])
df_lda_gmm['Cluster'] = clusters

# Cluster ids are categories, not magnitudes. Plot them as strings with a
# discrete palette so each cluster gets one distinct colour instead of a
# position on an interpolated colour bar. The stored column stays integer.
cluster_labels = [str(c) for c in range(gmm.n_components)]
fig = px.scatter_3d(df_lda_gmm.assign(Cluster=df_lda_gmm['Cluster'].astype(str)),
                    x='LDA1', y='LDA2', z='LDA3', color='Cluster',
                    color_discrete_sequence=px.colors.qualitative.G10,
                    category_orders={'Cluster': cluster_labels},
                    title="3D LDA Projection with Gaussian Mixture Modeling")
fig.update_traces(marker=dict(size=1))
fig.show()

In [None]:

# Non-linear 3-D embedding of the standardised features.
# NOTE(review): no random_state is passed here, so the embedding depends on
# the global RNG state — confirm whether run-to-run reproducibility is needed.
umap = UMAP(n_components=3, n_neighbors=100, min_dist=0.00000001, metric='euclidean')

X_umap = umap.fit_transform(X_scaled)

df_umap_result = pd.DataFrame(X_umap, columns=[f'UMAP{i+1}' for i in range(X_umap.shape[1])])
# `label_encoder_lda` only maps back to the integer codes produced during
# preprocessing; a second decode with the original `label_encoder` yields the
# genre names.
df_umap_result['Music Genre'] = label_encoder.inverse_transform(
    label_encoder_lda.inverse_transform(y_encoded)
)

# Genre is categorical, so the qualitative palette is supplied as a discrete
# colour sequence (a continuous scale is not used for string categories).
fig = px.scatter_3d(df_umap_result, x='UMAP1', y='UMAP2', z='UMAP3',
                    color='Music Genre',
                    color_discrete_sequence=px.colors.qualitative.G10,
                    title="3D UMAP Projection of Music Tracks",
                    opacity=1.0)

fig.update_traces(marker=dict(size=1))

fig.show()

In [None]:

# Cluster the UMAP embedding with a 10-component Gaussian mixture.
gmm = GaussianMixture(n_components=10, random_state=42)
clusters = gmm.fit_predict(X_umap)

df_umap_gmm = pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2', 'UMAP3'])
df_umap_gmm['Cluster'] = clusters

# Cluster ids are categories, not magnitudes. Plot them as strings with a
# discrete palette so each cluster gets one distinct colour instead of a
# position on an interpolated colour bar. The stored column stays integer.
cluster_labels = [str(c) for c in range(gmm.n_components)]
fig = px.scatter_3d(df_umap_gmm.assign(Cluster=df_umap_gmm['Cluster'].astype(str)),
                    x='UMAP1', y='UMAP2', z='UMAP3', color='Cluster',
                    color_discrete_sequence=px.colors.qualitative.G10,
                    category_orders={'Cluster': cluster_labels},
                    title="3D UMAP Projection with Gaussian Mixture Modeling",
                    labels={'UMAP1': 'UMAP Component 1', 'UMAP2': 'UMAP Component 2', 'UMAP3': 'UMAP Component 3'})
fig.update_traces(marker=dict(size=5))
fig.show()

In [None]:

# Linear 3-component projection of the standardised features; `pca` is reused
# later as the selected dimensionality-reduction method.
pca = PCA(n_components=3)

X_pca = pca.fit_transform(X_scaled)

df_pca_result = pd.DataFrame(X_pca, columns=[f'PCA{i+1}' for i in range(X_pca.shape[1])])
# `label_encoder_lda` only maps back to the integer codes produced during
# preprocessing; a second decode with the original `label_encoder` yields the
# genre names, which plotly then colours as discrete categories.
df_pca_result['Music Genre'] = label_encoder.inverse_transform(
    label_encoder_lda.inverse_transform(y_encoded)
)

fig = px.scatter_3d(df_pca_result, x='PCA1', y='PCA2', z='PCA3',
                    color='Music Genre',
                    title="3D PCA Projection of Music Tracks",
                    opacity=1.0)

fig.update_traces(marker=dict(size=1))

fig.show()

In [None]:

# Cluster the PCA embedding with a 10-component Gaussian mixture.
gmm = GaussianMixture(n_components=10, random_state=42)
clusters = gmm.fit_predict(X_pca)

df_pca_gmm = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2', 'PCA3'])
df_pca_gmm['Cluster'] = clusters

# Cluster ids are categories, not magnitudes. Plot them as strings with a
# discrete palette so each cluster gets one distinct colour instead of a
# position on an interpolated colour bar. The stored column stays integer.
cluster_labels = [str(c) for c in range(gmm.n_components)]
fig = px.scatter_3d(df_pca_gmm.assign(Cluster=df_pca_gmm['Cluster'].astype(str)),
                    x='PCA1', y='PCA2', z='PCA3', color='Cluster',
                    color_discrete_sequence=px.colors.qualitative.G10,
                    category_orders={'Cluster': cluster_labels},
                    title="3D PCA Projection with Gaussian Mixture Modeling")
fig.update_traces(marker=dict(size=5))
fig.show()

# Prepare Data for Model

In [None]:
# Fitted reducer applied to both splits.
dim_reduc_method = pca

# Continuous features fed through the scaler and the reducer; every other
# column (including the target) is carried over untouched.
columns_to_scale = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
unused_columns = [col for col in train_df.columns if col not in columns_to_scale]


def _project_features(frame, feature_columns, passthrough_columns, fitted_scaler, fitted_reducer):
    """Scale and reduce `feature_columns` of `frame`, keeping the other columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split to transform (train or test).
    feature_columns : list of str
        Columns passed through `fitted_scaler.transform` and then
        `fitted_reducer.transform`.
    passthrough_columns : list of str
        Columns copied unchanged after the reduced components.
    fitted_scaler, fitted_reducer
        Already-fitted transformers; they are only applied here, never refitted.

    Returns
    -------
    pandas.DataFrame
        Indexed like `frame`: one column per reduced component, followed by
        the passthrough columns.
    """
    reduced = fitted_reducer.transform(fitted_scaler.transform(frame[feature_columns]))
    # NOTE(review): the 'LDA_' prefix is kept for backward compatibility even
    # though the reducer selected above is PCA.
    component_names = [f'LDA_{i+1}' for i in range(reduced.shape[1])]
    components = pd.DataFrame(reduced, columns=component_names, index=frame.index)
    return pd.concat([components, frame[passthrough_columns]], axis=1)


train_df_lda = _project_features(train_df, columns_to_scale, unused_columns, scaler, dim_reduc_method)
test_df_lda = _project_features(test_df, columns_to_scale, unused_columns, scaler, dim_reduc_method)

# Fit Model and Plot ROC

In [None]:


# Separate the target from the features for both splits.
# NOTE(review): the model is trained on `train_df` / `test_df`; the reduced
# frames `train_df_lda` / `test_df_lda` built in the previous cell are not
# used here — confirm that this is intentional.
y_train = train_df['music_genre']
X_train = train_df.drop(columns='music_genre')
y_test = test_df['music_genre']
X_test = test_df.drop(columns='music_genre')

# Hyper-parameter grid searched exhaustively: every max_depth x eta pair.
# NOTE(review): 'multi:softmax' is documented as emitting class labels; a later
# cell calls predict_proba, so 'multi:softprob' may be the safer objective —
# verify against the installed xgboost version.
params = dict(
    max_depth=[2, 3, 4, 5, 7, 12],
    eta=[0.1, 0.2, 0.3, 0.4, 0.5],
    objective=['multi:softmax'],
    num_class=[len(y_train.unique())],
)

# 5-fold stratified cross-validation with a fixed shuffle seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model = xgb.XGBClassifier()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=kf,
    scoring='accuracy',
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Test set accuracy: %.2f%%" % (accuracy * 100))


In [None]:
# One-vs-rest ROC analysis of the tuned classifier on the test split.
y_prob = best_model.predict_proba(X_test)
genre_names = label_encoder.classes_

# Take the class count from the probability matrix instead of hard-coding 10,
# so both panels stay consistent with what the model actually predicts.
num_classes = y_prob.shape[1]
y_test_bin = label_binarize(y_test, classes=[*range(num_classes)])

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Macro average: interpolate every per-class curve onto the union of all
# false-positive-rate points, then take the unweighted mean.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(num_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= num_classes
macro_roc_auc = auc(all_fpr, mean_tpr)


def _finish_roc_axes(roc_ax, title):
    """Add the chance diagonal, axis limits, labels, title and legend to `roc_ax`."""
    roc_ax.plot([0, 1], [0, 1], 'k--', lw=2)
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(title)
    roc_ax.legend(loc="lower right", fontsize='large')


# Note: this changes the font size globally for the rest of the session.
plt.rcParams.update({'font.size': 14})

fig, (macro_ax, genre_ax) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: macro-averaged curve.
macro_ax.plot(all_fpr, mean_tpr, color='blue',
              label='Macro-average ROC curve (area = {0:0.4f})'.format(macro_roc_auc))
_finish_roc_axes(macro_ax, 'Macro-average ROC Curve')

# Right panel: one curve per class, labelled with the decoded genre name.
colors = plt.cm.rainbow(np.linspace(0, 1, len(genre_names)))
for i, genre_name, curve_color in zip(range(num_classes), genre_names, colors):
    genre_ax.plot(fpr[i], tpr[i], color=curve_color,
                  label=f'{genre_name} (area = {roc_auc[i]:.4f})')
_finish_roc_axes(genre_ax, 'Individual ROC Curves for Each Genre')

plt.tight_layout()
plt.show()