In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
df = pd.read_csv('data/musicgenre.csv')

# Handle missing values
df = df.dropna(subset=['music_genre'])
df = df.replace("?", None).dropna()

# Encode categorical features
encoder = LabelEncoder()
df['music_genre'] = encoder.fit_transform(df['music_genre'])
df['key'] = encoder.fit_transform(df['key'])
df['mode'] = encoder.fit_transform(df['mode'])
df.head()

# Scale the features
df["duration_ms"] = df["duration_ms"] / 1000

# Drop irrelevant columns
df.drop(['track_name', 'instance_id', 'obtained_date', 'artist_name'], axis=1, inplace=True)

# Split the data into features and target
X = df.drop(columns=['music_genre'])
y = df['music_genre']
X.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,27.0,0.00468,0.652,-0.001,0.941,0.792,1,0.115,-5.201,1,0.0748,100.889,0.759
1,31.0,0.0127,0.622,218.293,0.89,0.95,5,0.124,-7.043,1,0.03,115.002,0.531
2,28.0,0.00306,0.62,215.613,0.755,0.0118,11,0.534,-4.617,0,0.0345,127.994,0.333
3,34.0,0.0254,0.774,166.875,0.7,0.00253,4,0.157,-4.498,0,0.239,128.014,0.27
4,32.0,0.00465,0.638,222.369,0.587,0.909,9,0.157,-6.266,0,0.0413,145.036,0.323


In [6]:
# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

In [7]:
# Define the models to test
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    "LightGBM": LGBMClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = {
        "accuracy": accuracy,
        "report": report
    }
    print(f"{name} Model")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    print("-" * 60)

# Compare model performance
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Accuracy: {result['accuracy']}")
    print("Classification Report:")
    for label, metrics in result['report'].items():
        if isinstance(metrics, dict):
            metrics_str = ", ".join([f"{key}: {value:.2f}" for key, value in metrics.items()])
            print(f"  {label}: {metrics_str}")
    print("-" * 60)

Random Forest Model
Accuracy: 0.5453872353028284
              precision    recall  f1-score   support

           0       0.40      0.34      0.37      1349
           1       0.78      0.74      0.76      1349
           2       0.60      0.52      0.55      1341
           3       0.82      0.85      0.83      1350
           4       0.56      0.57      0.57      1346
           5       0.64      0.60      0.62      1340
           6       0.34      0.36      0.35      1356
           7       0.53      0.54      0.53      1356
           8       0.32      0.32      0.32      1351
           9       0.48      0.62      0.54      1368

    accuracy                           0.55     13506
   macro avg       0.55      0.55      0.54     13506
weighted avg       0.55      0.55      0.54     13506

------------------------------------------------------------
Logistic Regression Model
Accuracy: 0.5221383088997482
              precision    recall  f1-score   support

           0       0.



AdaBoost Model
Accuracy: 0.48526580778913075
              precision    recall  f1-score   support

           0       0.45      0.18      0.26      1349
           1       0.54      0.58      0.56      1349
           2       0.57      0.31      0.40      1341
           3       0.56      0.91      0.69      1350
           4       0.52      0.45      0.48      1346
           5       0.56      0.43      0.49      1340
           6       0.38      0.25      0.30      1356
           7       0.41      0.42      0.42      1356
           8       0.39      0.72      0.51      1351
           9       0.50      0.60      0.54      1368

    accuracy                           0.49     13506
   macro avg       0.49      0.49      0.47     13506
weighted avg       0.49      0.49      0.47     13506

------------------------------------------------------------
Model: Random Forest
Accuracy: 0.5453872353028284
Classification Report:
  0: precision: 0.40, recall: 0.34, f1-score: 0.37, support: 1