In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Train and evaluate a random-forest music-genre classifier.
#
# Steps: load the CSV, drop incomplete rows, label-encode the categorical
# columns, cross-validate a RandomForestClassifier, then refit on all rows
# and print the feature importances.

# Load the dataset. The file marks some missing entries with a literal "?";
# declaring it as an NA marker at parse time avoids `df.replace("?", None)`,
# whose meaning differs between pandas versions (older releases forward-fill
# instead of inserting a missing value).
df = pd.read_csv('data/musicgenre.csv', na_values='?')

# Handle missing values: drop every row with a missing entry in any column
# (this also covers rows whose target `music_genre` is missing).
df = df.dropna()

# Encode the target. `encoder` is reserved for `music_genre` so that it can
# still translate predicted codes back to genre names later on.
encoder = LabelEncoder()
df['music_genre'] = encoder.fit_transform(df['music_genre'])

# Create a mapping dictionary {integer code -> genre name}. LabelEncoder
# assigns codes 0..n_classes-1 in the order of `classes_`.
genre_mapping = dict(enumerate(encoder.classes_))

# Encode the remaining categorical features with their own encoders;
# re-fitting the genre encoder here would overwrite its `classes_` and make
# `genre_mapping` describe the wrong column.
df['key'] = LabelEncoder().fit_transform(df['key'])
df['mode'] = LabelEncoder().fit_transform(df['mode'])

# Convert the track duration from milliseconds to seconds. The column name is
# kept as-is so the feature names reported below do not change.
df["duration_ms"] = df["duration_ms"] / 1000

# Drop identifier / free-text columns that are not used as features
df = df.drop(columns=['track_name', 'instance_id', 'obtained_date', 'artist_name'])

# Split the data into features and target
X = df.drop(columns=['music_genre'])
y = df['music_genre']

# Create the model (fixed seed for reproducible results)
model = RandomForestClassifier(random_state=42)

# Perform 10-fold cross-validation and calculate accuracy
scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')

# Print the mean accuracy and its standard deviation across folds
print(f'Cross-validated accuracy: {scores.mean():.2f} ± {scores.std():.2f}')

# Train the model on the entire dataset
model.fit(X, y)

# Example: Display the feature importances, most important first
importances = model.feature_importances_
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})
print(feature_importances.sort_values(by='importance', ascending=False))


Cross-validated accuracy: 0.54 ± 0.01
             feature  importance
0         popularity    0.184212
10       speechiness    0.092470
8           loudness    0.092152
2       danceability    0.091327
1       acousticness    0.090211
5   instrumentalness    0.083111
4             energy    0.077433
12           valence    0.070680
3        duration_ms    0.064254
11             tempo    0.057199
7           liveness    0.051496
6                key    0.033573
9               mode    0.011881
