In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

# Audio-feature columns that are averaged per track and fed to the model.
selected_features = [
    "danceability", "energy", "speechiness", "acousticness",
    "instrumentalness", "liveness", "valence", "tempo", "loudness", "mode"
]
# Column holding the genre label of each row.
target = "track_genre"

# Location of the dataset CSV. The default is the original machine-specific
# path; set the GENRE_DATASET_PATH environment variable to point the notebook
# at a copy of the file elsewhere without editing this cell.
file_path = os.environ.get(
    "GENRE_DATASET_PATH",
    r"C:\Users\wcyuy\OneDrive\Desktop\Prog\Snakeyboi\Gamer\Genre\dataset.csv",
)
data = pd.read_csv(file_path)

# Fail fast with a readable message if the file is not the expected dataset,
# instead of surfacing as a KeyError deep inside a later groupby.
required_columns = ["track_id", target, *selected_features]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"dataset is missing expected columns: {missing_columns}")

In [None]:
# Collapse the dataset to one row per `track_id`: collect the distinct genres
# attached to the track and average each audio feature over its rows.
grouped_data = (
    data.groupby("track_id")
    .agg({
        # Sorted so the per-track list is identical on every run (iteration
        # order of a set of strings is not stable across kernel restarts).
        # Missing genre values are dropped rather than treated as a label.
        "track_genre": lambda genres: sorted(set(genres.dropna())),
        **{feature: "mean" for feature in selected_features},
    })
    .reset_index()
)

# Every genre that occurs at least once. Kept as a set, as before;
# `genre_columns` pins a stable (alphabetical) column order derived from it.
unique_genres = {genre for genres in grouped_data["track_genre"] for genre in genres}
genre_columns = sorted(unique_genres)

# Multi-hot genre matrix: one 0/1 column per genre, one row per track.
# explode() yields one row per (track, genre) pair, get_dummies() one-hot
# encodes those pairs, and max() per original row index merges them back.
# This replaces a separate Python-level pass over all tracks for every genre.
genre_matrix = (
    pd.get_dummies(grouped_data["track_genre"].explode(), dtype=int)
    .groupby(level=0)
    .max()
    # Restore row alignment and the fixed column order.
    .reindex(index=grouped_data.index, columns=genre_columns, fill_value=0)
)

# Append the indicator columns to the per-track frame.
grouped_data = pd.concat([grouped_data, genre_matrix], axis=1)

# Number of tracks carrying each genre.
genre_counts = genre_matrix.sum()

# Keep the 15 most frequent genres. nlargest() breaks ties by position, which
# is alphabetical here, so the selection is the same on every run.
top_15_genres = genre_counts.nlargest(15).index


In [None]:
# Feature matrix: the aggregated audio features of every track.
X = grouped_data[selected_features]

# Label matrix: one binary column for each of the retained top genres.
y = grouped_data[top_15_genres]

# Hold out 20% of the tracks for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Multi-label model: a random forest wrapped for multi-output prediction.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
multi_rf = MultiOutputClassifier(rf_classifier)
multi_rf.fit(X_train, y_train)

# Label predictions for the held-out tracks.
y_pred = multi_rf.predict(X_test)

# Hamming loss: share of individual label predictions that are wrong.
hamming = hamming_loss(y_test, y_pred)
print(f"Hamming Loss: {hamming:.2f}")

# Subset accuracy: a track only counts when every one of its labels matches.
y_pred_array = np.asarray(y_pred)
y_test_array = y_test.to_numpy()
exact_matches = (y_pred_array == y_test_array).all(axis=1)
exact_match_accuracy = exact_matches.mean()
print(f"Exact Match Accuracy (Subset Accuracy): {exact_match_accuracy:.2f}")

In [None]:
# Persist the fitted multi-label model to disk.
model_path = 'genre_prediction_model.pkl'
joblib.dump(multi_rf, model_path)
print("Model saved")
# To restore it later: loaded_model = joblib.load('genre_prediction_model.pkl')
