In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Genre-classification pipeline: encode the genre labels, standardize the
# features, train an XGBoost and a RandomForest classifier, blend their class
# probabilities with fixed weights, report validation accuracy, and write the
# test-set predictions to a submission CSV.

# Map genre names to the integer class labels the classifiers are trained on
genre_label_mapping = {
    'blues': 0,
    'classical': 1,
    'country': 2,
    'disco': 3,
    'hiphop': 4,
    'jazz': 5,
    'metal': 6,
    'pop': 7,
    'reggae': 8,
    'rock': 9,
}

# Ensemble weights for each model's predicted probabilities. Defined once so
# the validation and test blends cannot drift apart (adjust as needed).
XGB_WEIGHT = 0.7
RF_WEIGHT = 0.3


def blend_probabilities(xgb_proba, rf_proba):
    """Return the weighted average of the two models' class probabilities.

    Args:
        xgb_proba: Output of ``xgb_classifier.predict_proba``.
        rf_proba: Output of ``rf_classifier.predict_proba`` for the same rows.

    Returns:
        ``XGB_WEIGHT * xgb_proba + RF_WEIGHT * rf_proba``.
    """
    return XGB_WEIGHT * xgb_proba + RF_WEIGHT * rf_proba


# Step 1: Read the train.csv and test.csv files
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Replace genre names in the "label" column with numerical labels.
# Series.map yields NaN for any value missing from the mapping, so check for
# that here and fail with a clear message instead of an obscure error when
# the classifiers are fitted.
raw_labels = train_data['label']
encoded_labels = raw_labels.map(genre_label_mapping)
unmapped_mask = encoded_labels.isna()
if unmapped_mask.any():
    unknown_labels = sorted({str(value) for value in raw_labels[unmapped_mask]})
    raise ValueError(
        f"train.csv contains labels missing from genre_label_mapping: {unknown_labels}"
    )
train_data['label'] = encoded_labels

# Extract features and labels from the train_data DataFrame
X = train_data.drop(["filename", "label"], axis=1)
y = train_data["label"]

# Split the training data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (mean = 0, standard deviation = 1).
# The scaler is fitted on the training split only and then reused for the
# validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_data.drop(["id"], axis=1))

# Train XGBoost Classifier
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42)
xgb_classifier.fit(X_train, y_train)
y_pred_proba_xgb = xgb_classifier.predict_proba(X_val)

# Train RandomForest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_proba_rf = rf_classifier.predict_proba(X_val)

# Combine the validation predictions using the weighted average
weighted_average_proba = blend_probabilities(y_pred_proba_xgb, y_pred_proba_rf)

# Choose the class with the highest probability as the prediction.
# argmax returns a column index, which is used directly as the integer label
# (assumes both models order probability columns by ascending label -- TODO confirm).
y_pred = weighted_average_proba.argmax(axis=1)

# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, y_pred)
print("Ensemble Model Validation Set Accuracy:", accuracy)

# Use the ensemble model to make predictions on the test set
test_pred_proba_xgb = xgb_classifier.predict_proba(X_test)
test_pred_proba_rf = rf_classifier.predict_proba(X_test)
test_weighted_average_proba = blend_probabilities(test_pred_proba_xgb, test_pred_proba_rf)
test_predictions = test_weighted_average_proba.argmax(axis=1)

# Write the submission file; "label" holds the integer class indices, not genre names
submission_df = pd.DataFrame({"id": test_data["id"], "label": test_predictions})
submission_df.to_csv("ensemble_custom_model_submission.csv", index=False)

# Accuracy: 89.6 (presumably percent; not recorded whether this is the validation or leaderboard score)