# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# Genre-classification script: encode labels, standardize the features, keep
# the 50 most important features according to a Random Forest, retrain on
# that subset, report validation accuracy, and predict labels for test.csv.

# Map each genre name to the integer class label used for training.
genre_label_mapping = {
    'blues': 0,
    'classical': 1,
    'country': 2,
    'disco': 3,
    'hiphop': 4,
    'jazz': 5,
    'metal': 6,
    'pop': 7,
    'reggae': 8,
    'rock': 9,
}

# Step 1: Read the train.csv and test.csv files
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Replace genre names in the "label" column with numerical labels.
# Series.map yields NaN for any value that is missing from the mapping, so
# fail loudly here rather than handing NaN targets to the classifier.
encoded_labels = train_data['label'].map(genre_label_mapping)
unmapped_labels = train_data.loc[encoded_labels.isna(), 'label'].unique()
if unmapped_labels.size:
    raise ValueError(
        f"Unknown genre label(s) in train.csv: {list(unmapped_labels)}"
    )
train_data['label'] = encoded_labels

# Extract features and labels from the train_data DataFrame
X = train_data.drop(["filename", "label"], axis=1)
y = train_data["label"]

# Split the training data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features (mean = 0, standard deviation = 1).
# The scaler is fitted on the training split only and then reused as-is for
# the validation and test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Train a Random Forest model to identify top features
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Select the top 50 features by importance.
# prefit=True reuses the forest fitted above; with the default prefit=False,
# SelectFromModel.fit would train a second, identical clone of it.
# threshold=-1 lies below every (non-negative) forest importance, so only
# max_features limits the selection.
selector = SelectFromModel(rf, threshold=-1, max_features=50, prefit=True)
X_train_top = selector.transform(X_train)
X_val_top = selector.transform(X_val)

# Train the Random Forest Classifier using top features
rf_top = RandomForestClassifier(n_estimators=100, random_state=42)
rf_top.fit(X_train_top, y_train)

# Predict using top features and evaluate
y_pred = rf_top.predict(X_val_top)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Set Accuracy (using top features):", accuracy)

# Apply the trained model to the test data using top features.
# The test features must pass through the same fitted scaler as the training
# data: rf_top learned its split thresholds on standardized values, so raw
# feature values would be compared against the wrong scale.
X_test = scaler.transform(test_data.drop(["id"], axis=1))
X_test_top = selector.transform(X_test)
test_predictions = rf_top.predict(X_test_top)


# Recorded accuracy: 86 percent (presumably the validation-set score printed by this script)