# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the labelled training table and the unlabelled test table from the
# working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Define genre label mapping (genre name -> integer class id)
genre_label_mapping = {
    'blues': 0,
    'classical': 1,
    'country': 2,
    'disco': 3,
    'hiphop': 4,
    'jazz': 5,
    'metal': 6,
    'pop': 7,
    'reggae': 8,
    'rock': 9
}

# Preprocess the data
# Series.map() turns every value that is missing from the dict into NaN
# without complaint; detect that here so the failure names the offending
# labels instead of surfacing later inside an estimator's fit().
encoded_labels = train_data['label'].map(genre_label_mapping)
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unknown_labels = train_data.loc[unmapped_rows, 'label'].unique().tolist()
    raise ValueError(
        f"Labels not present in genre_label_mapping: {unknown_labels}"
    )
train_data['label'] = encoded_labels

# Features are every column except the file name and the target.
X = train_data.drop(["filename", "label"], axis=1)
y = train_data["label"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler's statistics come from the training
# split only and are then applied unchanged to the validation and test data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# The test table carries an "id" column instead of features/labels metadata;
# drop it before scaling.
test_features = test_data.drop(columns=["id"])
X_test = scaler.transform(test_features)

# Train LightGBM Classifier and get per-class probabilities on the validation set
lgbm_classifier = LGBMClassifier(n_estimators=100, random_state=42)
lgbm_classifier.fit(X_train, y_train)
y_pred_lgbm = lgbm_classifier.predict_proba(X_val)

# Train K-Nearest Neighbors Classifier and get per-class probabilities
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict_proba(X_val)

# Combine predictions using weighted average (70% LightGBM, 30% KNN)
weighted_average = 0.7 * y_pred_lgbm + 0.3 * y_pred_knn

# The probability columns are ordered like the estimators' `classes_`, so the
# argmax is a column index, not a label. Translate it back through `classes_`
# so the prediction stays correct even if a class is absent from y_train or
# the labels are not exactly 0..n-1. Both models were fitted on the same
# y_train, so their `classes_` agree.
y_pred_combined = lgbm_classifier.classes_[np.argmax(weighted_average, axis=1)]

# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, y_pred_combined)
print("Ensemble Model Validation Set Accuracy:", accuracy)

# Pairwise Pearson correlation between feature columns of the training split
# (rowvar=False: each column is a variable, each row an observation).
correlation_matrix = np.corrcoef(X_train, rowvar=False)

# For every pair (i, j) with j < i whose absolute correlation exceeds the
# threshold, mark the lower-indexed column j for removal.
corr_threshold = 0.7
pair_rows, pair_cols = np.tril_indices(correlation_matrix.shape[1], k=-1)
above_threshold = np.abs(correlation_matrix[pair_rows, pair_cols]) > corr_threshold
highly_correlated = set(pair_cols[above_threshold].tolist())

# Drop the marked columns from all three feature matrices so they stay aligned.
columns_to_drop = list(highly_correlated)
X_train_reduced = np.delete(X_train, columns_to_drop, axis=1)
X_val_reduced = np.delete(X_val, columns_to_drop, axis=1)
X_test_reduced = np.delete(X_test, columns_to_drop, axis=1)

# Retrain the models on the reduced feature set. The same estimator objects
# are refitted, so the earlier full-feature fits are replaced.
lgbm_classifier.fit(X_train_reduced, y_train)
knn_classifier.fit(X_train_reduced, y_train)

# Predict class probabilities using the reduced features
y_pred_lgbm_reduced = lgbm_classifier.predict_proba(X_val_reduced)
y_pred_knn_reduced = knn_classifier.predict_proba(X_val_reduced)
test_pred_lgbm_reduced = lgbm_classifier.predict_proba(X_test_reduced)
test_pred_knn_reduced = knn_classifier.predict_proba(X_test_reduced)

# Combine predictions using weighted average (70% LightGBM, 30% KNN)
weighted_average_reduced = 0.7 * y_pred_lgbm_reduced + 0.3 * y_pred_knn_reduced

# The probability columns are ordered like the estimators' `classes_`, so the
# argmax is a column index, not a label. Translate it back through `classes_`
# so the prediction stays correct even if a class is absent from y_train or
# the labels are not exactly 0..n-1.
y_pred_combined_reduced = lgbm_classifier.classes_[
    np.argmax(weighted_average_reduced, axis=1)
]

# Calculate accuracy on the validation set with reduced features
accuracy_reduced = accuracy_score(y_val, y_pred_combined_reduced)
print("Ensemble Model Validation Set Accuracy with Reduced Features:", accuracy_reduced)
