In [2]:

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')


train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X = train.drop(columns=['id', 'song_popularity'])
y = train['song_popularity']

# feature Engineering
def add_features(df):
    df = df.copy()
    df["duration_min"] = df["song_duration_ms"] / 60000
    for col in ["loudness", "song_duration_ms", "tempo"]:
        df[f"log_{col}"] = np.log1p(df[col].abs() + 1e-6)
    df["energy_per_dance"] = df["energy"] / (df["danceability"] + 1e-6)
    df["speech_per_liveness"] = df["speechiness"] / (df["liveness"] + 1e-6)
    df["acoustic_x_instrumental"] = df["acousticness"] * df["instrumentalness"]
    df["dance_energy"] = df["danceability"] * df["energy"]
    return df

X = add_features(X)
test_data = add_features(test.drop(columns=['id']))

#imputation and scaling
numeric_features = X.select_dtypes(include=[np.number]).columns

mice_imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        n_jobs=-1,
        random_state=42
    ),
    max_iter=10,
    random_state=42,
    initial_strategy='median',
    verbose=0
)

numeric_transformer = Pipeline(steps=[
    ('imputer', mice_imputer),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

X_processed = preprocessor.fit_transform(X)
test_processed = preprocessor.transform(test_data)

# defining Models
models = {
    "KNN": KNeighborsClassifier(n_neighbors=15),
    "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42, criterion='entropy'),
    "GBM": GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, criterion='squared_error', subsample=0.8, max_features='sqrt'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='auc', n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42),
    "CatBoost": CatBoostClassifier(iterations=300, verbose=0, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=300, learning_rate=0.05, random_state=42),
    "NN_TORCH": MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=500, activation='relu', random_state=42)
}

# cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_results = {}

print("\nModel performances (10-Fold ROC-AUC):\n")

for name, model in models.items():
    fold_aucs = []
    for train_idx, val_idx in kf.split(X_processed, y):
        X_train, X_val = X_processed[train_idx], X_processed[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, preds)
        fold_aucs.append(auc)
    
    mean_auc = np.mean(fold_aucs)
    cv_results[name] = mean_auc
    print(f"{name}: {mean_auc:.4f}")


# Stacking Ensemble
print("\nTraining stacked ensemble on full data...")
base_estimators = [(name, model) for name, model in models.items()]
meta_model = XGBClassifier(eval_metric='auc', n_estimators=200, random_state=42)

stack = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    passthrough=True,
    n_jobs=-1
)

stack.fit(X_processed, y)

# predicting on test set
test_predictions_proba = stack.predict_proba(test_processed)[:, 1]
submission = pd.DataFrame({'id': test['id'], 'song_popularity': test_predictions_proba})
submission.to_csv('submission.csv', index=False)

print("\n✅ Submission file 'submission.csv' created successfully.")
print("\nAverage CV AUC Scores:")
for name, auc in cv_results.items():
    print(f"{name}: {auc:.4f}")



Model performances (10-Fold ROC-AUC):

KNN: 0.5195
RandomForest: 0.5679
GBM: 0.5702
XGBoost: 0.5534
CatBoost: 0.5550
[LightGBM] [Info] Number of positive: 9839, number of negative: 17161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4848
[LightGBM] [Info] Number of data points in the train set: 27000, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364407 -> initscore=-0.556285
[LightGBM] [Info] Start training from score -0.556285
[LightGBM] [Info] Number of positive: 9839, number of negative: 17161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4848
[LightGBM] [Info] Number of data points in the train set: 27000, number of used features: 21
[LightGBM] [Info] [binary:Boos

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




✅ Submission file 'submission.csv' created successfully.

Average CV AUC Scores:
KNN: 0.5195
RandomForest: 0.5679
GBM: 0.5702
XGBoost: 0.5534
CatBoost: 0.5550
LightGBM: 0.5570
NN_TORCH: 0.5062
