The notebook is clear, well organized, and easy to follow. I would recommend adding a heatmap that shows the correlation of each feature with BeatsPerMinute, to help identify the strongest predictors for your final model. Consider also adding more headings and descriptions.

In [27]:
# Imports
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load dataset from Kaggle input path
df = pd.read_csv("/kaggle/input/test-and-training-data/train.csv")

# Features and target: the target itself and the 'id' column are removed from
# the feature matrix so that neither is fed to the models.
X = df.drop(columns=['BeatsPerMinute', 'id'])
y = df['BeatsPerMinute']

# Drop low-variance features (threshold 0.01, applied to the raw, unscaled columns).
selector = VarianceThreshold(threshold=0.01)
# fit_transform returns a bare ndarray, so both the surviving column names and
# X's row index are re-attached. Keeping the index guarantees X_filtered stays
# row-aligned with y even if df ever carries a non-default index.
X_filtered = pd.DataFrame(
    selector.fit_transform(X),
    columns=X.columns[selector.get_support()],
    index=X.index,
)

# Log-transform skewed features (example: TrackDurationMs).
# The guard is needed because the variance filter above may have dropped the column.
if 'TrackDurationMs' in X_filtered.columns:
    X_filtered['TrackDurationMs'] = np.log1p(X_filtered['TrackDurationMs'])

# Split into training and validation sets (80/20, fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X_filtered, y, test_size=0.2, random_state=42
)

def fit_and_report(model, label):
    """Fit `model` on the training split and score it on the validation split.

    Reads the notebook-level X_train / y_train / X_val / y_val.

    Parameters
    ----------
    model : object exposing fit(X, y) and predict(X)
        Fitted in place, so the caller's variable refers to the trained model.
    label : str
        Text printed in front of the validation MSE.

    Returns
    -------
    tuple
        (validation predictions, validation mean squared error).
    """
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    val_mse = mean_squared_error(y_val, val_pred)
    print(label, val_mse)
    return val_pred, val_mse


# --- Ridge Regression with scaling and tuned alpha ---
# Scaling lives inside the pipeline, so the scaler is fitted on X_train only.
ridge = make_pipeline(StandardScaler(), Ridge(alpha=10))
y_pred_ridge, mse_ridge = fit_and_report(ridge, "Ridge Regression MSE:")

# --- Random Forest with tuned parameters ---
rf = RandomForestRegressor(
    n_estimators=300, max_depth=20, max_features='sqrt', random_state=42, n_jobs=-1
)
y_pred_rf, mse_rf = fit_and_report(rf, "Random Forest MSE:")

# --- Gradient Boosting with tuned parameters ---
gb = GradientBoostingRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=3, random_state=42
)
y_pred_gb, mse_gb = fit_and_report(gb, "Gradient Boosting MSE:")

# --- Baseline (mean predictor) ---
# Predict the training-set mean for every validation row; any useful model
# should beat this MSE.
# np.full builds the array from the fill value's own (float) dtype, whereas
# np.full_like(y_val, ...) would inherit y_val's dtype and silently truncate
# the mean if the target were stored as integers.
baseline_pred = np.full(len(y_val), y_train.mean())
mse_baseline = mean_squared_error(y_val, baseline_pred)
print("Baseline MSE:", mse_baseline)

# --- Plot predictions ---
# Actual vs. predicted BPM for each model on the validation split, drawn on a
# single axes together with the y = x reference line.
fig, ax = plt.subplots(figsize=(10, 7))

# (predictions, scatter styling) in drawing order.
scatter_specs = [
    (y_pred_rf, dict(alpha=0.5, label="Random Forest Predictions", color='green')),
    (y_pred_gb, dict(alpha=0.5, label="Gradient Boosting Predictions", color='purple')),
    (y_pred_ridge, dict(alpha=0.7, label="Ridge Predictions", color='dodgerblue', marker='x')),
]
for model_pred, style in scatter_specs:
    ax.scatter(y_val, model_pred, **style)

# Points on this diagonal are predictions that equal the actual value.
bpm_lo, bpm_hi = y_val.min(), y_val.max()
ax.plot([bpm_lo, bpm_hi], [bpm_lo, bpm_hi], 'r--', label="Ideal Prediction Line")

ax.set_xlabel("Actual BPM")
ax.set_ylabel("Predicted BPM")
ax.set_title(
    f"Model Comparison\nRidge MSE: {mse_ridge:.2f} | RF MSE: {mse_rf:.2f} | GB MSE: {mse_gb:.2f} | Baseline MSE: {mse_baseline:.2f}"
)
ax.legend(loc="upper left")
ax.grid(True)
plt.show()


Ridge Regression MSE: 699.2481816078694


KeyboardInterrupt: 

In [None]:
# Retrain Random Forest on the full training data.
# X_filtered (not the raw X) is used so the final model sees exactly the
# feature set and transforms it was validated on.
rf.fit(X_filtered, y)

# Load test.csv
# NOTE(review): train.csv is read from /kaggle/input/test-and-training-data/;
# confirm test.csv is actually reachable through this relative path.
test_df = pd.read_csv("test.csv")

# Give the test set the same preprocessing as the training features: keep only
# the columns the model was fitted on (this also removes 'id', which would
# otherwise make predict() fail on a feature mismatch), in the same order,
# and repeat the log transform.
test_features = test_df[X_filtered.columns].copy()
if 'TrackDurationMs' in test_features.columns:
    test_features['TrackDurationMs'] = np.log1p(test_features['TrackDurationMs'])
test_preds = rf.predict(test_features)

# Save submission file, keyed by the test set's own identifiers rather than
# the positional row index; fall back to the index only if there is no 'id'.
# NOTE(review): header 'id' mirrors the column name in train.csv — confirm
# against the competition's sample submission.
test_ids = test_df['id'] if 'id' in test_df.columns else test_df.index
submission = pd.DataFrame({"id": np.asarray(test_ids), "BeatsPerMinute": test_preds})
submission.to_csv("submission.csv", index=False)
