The notebook is clear and organized and easy to follow. I would recommend using a heatmap which can show the correlation to BeatsPerMinutes to help see the strongest predictors for your final model. Possibly add more headings and descriptions.

In [52]:
# Imports
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [53]:
# Load dataset
df = pd.read_csv("/kaggle/input/test-and-training-data/train.csv")

In [54]:
df.head()


Unnamed: 0,id,RhythmScore,AudioLoudness,VocalContent,AcousticQuality,InstrumentalScore,LivePerformanceLikelihood,MoodScore,TrackDurationMs,Energy,BeatsPerMinute
0,0,0.60361,-7.636942,0.0235,5e-06,1e-06,0.051385,0.409866,290715.645,0.826267,147.5302
1,1,0.639451,-16.267598,0.07152,0.444929,0.349414,0.170522,0.65101,164519.5174,0.1454,136.15963
2,2,0.514538,-15.953575,0.110715,0.173699,0.453814,0.029576,0.423865,174495.5667,0.624667,55.31989
3,3,0.734463,-1.357,0.052965,0.001651,0.159717,0.086366,0.278745,225567.4651,0.487467,147.91212
4,4,0.532968,-13.056437,0.0235,0.068687,1e-06,0.331345,0.477769,213960.6789,0.947333,89.58511


In [55]:
# Features and target 
# Drop target + id column right away
X_raw = df.drop(columns=['BeatsPerMinute', 'id'])
y = df['BeatsPerMinute']

In [56]:
print (y.describe())

count    524164.000000
mean        119.034899
std          26.468077
min          46.718000
25%         101.070410
50%         118.747660
75%         136.686590
max         206.037000
Name: BeatsPerMinute, dtype: float64


In [57]:
print(y['BeatsPerMinute'].mean)

KeyError: 'BeatsPerMinute'

In [None]:
print (X_raw.shape)

In [None]:
#Drop low-variance features
selector = VarianceThreshold(threshold=0.01)
X_filtered = pd.DataFrame(
    selector.fit_transform(X_raw),
    columns=X_raw.columns[selector.get_support()]
)

In [None]:
print (X_filtered.shape)

In [None]:
X_raw['TrackDurationMs'].info()

In [None]:
X_raw['TrackDurationMs'].describe()

In [None]:
# Feature engineering: log-transform skewed features
if 'TrackDurationMs' in X_filtered.columns:
    X_filtered['TrackDurationMs'] = np.log1p(X_filtered['TrackDurationMs'])
    #change to seconds

In [None]:
# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_filtered, y, test_size=0.2, random_state=42
)

In [None]:
print (X_val.shape)
print (X_val.head)

In [None]:
print (X_train.shape)
print (X_train.head)
print(X_train.columns)

In [None]:
print (y_val.shape)
print (y_val.head)

In [None]:
print (y_train.shape)
print (y_train.head)
print(y_train.columns)

In [None]:
# Ridge Regression (scaled + tuned alpha)
ridge = make_pipeline(StandardScaler(), Ridge(alpha=10))
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_val)
mse_ridge = mean_squared_error(y_val, y_pred_ridge)
print("Ridge Regression MSE:", mse_ridge)

In [None]:
# Random Forest 
rf = RandomForestRegressor(
    n_estimators=300, max_depth=20, max_features='sqrt',
    random_state=42, n_jobs=2
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
mse_rf = mean_squared_error(y_val, y_pred_rf)
print("Random Forest MSE:", mse_rf)

In [None]:
# Gradient Boosting 
gb = GradientBoostingRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=3, random_state=42
)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_val)
mse_gb = mean_squared_error(y_val, y_pred_gb)
print("Gradient Boosting MSE:", mse_gb)

In [None]:
# Baseline (mean predictor) 
baseline_pred = np.full_like(y_val, y_train.mean())
mse_baseline = mean_squared_error(y_val, baseline_pred)
print("Baseline MSE:", mse_baseline)

In [None]:
# --- Plot predictions ---
plt.figure(figsize=(10, 7))
plt.scatter(y_val, y_pred_rf, alpha=0.5, label="Random Forest Predictions", color='green')
plt.scatter(y_val, y_pred_gb, alpha=0.5, label="Gradient Boosting Predictions", color='purple')
plt.scatter(y_val, y_pred_ridge, alpha=0.7, label="Ridge Predictions", color='dodgerblue', marker='x')
plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--', label="Ideal Prediction Line")
plt.xlabel("Actual BPM")
plt.ylabel("Predicted BPM")
plt.title(
    f"Model Comparison\nRidge MSE: {mse_ridge:.2f} | RF MSE: {mse_rf:.2f} | GB MSE: {mse_gb:.2f} | Baseline MSE: {mse_baseline:.2f}"
)
plt.legend(loc="upper left")
plt.grid(True)
plt.show()


In [None]:
# Retrain Random Forest on full training data
rf.fit(X, y)

# Load test.csv and predict
test_df = pd.read_csv("test.csv")
test_preds = rf.predict(test_df)

# Save submission file
submission = pd.DataFrame({"Id": test_df.index, "BeatsPerMinute": test_preds})
submission.to_csv("submission.csv", index=False)
