In [4]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge

import kagglehub


path = kagglehub.dataset_download("ruchikakumbhar/calories-burnt-prediction")
print("Path to dataset files:", path)

csvs = []
for root, dirs, files in os.walk(path):
    for f in files:
        if f.endswith(".csv"):
            csvs.append(os.path.join(root, f))

csvs = sorted(csvs)
print("CSV files found:")
for c in csvs:
    print("-", os.path.basename(c))


def pick_by_keyword(files, keyword):
    for f in files:
        if keyword in os.path.basename(f).lower():
            return f
    return None


exercise_path = pick_by_keyword(csvs, "exercise")
calories_path = pick_by_keyword(csvs, "calor")

if exercise_path and calories_path and exercise_path == calories_path:
    others = [f for f in csvs if f != exercise_path]
    calories_path = others[0] if others else None

if exercise_path is None:
    exercise_path = csvs[0] if csvs else None

if calories_path is None and len(csvs) > 1:
    candidates = [f for f in csvs if f != exercise_path]
    calories_path = candidates[0] if candidates else None

print("Exercise file:", exercise_path)
print("Calories file:", calories_path)


if exercise_path is None:
    raise ValueError("No CSV files found in dataset folder.")


exercise = pd.read_csv(exercise_path)
calories = None

if calories_path and os.path.exists(calories_path) and calories_path != exercise_path:
    calories = pd.read_csv(calories_path)


df = exercise.copy()

if calories is not None:
    common_cols = set(df.columns).intersection(set(calories.columns))
    key = "User_ID" if "User_ID" in common_cols else None
    if key:
        df = df.merge(calories, on=key, how="inner")


cols_lower = {c.lower(): c for c in df.columns}

target_col = None

for name in ["calories", "calorie", "burned_calories"]:
    if name in cols_lower:
        target_col = cols_lower[name]
        break

if target_col is None:
    calor_like = [c for c in df.columns if "calor" in c.lower()]
    if len(calor_like) > 0:
        target_col = calor_like[0]

if target_col is None:
    raise ValueError(f"Target column not found. Columns are: {df.columns.tolist()}")

print("Target column:", target_col)


id_cols = [c for c in df.columns if c.lower() in ["user_id", "id", "userid"]]
df_model = df.drop(columns=id_cols, errors="ignore").copy()

X = df_model.drop(columns=[target_col])
y = df_model[target_col]

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()


numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="drop"
)

clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", HuberRegressor())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE :", round(mae, 3))
print("RMSE:", round(rmse, 3))
print("R2  :", round(r2, 3))


baseline_lr = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

baseline_ridge = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", Ridge(alpha=1.0))
])

baseline_lr.fit(X_train, y_train)
baseline_ridge.fit(X_train, y_train)

pred_lr = baseline_lr.predict(X_test)
pred_ridge = baseline_ridge.predict(X_test)

print("LinearRegression R2:", round(r2_score(y_test, pred_lr), 3))
print("Ridge R2          :", round(r2_score(y_test, pred_ridge), 3))


sample = {}

if "Age" in X.columns: sample["Age"] = 25
if "Weight" in X.columns: sample["Weight"] = 70
if "Height" in X.columns: sample["Height"] = 175
if "Duration" in X.columns: sample["Duration"] = 60

if "Heart_Rate" in X.columns:
    sample["Heart_Rate"] = 130
elif "Heart Rate" in X.columns:
    sample["Heart Rate"] = 130

activity_col = None
for c in X.columns:
    if c.lower() in ["activity", "activity_type", "exercise_type"]:
        activity_col = c
        break

if activity_col:
    sample[activity_col] = "Brisk Walking"

if "Gender" in X.columns and "Gender" not in sample:
    sample["Gender"] = "male"

if "Body_Temp" in X.columns and "Body_Temp" not in sample:
    sample["Body_Temp"] = np.nan

sample_df = pd.DataFrame([sample], columns=X.columns)

pred_cal = clf.predict(sample_df)[0]
print("Predicted Calories Burned:", round(float(pred_cal), 2))


Using Colab cache for faster access to the 'calories-burnt-prediction' dataset.
Path to dataset files: /kaggle/input/calories-burnt-prediction
CSV files found:
- calories.csv
Exercise file: /kaggle/input/calories-burnt-prediction/calories.csv
Calories file: /kaggle/input/calories-burnt-prediction/calories.csv
Target column: Calories
MAE : 8.254
RMSE: 11.913
R2  : 0.965
LinearRegression R2: 0.967
Ridge R2          : 0.967
Predicted Calories Burned: 428.76
