In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import json
import os

In [7]:
# Read the raw CSV into `df` and print its first five rows as a quick preview.
df = pd.read_csv('./dataset/data.csv')
print(df.head(n=5))

   Age  SleepHours  TeaCoffeePerDay  ExerciseDays  HealthyMeals  MealsPerDay  \
0   22           5                2             7             1            2   
1   15           8                0             0             0            3   
2   18           4                3             1             0            3   
3   65           8                2             0             3            3   
4   14           7                0             1             1            3   

  HealthStatus  
0      Average  
1      Average  
2         Poor  
3         Good  
4      Average  


In [8]:
# --- Cleaning / imputation -------------------------------------------------
# Trim surrounding whitespace from the label text before anything else.
df["HealthStatus"] = df["HealthStatus"].str.strip()

# Columns whose gaps are filled with that column's own median.
for col in ("Age", "SleepHours", "HealthyMeals"):
    df[col] = df[col].fillna(df[col].median())

# Columns where a missing value is treated as zero.
for col in ("TeaCoffeePerDay", "ExerciseDays"):
    df[col] = df[col].fillna(0)

# MealsPerDay falls back to its most frequent value; a missing label
# falls back to "Average".
df["MealsPerDay"] = df["MealsPerDay"].fillna(df["MealsPerDay"].mode()[0])
df["HealthStatus"] = df["HealthStatus"].fillna("Average")

# Remove exact duplicate rows, then any row that still contains a missing value.
df = df.drop_duplicates().dropna()

# --- Ratio features --------------------------------------------------------
def _ratio_or_zero(numerator, denominator):
    """Element-wise numerator / denominator, or 0 where the denominator is not > 0."""
    return np.where(denominator > 0, numerator / denominator, 0)


# new column -> (numerator column, denominator column); insertion order is the
# order in which the columns are appended to `df`.
ratio_specs = {
    "Meals_to_SleepRatio": ("MealsPerDay", "SleepHours"),
    "Exercise_to_AgeRatio": ("ExerciseDays", "Age"),
    "Caffeine_to_Meals": ("TeaCoffeePerDay", "MealsPerDay"),
}
for new_col, (num_col, den_col) in ratio_specs.items():
    df[new_col] = _ratio_or_zero(df[num_col], df[den_col])

# --- Scaling and artefact persistence --------------------------------------
# Standardise every int64/float64 column in place (this includes the ratio
# features created above; HealthStatus is skipped as long as it is non-numeric).
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
num_features_to_scale = list(numeric_cols)

# NOTE(review): the scaler is fitted on every row of `df`, while the train/test
# split only happens in a later cell, so test rows contribute to the scaling
# statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
df[num_features_to_scale] = scaler.fit_transform(df[num_features_to_scale])

os.makedirs("models", exist_ok=True)

# Persist the fitted scaler and the feature-column order used for training.
joblib.dump(scaler, "models/health_scaler.pkl")
TRAIN_COLUMNS = df.drop(columns=["HealthStatus"]).columns.tolist()
# Use a context manager so the JSON file is flushed and closed deterministically
# (the bare open() previously left the handle to the garbage collector).
with open("models/train_columns.json", "w", encoding="utf-8") as columns_file:
    json.dump(TRAIN_COLUMNS, columns_file)

print(df.head())

Data after preprocessing:
        Age  SleepHours  TeaCoffeePerDay  ExerciseDays  HealthyMeals  \
0 -0.692044   -0.501443        -0.433706      2.438029     -0.497519   
1 -1.470593    1.104873        -1.359927     -0.932412     -1.019913   
2 -1.136929   -1.036881         0.029404     -0.450921     -1.019913   
3  4.090474    1.104873        -0.433706     -0.932412      0.547270   
4 -1.581815    0.569435        -1.359927     -0.450921     -0.497519   

   MealsPerDay HealthStatus  Meals_to_SleepRatio  Exercise_to_AgeRatio  \
0    -0.984251      Average            -0.361427              2.643260   
1     1.016001      Average            -0.499559             -0.849615   
2     1.016001         Poor             1.572425             -0.239748   
3     1.016001         Good            -0.499559             -0.849615   
4     1.016001      Average            -0.203562             -0.065500   

   Caffeine_to_Meals  
0          -0.260193  
1          -1.340997  
2          -0.260193  
3   

In [19]:
# --- Train/test split and model fitting ------------------------------------
# Target is the HealthStatus label; every other column is a feature.
y = df["HealthStatus"]
X = df.drop(columns=["HealthStatus"])

# Encode the string labels as integers and persist the encoder.
# NOTE(review): LabelEncoder numbers classes in sorted label order, and the
# regressors below treat those codes as a numeric scale — confirm that this
# ordering is meaningful for the HealthStatus categories.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
joblib.dump(le, "models/label_encoder.joblib")

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

# Fit both regressors on the training rows ...
lr = LinearRegression().fit(X_train, y_train)
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# ... then predict on the held-out rows.
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)

In [10]:
def print_metrics(name, y_true, y_pred):
    """Print R², MAE, MSE and RMSE for one set of predictions.

    Args:
        name: Label shown in the "<name> Performance:" header line.
        y_true: Ground-truth target values.
        y_pred: Predicted values to score against ``y_true``.

    Returns:
        None. The report is written to stdout.
    """
    r2, mae, mse = (
        metric(y_true, y_pred)
        for metric in (r2_score, mean_absolute_error, mean_squared_error)
    )
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above

    report = [
        f"\n{name} Performance:",
        f"  R²   : {r2:.3f}",
        f"  MAE  : {mae:,.3f}",
        f"  MSE  : {mse:,.3f}",
        f"  RMSE : {rmse:,.3f}",
    ]
    for line in report:
        print(line)

# Score both models against the same held-out targets.
for model_name, predictions in (
    ("Linear Regression", lr_pred),
    ("Random Forest", rf_pred),
):
    print_metrics(model_name, y_test, predictions)


Linear Regression Performance:
  R²   : 0.188
  MAE  : 0.667
  MSE  : 0.519
  RMSE : 0.720

Random Forest Performance:
  R²   : 0.354
  MAE  : 0.478
  MSE  : 0.413
  RMSE : 0.642


In [18]:
# --- Single-row sanity check and model persistence -------------------------
# Pick one held-out row by position and compare both models' predictions
# with its encoded target value.
i = 4
x_one_df, y_true = X_test.iloc[[i]], y_test[i]
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

# Map the encoded target back to its original label for display.
actual_label = le.inverse_transform([y_true])[0]

print("\nSingle-row sanity check:")
print(f"  Actual Value: {y_true} ({actual_label})")
print(f"  LR Pred     : {p_lr_one:.2f}")
print(f"  RF Pred     : {p_rf_one:.2f}")

# Persist both fitted models next to the other artefacts.
joblib.dump(lr, "models/lr_model.joblib")
joblib.dump(rf, "models/rf_model.joblib")


Single-row sanity check:
  Actual Value: 1 (Good)
  LR Pred     : 0.59
  RF Pred     : 0.31


['models/rf_model.joblib']