## Imports

In [137]:
 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from typing import Tuple, List, Optional, Dict 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,PowerTransformer, OneHotEncoder, LabelEncoder 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.impute import SimpleImputer 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, f_regression, RFE, mutual_info_regression
from sklearn.decomposition import PCA 
from sklearn.linear_model import LogisticRegression, LinearRegression, LassoCV 
from sklearn.svm import SVC, SVR 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor 
from sklearn.naive_bayes import GaussianNB 
from sklearn.cluster import KMeans 
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,roc_auc_score, roc_curve, confusion_matrix, classification_report, mean_squared_error, mean_absolute_error, r2_score)
from scipy import stats

## Basics

In [138]:
df = pd.read_csv("C.csv")

In [139]:
print(df.describe())
print(df.head())
print(df.info())

            User_ID           Age        Height        Weight      Duration  \
count  1.500000e+04  14998.000000  15000.000000  14995.000000  15000.000000   
mean   1.497749e+07     43.267036    174.457667     69.011204     15.531733   
std    2.872680e+06     46.268160     14.257967      0.922866      8.319143   
min    1.000116e+07      1.000000    123.000000     50.000000      1.000000   
25%    1.247482e+07     28.000000    164.000000     69.000000      8.000000   
50%    1.499893e+07     39.000000    175.000000     69.000000     16.000000   
75%    1.744928e+07     56.000000    185.000000     69.000000     23.000000   
max    1.999965e+07   4566.000000    222.000000    101.000000     30.000000   

         Heart_Rate     Body_Temp      Calories  
count  15000.000000  15000.000000  15000.000000  
mean      95.516533     40.025520     89.539867  
std        9.582873      0.779377     62.446654  
min       67.000000     37.100000      1.000000  
25%       88.000000     39.600000     

## Data Cleaning

In [140]:
X = df.drop(columns=['User_ID', 'Calories'])
Y = df['Calories']

In [141]:
X = X.dropna()
Y = Y[X.index]
# Outlier
z_scores = np.abs(stats.zscore(X.select_dtypes(include=[np.number])))
filtrd_entrs = (z_scores < 3).all(axis=1)
X = X[filtrd_entrs]
Y = Y[filtrd_entrs]

le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])
X.head()

missing_values = df.isnull().sum()
print(missing_values[missing_values])
numeric_imputer = SimpleImputer(strategy='mean')
df[["Age", "Duration", "Heart_Rate"]] = numeric_imputer.fit_transform(df[["Age", "Duration", "Heart_Rate"]])

User_ID     0
Height      0
Age         2
User_ID     0
Duration    0
User_ID     0
User_ID     0
User_ID     0
User_ID     0
dtype: int64


  print(missing_values[missing_values])


In [142]:
print(X.describe())
print(X.head())
print(X.info())

             Gender           Age        Height        Weight      Duration  \
count  14846.000000  14846.000000  14846.000000  14846.000000  14846.000000   
mean       0.496363     42.806345    174.465041     69.000269     15.606224   
std        0.500004     16.983098     14.227345      0.034820      8.273393   
min        0.000000     20.000000    132.000000     67.000000      1.000000   
25%        0.000000     28.000000    164.000000     69.000000      8.000000   
50%        0.000000     40.000000    175.000000     69.000000     16.000000   
75%        1.000000     56.000000    185.000000     69.000000     23.000000   
max        1.000000     79.000000    217.000000     71.000000     30.000000   

         Heart_Rate     Body_Temp  
count  14846.000000  14846.000000  
mean      95.593426     40.038845  
std        9.537488      0.759019  
min       67.000000     37.700000  
25%       88.000000     39.600000  
50%       96.000000     40.200000  
75%      103.000000     40.600000  


## Feature Selection

In [143]:
## Corelation Matrix
corr = X.corr()
print(corr)
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

droppin = [column for column in upper.columns if any(upper[column].abs() > 0.9)]
X = X.drop(columns=droppin)
print(f"Dropped columns: {droppin}")

              Gender       Age    Height    Weight  Duration  Heart_Rate  \
Gender      1.000000  0.002571  0.711106 -0.007682  0.003708    0.011231   
Age         0.002571  1.000000  0.009826 -0.010278  0.011723    0.008974   
Height      0.711106  0.009826  1.000000  0.003146 -0.003766    0.001058   
Weight     -0.007682 -0.010278  0.003146  1.000000 -0.008050   -0.001496   
Duration    0.003708  0.011723 -0.003766 -0.008050  1.000000    0.851712   
Heart_Rate  0.011231  0.008974  0.001058 -0.001496  0.851712    1.000000   
Body_Temp   0.008067  0.009900  0.002316 -0.008297  0.906143    0.773393   

            Body_Temp  
Gender       0.008067  
Age          0.009900  
Height       0.002316  
Weight      -0.008297  
Duration     0.906143  
Heart_Rate   0.773393  
Body_Temp    1.000000  
Dropped columns: ['Body_Temp']


In [144]:
# Mutual Information

k_best = SelectKBest(score_func=mutual_info_regression, k= 3)
k_best.fit(X, Y)
scores = k_best.scores_

feature_scores = pd.Series(scores, index=X.columns)
print(feature_scores.sort_values(ascending=False))

selected_features_mi = X.columns[k_best.get_support()]
# X = X[selected_features_mi]

selected_features_mi

Duration      1.498556
Heart_Rate    0.866576
Age           0.033148
Height        0.010090
Gender        0.007962
Weight        0.000000
dtype: float64


Index(['Age', 'Duration', 'Heart_Rate'], dtype='object')

In [145]:
# Decision tree 
tree = DecisionTreeRegressor()
tree.fit(X, Y)
importances = tree.feature_importances_
feature_importances = pd.Series(importances, index=X.columns)

mean_impt = np.mean(feature_importances)

selected_features_dtree = feature_importances[feature_importances > 0.5].index
# X = X[selected_features_dtree]
# selected_features_dtree

print(feature_importances)
print(mean_impt)

Gender        7.984825e-03
Age           2.666005e-02
Height        2.899658e-03
Weight        9.491414e-07
Duration      9.147734e-01
Heart_Rate    4.768109e-02
dtype: float64
0.16666666666666666


In [146]:
# Check Verience Threshold
selector = VarianceThreshold(threshold=0.1)
selector.fit(X)
variances = selector.variances_
feature_variances = pd.Series(variances, index=X.columns)
print(feature_variances)

selected_features_var = feature_variances[feature_variances > 0.1].index
# X = X[selected_features_var]
selected_features_var

Gender          0.249987
Age           288.406186
Height        202.403702
Weight          0.001212
Duration       68.444428
Heart_Rate     90.957559
dtype: float64


Index(['Gender', 'Age', 'Height', 'Duration', 'Heart_Rate'], dtype='object')

# Model Training

In [147]:
selected_features = ["Age", "Duration", "Heart_Rate"]
X = df[selected_features]
Y = df['Calories']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
score = model.score(X_test, y_test)

score

0.9363485916822804