In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
import time
import itertools

#Define dataset and clean
cal = pd.read_csv("Downloads/calories.csv")
cal['Gender'] = cal['Gender'].map({'male': 0, 'female': 1})
cal=cal.drop(["User_ID"], axis=1)
cal.describe()

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,0.503533,42.7898,174.465133,74.966867,15.5306,95.518533,40.025453,89.539533
std,0.500004,16.980264,14.258114,15.035657,8.319203,9.583328,0.77923,62.456978
min,0.0,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,0.0,28.0,164.0,63.0,8.0,88.0,39.6,35.0
50%,1.0,39.0,175.0,74.0,16.0,96.0,40.2,79.0
75%,1.0,56.0,185.0,87.0,23.0,103.0,40.6,138.0
max,1.0,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [7]:
#Verify no NA values
len(cal)
cal.isnull().sum()

User_ID       0
Gender        0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [2]:
#Best Subset Selection data prep
Xn = cal.drop(columns = ["Calories"])
Xn = sm.add_constant(Xn)

yn = cal["Calories"]

In [6]:
#make function to select on
def processSubset(feature_set):
    # Fit model on feature_set and calculate RSS
    if 'const' not in feature_set:
      feature_set = feature_set + ('const',) # mannually add intercept, may need to change to ('const',) if it is giving you the tuple error!!
    model = sm.OLS(yn,Xn[list(feature_set)])
    regr = model.fit()
    R_sq = regr.rsquared
    return {"model":regr, "R_sq":R_sq}

In [7]:
#make function to select best subset
def getBest(k):

    tic = time.time()

    results = []

    for combo in itertools.combinations(Xn.columns, k):
        results.append(processSubset(combo))

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the highest R Squared
    best_model = models.loc[models['R_sq'].argmax()]

    toc = time.time()
    print("Processed", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with some other useful information about the model
    return best_model

In [13]:
# Run best subset selection model
models_best = pd.DataFrame(columns=["R_sq", "model"])

tic = time.time()

for i in range(1,9):
    models_best.loc[i] = getBest(i)

toc = time.time()
print("Total elapsed time:", (toc-tic), "seconds.")

print(getBest(i)["model"].summary())

Processed 8 models on 1 predictors in 0.04212021827697754 seconds.
Processed 28 models on 2 predictors in 0.10476541519165039 seconds.
Processed 56 models on 3 predictors in 0.22306609153747559 seconds.
Processed 70 models on 4 predictors in 0.4307708740234375 seconds.
Processed 56 models on 5 predictors in 0.3883237838745117 seconds.
Processed 28 models on 6 predictors in 0.193436861038208 seconds.
Processed 8 models on 7 predictors in 0.07471513748168945 seconds.
Processed 1 models on 8 predictors in 0.012114763259887695 seconds.
Total elapsed time: 1.5950770378112793 seconds.
Processed 1 models on 8 predictors in 0.0064046382904052734 seconds.
                            OLS Regression Results                            
Dep. Variable:               Calories   R-squared:                       0.967
Model:                            OLS   Adj. R-squared:                  0.967
Method:                 Least Squares   F-statistic:                 6.316e+04
Date:                Wed, 09 

In [None]:
#Manually running OLS to determine if difference is minute. If so, a simpler model may be more usable in the field. 

In [9]:
# Define dependent and independent variables
y = cal['Calories']
X = cal['Duration']

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Create and fit the OLS model
model = sm.OLS(y, X)
results = model.fit()

# Print the results
print(results.summary())
#0.913

                            OLS Regression Results                            
Dep. Variable:               Calories   R-squared:                       0.913
Model:                            OLS   Adj. R-squared:                  0.913
Method:                 Least Squares   F-statistic:                 1.571e+05
Date:                Mon, 24 Mar 2025   Prob (F-statistic):               0.00
Time:                        14:07:56   Log-Likelihood:                -65002.
No. Observations:               15000   AIC:                         1.300e+05
Df Residuals:                   14998   BIC:                         1.300e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -21.8597      0.319    -68.550      0.0

AttributeError: 'OLS' object has no attribute 'rsquared_adj'

In [11]:
# Define dependent and independent variables
y = cal['Calories']
X = cal[['Duration', "Heart_Rate"]]

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Create and fit the OLS model
model = sm.OLS(y, X)
results = model.fit()

# Print the results
print(results.summary())
#0.938

                            OLS Regression Results                            
Dep. Variable:               Calories   R-squared:                       0.938
Model:                            OLS   Adj. R-squared:                  0.938
Method:                 Least Squares   F-statistic:                 1.137e+05
Date:                Mon, 24 Mar 2025   Prob (F-statistic):               0.00
Time:                        14:11:11   Log-Likelihood:                -62432.
No. Observations:               15000   AIC:                         1.249e+05
Df Residuals:                   14997   BIC:                         1.249e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -181.1795      2.053    -88.265      0.0

In [15]:
# Define dependent and independent variables
y = cal['Calories']
X = cal[['Duration', "Heart_Rate", "Age"]]

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Create and fit the OLS model
model = sm.OLS(y, X)
results = model.fit()

# Print the results
print(results.summary())
#0.958

                            OLS Regression Results                            
Dep. Variable:               Calories   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                 1.148e+05
Date:                Wed, 09 Apr 2025   Prob (F-statistic):               0.00
Time:                        13:44:22   Log-Likelihood:                -59474.
No. Observations:               15000   AIC:                         1.190e+05
Df Residuals:                   14996   BIC:                         1.190e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -203.5339      1.706   -119.321      0.0

In [24]:
y=cal['Calories']
X=cal[["Duration", "Heart_Rate", "Age", "Gender"]]

X = sm.add_constant(X)

model=sm.OLS(y, X)
results=model.fit()

print(results.summary())
#0.959

                            OLS Regression Results                            
Dep. Variable:               Calories   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.959
Method:                 Least Squares   F-statistic:                 8.667e+04
Date:                Mon, 24 Mar 2025   Prob (F-statistic):               0.00
Time:                        14:46:45   Log-Likelihood:                -59428.
No. Observations:               15000   AIC:                         1.189e+05
Df Residuals:                   14995   BIC:                         1.189e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -202.2488      1.706   -118.564      0.0

In [23]:
#The difference in R squared is quite small when moving between three and four variables. When looking for ease of application, companies may consider using a smaller model with only the variables Duration, Heart_Rate, and Age. 

Unnamed: 0,Duration,Heart_Rate,Age,Gender
count,15000.0,15000.0,15000.0,15000.0
mean,15.5306,95.518533,42.7898,0.503533
std,8.319203,9.583328,16.980264,0.500004
min,1.0,67.0,20.0,0.0
25%,8.0,88.0,28.0,0.0
50%,16.0,96.0,39.0,1.0
75%,23.0,103.0,56.0,1.0
max,30.0,128.0,79.0,1.0


In [1]:
git --version


NameError: name 'git' is not defined