In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from typing import Any, Tuple
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import csv 
import joblib
from dateutil import parser
import optuna
import lightgbm as lgb

In [3]:
def _coerce_numeric(value: Any) -> Any:
    """Convert a CSV cell to ``int``/``float`` when it is a plain decimal number.

    Accepts an optional leading sign and at most one decimal point. Anything
    else (text, empty strings, non-string values) is returned unchanged.
    """
    if not isinstance(value, str):
        # csv.DictReader pads short rows with None; leave such cells untouched.
        return value
    unsigned = value[1:] if value.startswith(("+", "-")) else value
    digits = unsigned.replace(".", "", 1)
    # isascii() rejects Unicode digits (e.g. superscripts) that int() cannot parse.
    if not (digits.isascii() and digits.isdigit()):
        return value
    return float(value) if "." in unsigned else int(value)


def load_data(dataset_path: str, is_eval_dataset=False, is_handball_dataset=False) -> Tuple[pd.DataFrame, pd.DataFrame, int, float, int]:
    """Load a calories CSV file and split it into features and target.

    Args:
        dataset_path: Path of the CSV file to read (UTF-8, with a header row).
        is_eval_dataset: When True the target column is not read from the file
            and ``y`` is filled with zeros.
        is_handball_dataset: When True the ``Gender`` column is not encoded
            (it is not accessed at all).

    Returns:
        A 5-tuple ``(X, y, no_males, avg_duration, no_senior_users)``:
        the feature frame, a single-column ``Calories`` frame, the number of
        rows whose gender is ``'male'``, the mean ``Duration`` rounded to two
        decimals (0.0 for an empty file) and the number of rows with
        ``Age >= 75``.

    Raises:
        KeyError: If a row lacks ``Duration``/``Age`` (or ``Gender`` /
            ``Calories`` when those columns are required).
    """
    # Columns considered irrelevant for calories prediction
    columns_to_drop = ["User_ID"]
    target = "Calories"

    total_duration = 0
    no_males = 0
    no_senior_users = 0

    # Load dataset from CSV file; newline="" is what the csv module expects
    dataset = []
    with open(dataset_path, "r", encoding="utf-8", newline="") as file:
        for raw_row in csv.DictReader(file):
            # Drop irrelevant columns (tolerating their absence) and convert
            # numeric-looking cells to int/float in one pass
            row = {
                key: _coerce_numeric(value)
                for key, value in raw_row.items()
                if key not in columns_to_drop
            }

            # Encode gender if available: male -> 0, anything else -> 1
            if not is_handball_dataset:
                if row["Gender"] == 'male':
                    no_males += 1
                    row["Gender"] = 0
                else:
                    row["Gender"] = 1
            dataset.append(row)

            total_duration += row['Duration']

            if row['Age'] >= 75:
                no_senior_users += 1

    # Average duration over all rows; guard against a header-only file
    avg_duration = round(total_duration / len(dataset), 2) if dataset else 0.0

    # Separate features (X) and target (y)
    X = [{key: value for key, value in row.items() if key != target} for row in dataset]
    y = [0 if is_eval_dataset else row[target] for row in dataset]

    return pd.DataFrame(X), pd.DataFrame(y, columns=['Calories']), no_males, avg_duration, no_senior_users

# Read the complete training set together with its summary statistics
X, y, no_males, avg_duration, no_senior_users = load_data("dataset_train.csv")

# Write those statistics to output_0.csv: number of samples, number of males,
# average duration and number of senior users
with open("output_0.csv", "w") as file:
    file.writelines([
        # Header row
        "Samples,No. Males,Average Duration,SeniorUsers\n",
        # Single data row
        f"{len(X)},{no_males},{avg_duration},{no_senior_users}\n",
    ])

In [4]:
# Inspect a copy so that anything done to `df` leaves the feature frame X untouched
df = X.copy()

In [5]:
# Display the feature frame using the notebook's rich repr
df

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,0,79,165.0,73.0,13.0,79.0,40.0
1,1,28,148.0,48.0,6.0,91.0,39.4
2,0,27,187.0,82.0,25.0,101.0,40.8
3,0,40,173.0,71.0,16.0,93.0,40.4
4,0,40,183.0,90.0,5.0,80.0,39.0
...,...,...,...,...,...,...,...
8995,0,38,181.0,82.0,21.0,103.0,40.5
8996,1,63,172.0,70.0,3.0,85.0,38.6
8997,1,30,168.0,70.0,28.0,103.0,41.1
8998,1,45,164.0,61.0,5.0,84.0,39.1


In [6]:
# Summarise column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      9000 non-null   int64  
 1   Age         9000 non-null   int64  
 2   Height      9000 non-null   float64
 3   Weight      9000 non-null   float64
 4   Duration    9000 non-null   float64
 5   Heart_Rate  9000 non-null   float64
 6   Body_Temp   9000 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 492.3 KB


In [7]:
def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every numeric column of ``df`` into the range [0, 1].

    Each column is scaled with its own minimum and maximum as found in ``df``.
    Non-numeric columns are copied unchanged and ``df`` itself is not modified.

    A constant column (max == min) is mapped to 0.0 rather than dividing by
    zero, which would turn every value into NaN.

    Args:
        df: Frame to normalise.

    Returns:
        A new frame with the same columns and index as ``df``.
    """
    df_normalized = df.copy()
    for column in df.select_dtypes(include=['number']).columns:
        col_min = df[column].min()
        col_range = df[column].max() - col_min
        shifted = df[column] - col_min
        if col_range == 0:
            # Multiplying by 0.0 keeps a float dtype and preserves existing NaNs
            df_normalized[column] = shifted * 0.0
        else:
            df_normalized[column] = shifted / col_range
    return df_normalized

In [8]:
from sklearn.model_selection import RepeatedKFold
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import numpy as np

# Hold out 20% of the rows with a fixed seed so the split is reproducible.
# These module-level splits are read by objective() below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    """Optuna objective: train an LGBMRegressor and return its hold-out MAE.

    Reads the module-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for scoring.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean absolute error on the hold-out split (lower is better).
    """
    # Hyperparameters to tune. suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform calls (log=True gives log-uniform).
    param = {
        'objective': 'regression',  # Regression task
        'metric': 'mae',  # Mean Absolute Error
        'verbosity': -1,
        'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM presumably ignores subsample unless
        # subsample_freq > 0 is also set — confirm against the LightGBM docs.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)
    }

    # Initialize and train regressor
    model = lgb.LGBMRegressor(**param)
    model.fit(X_train, y_train)

    # Get predictions
    y_pred = model.predict(X_test)

    # Compute MAE (lower is better)
    mae = mean_absolute_error(y_test, y_pred)
    return mae

def train_model(X: pd.DataFrame, y: pd.DataFrame) -> Any:
    """Tune LightGBM hyper-parameters with Optuna and fit the final regressor.

    The features are min-max normalised and the ``Height``, ``Body_Temp`` and
    ``Heart_Rate`` columns are dropped before the final fit.

    Args:
        X: Feature frame.
        y: Target values (a single-column frame).

    Returns:
        The LGBMRegressor fitted on all of ``X``/``y`` with the best
        hyper-parameters found by the study.
    """
    # More pre-processing first
    X = normalize_data(X)
    X = X.drop(columns=['Height', 'Body_Temp', 'Heart_Rate'], errors='ignore')

    # NOTE(review): `objective` trains on the module-level X_train/X_test split,
    # which comes from the raw feature frame rather than the normalised and
    # reduced one built here — confirm that tuning on different features is
    # intended.
    study = optuna.create_study(direction='minimize')  # Minimize the objective value
    study.optimize(objective, n_trials=50)
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    best_params = study.best_trial.params
    # Calories is a continuous target and the search tunes an LGBMRegressor,
    # so the final estimator must be a regressor as well (not a classifier).
    model = lgb.LGBMRegressor(**best_params)
    # Pass the target as a 1-D array rather than an (n, 1) column vector
    model.fit(X, y.to_numpy().ravel())

    return model

# Run the hyper-parameter search and fit the final model on the full training set
trained_model = train_model(X, y)

[I 2025-03-19 22:28:12,769] A new study created in memory with name: no-name-8bd130b2-ef54-49f9-83d5-f958b89906e6
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0)
found 0 physical cores < 1
  File "C:\Users\retr0\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
[I 2025-03-19 22:28:12,988] Trial 0 finished with value: 5.067591745398576 and parameters: {'max_depth': 4, 'num_leaves': 130, 'learning_rate': 0.014157371421771264, 'n_estimators': 209, 'min_child_samples': 12, 'subsample': 0.7333407475479692, 'colsample_bytree': 0.8118589532065654, 'reg_alpha': 7.03918467245739, '

Best trial:
  Value: 1.1206370887520352
  Params: 
    max_depth: 4
    num_leaves: 59
    learning_rate: 0.23515872968382726
    n_estimators: 769
    min_child_samples: 6
    subsample: 0.8566538961418617
    colsample_bytree: 0.5752427792639649
    reg_alpha: 1.081563813621406
    reg_lambda: 6.273985246890202e-08


In [9]:
# Show the fitted estimator's repr
trained_model

In [10]:
def predict_calories(trained_model: Any, dataset_path: str, output_file: str, gender_input = True) -> pd.DataFrame:
    """Predict calories for an evaluation CSV and write them to ``output_file``.

    The evaluation features go through the same steps as in training:
    min-max normalisation followed by dropping ``Height``, ``Body_Temp`` and
    ``Heart_Rate``.

    Args:
        trained_model: Fitted estimator exposing ``predict``.
        dataset_path: Path of the evaluation CSV file.
        output_file: Path of the CSV file to write; it gets a ``Calories``
            header followed by one prediction per line.
        gender_input: When False the dataset is loaded without gender encoding
            and a constant ``Gender`` column of 0 is added instead.

    Returns:
        A single-column ``Calories`` frame holding the predictions that were
        written to ``output_file``.
    """
    # Load the evaluation dataset (targets and summary statistics are not needed)
    X_eval, _, _, _, _ = load_data(dataset_path, is_eval_dataset=True, is_handball_dataset=not gender_input)

    # NOTE(review): normalisation uses the evaluation set's own min/max, not the
    # training set's — confirm this is intended.
    X_eval = normalize_data(X_eval)
    X_eval = X_eval.drop(columns=['Height', 'Body_Temp', 'Heart_Rate'], errors='ignore')

    if not gender_input:
        X_eval['Gender'] = 0
        # Move Gender column to beginning to match training set-up
        X_eval = X_eval[['Gender'] + [col for col in X_eval.columns if col != 'Gender']]

    # Make predictions
    y_pred = trained_model.predict(X_eval)

    # Save the predictions as a single-column CSV (written without pandas)
    with open(output_file, "w") as file:
        # Write the header
        file.write("Calories\n")
        # Write the predictions
        for pred in y_pred:
            file.write(str(pred) + "\n")

    # Honour the declared return type instead of implicitly returning None
    return pd.DataFrame({"Calories": y_pred})
    
    
# Optional (disabled): load a model from disk instead of using the in-memory one
# trained_model = joblib.load("Output_CandidatX/trained_model.pkl")
# Predict on the task 1 evaluation set and write the results to output_1.csv
predict_calories(trained_model, "task1_dataset_eval.csv", "output_1.csv")

In [11]:
# Predict on the task 2 evaluation set and write the results to output_2.csv.
# gender_input=False skips gender encoding and fills Gender with 0
# (presumably this dataset has no Gender column — confirm).
predict_calories(trained_model, "task2_dataset_eval.csv", "output_2.csv", gender_input=False)