In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from typing import Any, Tuple
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import csv 
import joblib
from dateutil import parser

In [4]:
def load_data(dataset_path: str, is_eval_dataset=False, is_handball_dataset=False) -> Tuple[pd.DataFrame, pd.DataFrame, int, float, int]:
    """Load a calories CSV and split it into features, target and summary statistics.

    The file is read with ``csv.DictReader`` (no pandas parsing). For every row the
    ``User_ID`` column is discarded when present, values that look like unsigned
    numbers are converted to ``int``/``float``, and ``Gender`` is encoded as
    ``0`` for ``'male'`` and ``1`` for anything else.

    Args:
        dataset_path: Path of the CSV file to read.
        is_eval_dataset: When True the target column is not read; the returned
            target frame is filled with ``0`` placeholders instead.
        is_handball_dataset: When True the rows are expected to have no
            ``Gender`` column, so gender encoding and male counting are skipped.

    Returns:
        A 5-tuple ``(X, y, no_males, avg_duration, no_senior_users)``:
        the feature frame, a one-column ``Calories`` frame, the number of rows
        whose gender is ``'male'``, the mean ``Duration`` rounded to two
        decimals (``0.0`` for a file without data rows) and the number of rows
        with ``Age >= 75``.

    Raises:
        KeyError: If a row lacks ``Duration`` or ``Age``, lacks ``Gender`` while
            ``is_handball_dataset`` is False, or lacks ``Calories`` while
            ``is_eval_dataset`` is False.
    """
    # Columns considered irrelevant for calories prediction
    columns_to_drop = ["User_ID"]
    target = "Calories"

    total_duration = 0
    no_males = 0
    no_senior_users = 0

    # Load dataset from CSV file (newline="" is what the csv module expects)
    dataset = []
    with open(dataset_path, "r", encoding="utf-8", newline="") as file:
        reader = csv.DictReader(file)
        for row in reader:
            # Drop irrelevant columns; tolerate files that do not carry them
            for col in columns_to_drop:
                row.pop(col, None)

            # Convert numeric-looking strings to int or float. Only values are
            # reassigned, so mutating the dict while iterating is safe.
            for key, value in row.items():
                if isinstance(value, str) and value.replace('.', '', 1).isdigit():
                    row[key] = float(value) if '.' in value else int(value)

            # Encode gender if available: 'male' -> 0, anything else -> 1
            if not is_handball_dataset:
                if row["Gender"] == 'male':
                    no_males += 1
                    row["Gender"] = 0
                else:
                    row["Gender"] = 1

            dataset.append(row)
            total_duration += row['Duration']

            if row['Age'] >= 75:
                no_senior_users += 1

    # Average duration over all rows; an empty file yields 0.0 instead of a
    # ZeroDivisionError
    avg_duration = round(total_duration / len(dataset), 2) if dataset else 0.0

    # Separate features (X) and target (y)
    X = [{key: value for key, value in row.items() if key != target} for row in dataset]
    y = [0 if is_eval_dataset else row[target] for row in dataset]

    return pd.DataFrame(X), pd.DataFrame(y, columns=['Calories']), no_males, avg_duration, no_senior_users

# Read the training CSV into the feature frame X, the target frame y and the
# three summary statistics computed by load_data
X, y, no_males, avg_duration, no_senior_users = load_data("dataset_train.csv")

# Persist the dataset summary to output_0.csv: one header line followed by one
# line with the sample count, male count, average duration and senior-user count
summary_header = "Samples,No. Males,Average Duration,SeniorUsers\n"
summary_values = f"{len(X)},{no_males},{avg_duration},{no_senior_users}\n"
with open("output_0.csv", "w") as file:
    file.writelines([summary_header, summary_values])

In [16]:
# Explore an independent copy so that inspection cannot alter the feature frame X
df = X.copy(deep=True)

In [18]:
# Bare last expression: renders the copied feature frame as the cell output
df

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,0,79,165.0,73.0,13.0,79.0,40.0
1,1,28,148.0,48.0,6.0,91.0,39.4
2,0,27,187.0,82.0,25.0,101.0,40.8
3,0,40,173.0,71.0,16.0,93.0,40.4
4,0,40,183.0,90.0,5.0,80.0,39.0
...,...,...,...,...,...,...,...
8995,0,38,181.0,82.0,21.0,103.0,40.5
8996,1,63,172.0,70.0,3.0,85.0,38.6
8997,1,30,168.0,70.0,28.0,103.0,41.1
8998,1,45,164.0,61.0,5.0,84.0,39.1


In [20]:
# Print the column names, non-null counts and dtypes of the feature frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      9000 non-null   int64  
 1   Age         9000 non-null   int64  
 2   Height      9000 non-null   float64
 3   Weight      9000 non-null   float64
 4   Duration    9000 non-null   float64
 5   Heart_Rate  9000 non-null   float64
 6   Body_Temp   9000 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 492.3 KB


In [22]:
def normalize_data(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every numeric column of ``df``.

    Each numeric column is mapped with ``(value - min) / (max - min)`` using the
    minimum and maximum of that column in ``df`` itself. A column whose values
    are all equal has a zero range; it is mapped to ``0.0`` instead of the
    ``NaN`` that a 0/0 division would produce. Non-numeric columns are copied
    unchanged and the input frame is not modified.

    Args:
        df: Frame to scale.

    Returns:
        A new frame with the same columns and index as ``df``.
    """
    df_normalized = df.copy()
    for column in df.select_dtypes(include=['number']).columns:
        col_min = df[column].min()
        col_range = df[column].max() - col_min
        # Dividing by 1 for a zero range keeps constant columns at 0.0; a NaN
        # range (empty or all-NaN column) is left alone and still yields NaN.
        scale = col_range if col_range != 0 else 1
        df_normalized[column] = (df[column] - col_min) / scale
    return df_normalized

In [30]:
from sklearn.model_selection import RepeatedKFold
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import numpy as np

def train_model(X: pd.DataFrame, y: pd.DataFrame) -> Any:
    """Cross-validate a LightGBM regressor and return the last fitted fold model.

    The features are min-max scaled with ``normalize_data``, the ``Height``,
    ``Body_Temp`` and ``Heart_Rate`` columns are dropped when present, and 20%
    of the rows are held out. The remaining rows go through repeated K-fold
    cross-validation (10 splits x 3 repeats); every fold fits a fresh
    ``LGBMRegressor`` with early stopping on its validation part. Per-fold MAE,
    the out-of-fold MAE and the MAE of the fold-averaged hold-out predictions
    are printed.

    Args:
        X: Feature frame.
        y: Target values aligned with ``X`` (one-column frame).

    Returns:
        The ``LGBMRegressor`` fitted on the final cross-validation fold.
    """
    # More pre-processing first
    # NOTE(review): scaling uses the min/max of the whole of X, i.e. it is
    # computed before the train/hold-out split.
    X = normalize_data(X)
    X = X.drop(columns=['Height', 'Body_Temp', 'Heart_Rate'], errors='ignore')

    # Split the dataset into training and hold-out sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    n_splits = 10
    n_repeats = 3
    cv = RepeatedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=42)
    # Total number of fits is n_splits * n_repeats, not n_splits
    n_folds = cv.get_n_splits()

    oof_predictions = np.zeros(X_train.shape[0])
    test_predictions = np.zeros(X_test.shape[0])

    model_params = {
        'n_jobs': -1,
        'learning_rate': 0.15,
        'max_bin': 100,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'objective': 'regression_l1',
        'metric': 'mae',
        'verbosity': -1,
        'n_estimators': 2000,
        'random_state': 42
    }

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), 1):
        print(f'Fold {fold}')

        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        # Flatten the one-column target to 1-D, the shape the estimator expects
        y_tr = np.ravel(y_train.iloc[train_idx])
        y_val = np.ravel(y_train.iloc[val_idx])

        model = LGBMRegressor(**model_params)

        # Train model, stopping once the validation MAE stalls for 200 rounds
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='mae',
            callbacks=[
                early_stopping(stopping_rounds=200),
                log_evaluation(period=100)
            ]
        )

        # Every training row is validated once per repeat, so average the
        # out-of-fold predictions over the repeats instead of overwriting them
        pred = model.predict(X_val)
        oof_predictions[val_idx] += pred / n_repeats

        # Average hold-out predictions over all fitted models
        test_predictions += model.predict(X_test) / n_folds

        # Compute and print fold MAE
        fold_mae = mean_absolute_error(y_val, pred)
        print(f'Fold {fold} MAE: {fold_mae}')

    # Report the aggregated scores that the loop accumulated
    print(f'OOF MAE: {mean_absolute_error(np.ravel(y_train), oof_predictions)}')
    print(f'Hold-out MAE: {mean_absolute_error(np.ravel(y_test), test_predictions)}')

    # NOTE(review): only the model of the last fold is returned; it was fitted
    # on a subset of the training rows.
    return model

# Fit on the full training frame; the returned model is reused by the prediction cells
trained_model = train_model(X=X, y=y)

Fold 1
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l1: 9.2026
[200]	valid_0's l1: 9.1708
[300]	valid_0's l1: 9.16244
[400]	valid_0's l1: 9.19552
Early stopping, best iteration is:
[234]	valid_0's l1: 9.14022
Fold 1 MAE: 9.1402242675001
Fold 2
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l1: 8.74533
[200]	valid_0's l1: 8.7889
Early stopping, best iteration is:
[91]	valid_0's l1: 8.731
Fold 2 MAE: 8.730996250836057
Fold 3
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l1: 8.5631
[200]	valid_0's l1: 8.62333
Early stopping, best iteration is:
[86]	valid_0's l1: 8.52901
Fold 3 MAE: 8.529014091568417
Fold 4
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l1: 8.11491
[200]	valid_0's l1: 8.17691
[300]	valid_0's l1: 8.19783
Early stopping, best iteration is:
[101]	valid_0's l1: 8.11265
Fold 4 MAE: 8.112647635445336
Fold 5
Training until validation scores don't improve 

In [34]:
# Bare last expression: shows the repr of the model returned by train_model
trained_model

In [38]:
def predict_calories(trained_model: Any, dataset_path: str, output_file: str, gender_input = True) -> pd.DataFrame:
    """Predict calories for an evaluation CSV and write them to ``output_file``.

    The evaluation file is loaded with ``load_data`` and pre-processed the same
    way as in training: min-max scaling, then dropping ``Height``,
    ``Body_Temp`` and ``Heart_Rate`` when present. When the file has no gender
    column, a ``Gender`` column filled with ``0`` (the code ``load_data`` uses
    for ``'male'``) is inserted as the first column.

    Args:
        trained_model: Fitted estimator exposing ``predict``.
        dataset_path: Path of the evaluation CSV.
        output_file: Path of the CSV to write; it gets a ``Calories`` header
            followed by one prediction per line.
        gender_input: False when the evaluation file lacks a ``Gender`` column.

    Returns:
        A one-column ``Calories`` frame holding the written predictions.
    """
    # Load the evaluation dataset (targets are placeholders and ignored)
    X_eval, _, _, _, _ = load_data(dataset_path, is_eval_dataset=True, is_handball_dataset=not gender_input)

    # NOTE(review): the scaling below uses the min/max of the evaluation file,
    # not the values seen during training - confirm this is intended.
    X_eval = normalize_data(X_eval)
    X_eval = X_eval.drop(columns=['Height', 'Body_Temp', 'Heart_Rate'], errors='ignore')

    if not gender_input:
        X_eval['Gender'] = 0
        # Move Gender column to beginning to match training set-up
        X_eval = X_eval[['Gender'] + [col for col in X_eval.columns if col != 'Gender']]

    # Make predictions
    y_pred = trained_model.predict(X_eval)

    # Save the predictions as a single-column CSV, written without pandas
    with open(output_file, "w") as file:
        file.write("Calories\n")
        for pred in y_pred:
            file.write(str(pred) + "\n")

    # Honour the declared return type so callers can inspect the predictions
    return pd.DataFrame({"Calories": list(y_pred)})
    
    
# Task 1: score the evaluation file with the in-memory model and write output_1.csv
predict_calories(trained_model, dataset_path="task1_dataset_eval.csv", output_file="output_1.csv")

In [40]:
# Task 2: the evaluation file has no gender column, so gender_input is switched off
predict_calories(
    trained_model,
    dataset_path="task2_dataset_eval.csv",
    output_file="output_2.csv",
    gender_input=False,
)