In [157]:
# https://platform.olimpiada-ai.ro/en/problems/74

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [158]:
# Directory holding the competition CSVs. Defaults to the Kaggle mount point;
# set the CALORIC_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get("CALORIC_DATA_DIR", "/kaggle/input/caloric-consumption")

train = pd.read_csv(os.path.join(DATA_DIR, "train_data.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test_data.csv"))
# Submission template; its 'answer' column is filled in by later cells.
subm = pd.read_csv(os.path.join(DATA_DIR, "sample_output.csv"))

# train.drop_duplicates(inplace=True)

# Displayed as the cell output: (rows, columns) of each frame.
train.shape, test.shape, subm.shape

((9000, 9), (2500, 9), (2504, 3))

In [159]:
# Four scalar statistics computed on the training frame.
n_train_rows = len(train)
n_male = (train['Gender'] == 'male').sum().item()
mean_duration = train['Duration'].mean().item()
n_age_75_plus = (train['Age'] >= 75).sum().item()

# .loc[:3] is label-based and inclusive, so this writes index labels 0..3.
# NOTE(review): presumably those rows are subtasks 1-4 in this order —
# verify against the submission template / problem statement.
subm.loc[:3, 'answer'] = [n_train_rows, n_male, mean_duration, n_age_75_plus]

In [160]:
# Print column names, dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     9000 non-null   int64  
 1   Age         9000 non-null   int64  
 2   Height      9000 non-null   int64  
 3   Weight      9000 non-null   int64  
 4   Duration    9000 non-null   int64  
 5   Heart_Rate  9000 non-null   int64  
 6   Body_Temp   9000 non-null   float64
 7   Gender      9000 non-null   object 
 8   Calories    9000 non-null   int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 632.9+ KB


In [161]:
# Print column names, dtypes and non-null counts of the test frame.
# In the recorded output, Gender is the only column with missing values
# (2000 non-null out of 2500 rows).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subtask     2500 non-null   int64  
 1   User_ID     2500 non-null   int64  
 2   Age         2500 non-null   int64  
 3   Height      2500 non-null   int64  
 4   Weight      2500 non-null   int64  
 5   Duration    2500 non-null   int64  
 6   Heart_Rate  2500 non-null   int64  
 7   Body_Temp   2500 non-null   float64
 8   Gender      2000 non-null   object 
dtypes: float64(1), int64(7), object(1)
memory usage: 175.9+ KB


In [162]:
# Summary statistics of the target, split by gender.
calories_by_gender = train.groupby('Gender')['Calories']
calories_by_gender.agg(['mean', 'median', 'min', 'max'])

Unnamed: 0_level_0,mean,median,min,max
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,88.196621,80.0,2,240
male,90.699077,77.0,1,314


In [163]:
# Number of rows in the submission template for each subtask id.
subm['subtaskID'].value_counts()

subtaskID
5    2000
6     500
2       1
1       1
4       1
3       1
Name: count, dtype: int64

In [164]:
def add_gender_indicators(df):
    """Return a copy of ``df`` with ``Gender`` replaced by three 0/1 columns.

    Adds ``Gender_male``, ``Gender_female`` and ``Gender_other`` (in that
    order, appended after the existing columns) and drops ``Gender``.
    ``Gender_other`` is 1 for every row whose ``Gender`` is neither
    ``'male'`` nor ``'female'``, which includes missing values.

    If ``df`` has no ``Gender`` column it is returned unchanged, so
    re-running this cell does not raise a ``KeyError``.
    """
    if 'Gender' not in df.columns:
        return df

    gender = df['Gender']
    encoded = df.assign(
        Gender_male=(gender == 'male').astype(int),
        Gender_female=(gender == 'female').astype(int),
        Gender_other=(~gender.isin(['male', 'female'])).astype(int),
    )
    return encoded.drop(columns=['Gender'])


# Apply the same encoding to both frames so their feature columns match.
train = add_gender_indicators(train)
test = add_gender_indicators(test)

In [165]:
from sklearn.model_selection import train_test_split

# Model inputs: the six numeric measurements plus the three gender indicators.
target_col = 'Calories'
features = [
    'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
    "Gender_male", "Gender_female", "Gender_other",
]

X = train[features]
y = train[target_col]
X_test = test[features]

# Hold out 20% of the training rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_valid.shape

((7200, 9), (1800, 9))

In [170]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator: gradient boosting trained with an absolute-error loss,
# the same quantity the grid search scores on.
model = GradientBoostingRegressor(loss='absolute_error', random_state=42)

# 7 tree depths x 2 ensemble sizes = 14 candidates.
param_grid = dict(
    max_depth=list(range(3, 10)),
    n_estimators=[100, 500],
)

# 5-fold cross-validation per candidate; refit=True retrains the best
# candidate on all of X_train so `search` can be used directly to predict.
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    verbose=2,
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] END ......................max_depth=3, n_estimators=100; total time=   0.9s
[CV] END ......................max_depth=3, n_estimators=100; total time=   0.9s
[CV] END ......................max_depth=3, n_estimators=100; total time=   0.9s
[CV] END ......................max_depth=3, n_estimators=100; total time=   0.9s
[CV] END ......................max_depth=3, n_estimators=100; total time=   0.9s
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.3s
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.3s
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.2s
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.3s
[CV] END ......................max_depth=3, n_estimators=500; total time=   4.2s
[CV] END ......................max_depth=4, n_estimators=100; total time=   1.2s
[CV] END ......................max_depth=4, n_es

In [171]:
# Parameter combination that achieved the best mean cross-validated score.
search.best_params_

{'max_depth': 6, 'n_estimators': 500}

In [173]:
from sklearn.metrics import mean_absolute_error

# Score the refitted best estimator on the held-out validation rows.
preds = search.predict(X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)

print(f'MAE: {mae:.5f}')

MAE: 1.51685


In [174]:
# Predict calories for every test row with the refitted best estimator.
test_preds = search.predict(X_test)

# Rows from index label 4 onward receive the predictions positionally.
# NOTE(review): this relies on `test` and `subm` rows being in the same order — confirm.
subm.loc[4:, 'answer'] = test_preds

subm.to_csv("submission.csv", index=False)