In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as XGB
import sklearn as sk
import matplotlib.pyplot as plt
import random
import warnings
import itertools

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e5/sample_submission.csv
/kaggle/input/playground-series-s5e5/train.csv
/kaggle/input/playground-series-s5e5/test.csv
/kaggle/input/orginal-dataset/calories.csv


In [2]:
def config(tf_log_level='3', ignore_warnings=True, pandas_copy_on_write=True):
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = tf_log_level
    if ignore_warnings:
        warnings.simplefilter('ignore')
    pd.options.mode.copy_on_write = pandas_copy_on_write

config()

In [3]:
tr_data = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
tr_data.head(-10)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...,...
749985,749985,female,50,162.0,61.0,23.0,100.0,40.8,132.0
749986,749986,male,56,192.0,93.0,17.0,101.0,39.9,112.0
749987,749987,male,56,195.0,94.0,14.0,91.0,40.2,74.0
749988,749988,male,33,175.0,78.0,11.0,90.0,39.6,44.0


In [4]:
c_data = pd.read_csv('/kaggle/input/orginal-dataset/calories.csv')
c_data.head(-10)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0
...,...,...,...,...,...,...,...,...,...
14985,11040953,female,46,168.0,66.0,4.0,78.0,39.1,13.0
14986,12610726,male,22,178.0,75.0,24.0,91.0,40.5,78.0
14987,18746025,female,42,181.0,77.0,23.0,102.0,40.7,131.0
14988,14361981,female,26,160.0,61.0,22.0,110.0,40.7,143.0


In [5]:
ts_data = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')
ts_data.head(-10)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5
...,...,...,...,...,...,...,...,...
249985,999985,male,48,178.0,69.0,28.0,99.0,41.0
249986,999986,female,26,158.0,56.0,28.0,110.0,40.9
249987,999987,male,48,191.0,94.0,5.0,87.0,39.0
249988,999988,female,42,154.0,52.0,22.0,97.0,40.9


In [6]:
def feature_engineering(df):
    df.drop(columns=['id'], inplace=True)

    df['Sex'] = df['Sex'].map({'female': 1, 'male': 2})
    df['AgeSex'] = df['Age'].astype(str) + df['Sex'].astype(str)
    df['AgeSex'] = sk.preprocessing.LabelEncoder().fit_transform(df['AgeSex']) + 1
    for col in ['Sex', 'Age', 'AgeSex']:
        df['CAT_' + col] = df[col].astype('category')
        
    features = ['Weight', 'Height', 'Body_Temp', 'Heart_Rate', 'Duration', 'Age', 'Sex', 'AgeSex']

    for comb in itertools.combinations(features, 2):
        df[" * ".join(comb)] = df[comb[0]] * df[comb[1]]
        df[" / ".join(comb)] = df[comb[0]] / df[comb[1]]
        df[" ** ".join(comb)] = df[comb[0]] * (df[comb[1]] ** 2)
        df[" *** ".join(comb)] = df[comb[1]] * (df[comb[0]] ** 2)
        
    
    return df

In [7]:
tr_data = feature_engineering(tr_data)
c_data = feature_engineering(c_data)
ts_data = feature_engineering(ts_data)

In [8]:
tr_data.shape

(750000, 124)

In [9]:
ts_data.shape

(250000, 123)

In [10]:
c_data.shape

(15000, 124)

In [11]:
tr_data.head(-10)

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,AgeSex,CAT_Sex,...,Age ** Sex,Age *** Sex,Age * AgeSex,Age / AgeSex,Age ** AgeSex,Age *** AgeSex,Sex * AgeSex,Sex / AgeSex,Sex ** AgeSex,Sex *** AgeSex
0,2,36,189.0,82.0,26.0,101.0,41.0,150.0,34,2,...,144,2592,1224,1.058824,41616,44064,68,0.058824,2312,136
1,1,64,163.0,60.0,8.0,85.0,39.7,34.0,89,1,...,64,4096,5696,0.719101,506944,364544,89,0.011236,7921,89
2,1,51,161.0,64.0,7.0,84.0,39.8,29.0,63,1,...,51,2601,3213,0.809524,202419,163863,63,0.015873,3969,63
3,2,20,192.0,90.0,25.0,105.0,40.7,140.0,2,2,...,80,800,40,10.000000,80,800,4,1.000000,8,8
4,1,38,166.0,61.0,25.0,102.0,40.6,146.0,37,1,...,38,1444,1406,1.027027,52022,53428,37,0.027027,1369,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749985,1,50,162.0,61.0,23.0,100.0,40.8,132.0,61,1,...,50,2500,3050,0.819672,186050,152500,61,0.016393,3721,61
749986,2,56,192.0,93.0,17.0,101.0,39.9,112.0,74,2,...,224,6272,4144,0.756757,306656,232064,148,0.027027,10952,296
749987,2,56,195.0,94.0,14.0,91.0,40.2,74.0,74,2,...,224,6272,4144,0.756757,306656,232064,148,0.027027,10952,296
749988,2,33,175.0,78.0,11.0,90.0,39.6,44.0,28,2,...,132,2178,924,1.178571,25872,30492,56,0.071429,1568,112


In [12]:
seed = 42
FOLD = 8
cv = sk.model_selection .KFold(FOLD, random_state=seed, shuffle=True)
pred_test = np.zeros((250000,))
print(cv)

KFold(n_splits=8, random_state=42, shuffle=True)


In [13]:
for idx_train, idx_valid in cv.split(tr_data):
    print("\n")

    X_train = tr_data.iloc[idx_train]
    X_train = pd.concat([X_train, c_data], axis=0, ignore_index=True).sample(frac=1, random_state=seed)
    X_valid = tr_data.iloc[idx_valid]

    y_train = np.log1p(X_train.pop('Calories'))
    y_valid = np.log1p(X_valid.pop('Calories'))

    dtrain = XGB.DMatrix(X_train, label=y_train, enable_categorical=True)
    dval = XGB.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    dtest = XGB.DMatrix(ts_data, enable_categorical=True)

    params = {
        'eval_metric': 'rmse',
        'seed': seed,
        'max_depth': 10,
        'learning_rate': 0.01,
        'reg_alpha': 2,
        'reg_lambda': 1,
        'max_delta_step': 2,
        'subsample': 0.9,
        'colsample_bytree': 0.55,
        'min_child_weight': 5,
        'enable_categorical': True,
        'device': "cuda"
    }
    
    model = XGB.train(
        params, 
        dtrain, 
        num_boost_round=1000000, 
        evals=[(dtrain, 'train'), (dval, 'validation')], 
        early_stopping_rounds=50, 
        verbose_eval=2000
    )

    predictions = model.predict(dval)
    pred_test += model.predict(dtest)



[0]	train-rmse:0.95414	validation-rmse:0.95331
[1080]	train-rmse:0.05360	validation-rmse:0.05939


[0]	train-rmse:0.95374	validation-rmse:0.95615
[1017]	train-rmse:0.05376	validation-rmse:0.05973


[0]	train-rmse:0.95378	validation-rmse:0.95585
[1182]	train-rmse:0.05342	validation-rmse:0.05980


[0]	train-rmse:0.95405	validation-rmse:0.95395
[888]	train-rmse:0.05415	validation-rmse:0.05862


[0]	train-rmse:0.95395	validation-rmse:0.95463
[1191]	train-rmse:0.05341	validation-rmse:0.05988


[0]	train-rmse:0.95433	validation-rmse:0.95197
[1143]	train-rmse:0.05358	validation-rmse:0.05882


[0]	train-rmse:0.95450	validation-rmse:0.95074
[1062]	train-rmse:0.05368	validation-rmse:0.05994


[0]	train-rmse:0.95370	validation-rmse:0.95646
[1197]	train-rmse:0.05340	validation-rmse:0.05881


In [14]:
data_subm = pd.read_csv("/kaggle/input/playground-series-s5e5/sample_submission.csv")
data_subm['Calories'] = np.expm1(pred_test)
data_subm.to_csv('submission.csv', index=False)