In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

In [46]:
# Load the labelled training set from the working directory and preview the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [47]:
# (rows, columns) of the training frame.
train.shape

(750000, 9)

In [48]:
# Count missing values per column in the training frame.
train.isnull().sum()

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [49]:
# Number of training rows that exactly repeat an earlier row.
train.duplicated().sum()

0

In [50]:
# Load the unlabelled test set from the working directory and preview the first rows.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [51]:
# (rows, columns) of the test frame.
test.shape

(250000, 8)

In [52]:
# Count missing values per column in the test frame.
test.isnull().sum()

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64

In [53]:
# Number of test rows that exactly repeat an earlier row.
test.duplicated().sum()

0

In [54]:
# Stack the training rows on top of the test rows so the feature engineering
# in the following cells is applied to both sets in a single pass.
combined = pd.concat(objs=[train, test], axis='index')
combined.shape

(1000000, 9)

In [55]:
# Interaction feature: Duration multiplied element-wise by Heart_Rate.
combined["Duration_HeartRate"] = combined["Duration"].mul(combined["Heart_Rate"])

In [56]:
# Interaction feature: Duration multiplied element-wise by Body_Temp.
combined["Duration_BodyTemp"] = combined["Duration"].mul(combined["Body_Temp"])

In [57]:
# Encode Sex numerically: 'male' -> 0.0, 'female' -> 1.0.
# The labels are first swapped for digit strings and then cast to float, so any
# label that is not in the mapping stays a non-numeric string and makes the
# cast fail instead of slipping through unnoticed.
sex_mapping = {
    'male': '0',
    'female': '1',
}
sex_as_digit_strings = combined['Sex'].replace(sex_mapping)
combined['Sex'] = sex_as_digit_strings.astype(float)

In [58]:
# Remove the 'id' column from the combined frame before it is split into model inputs.
combined = combined.drop(columns='id')

In [59]:
# Undo the earlier concat: `train` was stacked first, so the first len(train)
# rows of `combined` are the training rows and everything after them is the
# test set. The boundary is taken from `train` itself rather than a hard-coded
# row count, so the split stays correct if the input file changes size.
n_train_rows = len(train)
newtrain = combined.iloc[:n_train_rows, :]
newtest = combined.iloc[n_train_rows:, :]

In [60]:
# Remove the 'Calories' target column from the test rows; it is not a model input.
newtest = newtest.drop(columns='Calories')

In [61]:
# Preview the engineered training frame (encoded Sex plus the two interaction columns).
newtrain.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,Duration_HeartRate,Duration_BodyTemp
0,0.0,36,189.0,82.0,26.0,101.0,41.0,150.0,2626.0,1066.0
1,1.0,64,163.0,60.0,8.0,85.0,39.7,34.0,680.0,317.6
2,1.0,51,161.0,64.0,7.0,84.0,39.8,29.0,588.0,278.6
3,0.0,20,192.0,90.0,25.0,105.0,40.7,140.0,2625.0,1017.5
4,1.0,38,166.0,61.0,25.0,102.0,40.6,146.0,2550.0,1015.0


In [62]:
# Preview the engineered test frame; it has the same feature columns as newtrain, minus 'Calories'.
newtest.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Duration_HeartRate,Duration_BodyTemp
0,0.0,45,177.0,81.0,7.0,87.0,39.8,609.0,278.6
1,0.0,26,200.0,97.0,20.0,101.0,40.5,2020.0,810.0
2,1.0,29,188.0,85.0,16.0,102.0,40.4,1632.0,646.4
3,1.0,39,172.0,73.0,20.0,107.0,40.6,2140.0,812.0
4,1.0,30,173.0,67.0,16.0,94.0,40.5,1504.0,648.0


In [63]:
# Separate the regression target (y) from the predictor columns (x).
y = newtrain['Calories']
x = newtrain.drop(columns=['Calories'])

In [64]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [65]:
# Validation run: fit on the training fold, predict the held-out fold, and
# score with root mean squared log error (RMSLE).
xgb_model = XGBRegressor(
    random_state=21,
    tree_method='hist',
    device='cuda',
    n_jobs=-1,
)
xgb_model.fit(x_train, y_train)
raw_pred = xgb_model.predict(x_test)
# The log-based metric rejects negative values, so clip predictions at zero.
y_pred = np.maximum(raw_pred, 0)
np.sqrt(mean_squared_log_error(y_test, y_pred))

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




0.06403566109396933

In [87]:
# Inputs for the final fit: every labelled row is used for training and the
# real test set is what gets predicted.
# NOTE: these assignments overwrite the x_train / y_train / x_test names that
# previously held the validation split.
x_test = newtest
y_train = newtrain['Calories']
x_train = newtrain.drop(columns=['Calories'])

In [89]:
# Final run: same model configuration as the validation run, fitted on the
# current x_train / y_train and used to predict x_test.
xgb_model = XGBRegressor(
    random_state=21,
    tree_method='hist',
    device='cuda',
    n_jobs=-1,
)
xgb_model.fit(x_train, y_train)
raw_pred = xgb_model.predict(x_test)
# Clip negative predictions to zero.
y_pred = np.maximum(raw_pred, 0)

In [91]:
# Build the submission frame: one row per test id with its predicted Calories.
# y_pred was already clipped at zero with np.maximum when it was computed, so
# it is used as-is; wrapping it in np.abs would be a no-op on non-negative
# values (and would flip the sign of a negative value rather than clip it).
solution = pd.DataFrame({'id': test['id'], 'Calories': y_pred})
solution.head()

Unnamed: 0,id,Calories
0,750000,27.847992
1,750001,109.909149
2,750002,87.875198
3,750003,124.73394
4,750004,76.477043


In [93]:
# Write the submission file to the working directory; index=False leaves the
# DataFrame index out of the CSV.
solution.to_csv('Solution.csv', index = False)