### Downloads and imports

In [None]:
!pip install xgboost --upgrade

In [1]:
# import pandas library
import pandas as pd

# import xgboos library
import xgboost as xgb

# import numpy packages
import numpy as np
from numpy import mean, std, nan

# import sklearn packages
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.preprocessing import MinMaxScaler

# import xgb optimization packages 
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# import functools to wrap the objective function
from functools import partial

In [3]:
print(xgb.__version__)

1.6.1


In [None]:
# Download datasets
!wget https://www.dropbox.com/s/tdm9uw248tuni9s/train.csv?dl=0 -O train.csv
!wget https://www.dropbox.com/s/r26axzubyuexlsh/test.csv?dl=0 -O test.csv

### Load and prepare data

In [2]:
def load_and_print_dataset(path):
  """
    Loads the dataset into a dataframe from the given path and print the head of it.  
    
    Args:
      path (str): local path to the required dataset

    Return:
      df (pd.Dataframe): dataframe of the dataset
  """
  if path is not None:
    df = pd.read_csv(path)
    print(f"The shape of the train_dataframe: {df.shape}")
    print()
    pd.set_option('display.max_columns', 36)
    pd.set_option('display.width', 180)
    print(df.head())
    return df

In [3]:
train_path = "train.csv"
train_df   = load_and_print_dataset(train_path)

The shape of the train_dataframe: (2000000, 36)

   birth_year  birth_month  birth_time  birth_place  mother_age  marital_status  mother_education  father_age  father_education  interval_llb  cigarettes  mother_height  \
0        2017            5      1705.0          1.0          20             NaN               5.0        20.0               4.0         888.0         0.0           64.0   
1        2017           10       442.0          1.0          38             1.0               5.0        61.0               7.0         107.0         0.0           61.0   
2        2019            7      1453.0          1.0          29             1.0               5.0        29.0               3.0         888.0         0.0           63.0   
3        2016           10       859.0          1.0          28             1.0               4.0        29.0               3.0         888.0         0.0           67.0   
4        2019           10       817.0          1.0          24             2.0            

In [4]:
test_path = "test.csv"
test_df   = load_and_print_dataset(test_path)

  


The shape of the train_dataframe: (2000000, 36)

   birth_year  birth_month  birth_time  birth_place  mother_age  marital_status  mother_education  father_age  father_education  interval_llb  cigarettes  mother_height  \
0        2020            1      2354.0          1.0          22             2.0               5.0        27.0               NaN          18.0         0.0           56.0   
1        2020            1       347.0          1.0          19             2.0               3.0        20.0               3.0          11.0         0.0           60.0   
2        2020            1      1444.0          1.0          29             1.0               6.0        41.0               4.0          36.0         0.0           65.0   
3        2020            1      1118.0          1.0          32             1.0               6.0        44.0               4.0         888.0         0.0           63.0   
4        2020            1       654.0          1.0          25             1.0            

In [5]:
print(train_df['apgar5'].unique())

[ 9.  8.  7.  4. 10.  6. nan  5.  2.  0.  1.  3.]


In [6]:
print(f"Number of rows with missing values in train set: {train_df.shape[0] - train_df.dropna().shape[0]}. \
This is {(train_df.shape[0] - train_df.dropna().shape[0]) / train_df.shape[0] * 100:.2f} % of the data")

print(f"Number of rows with missing values in test set: {test_df.shape[0] - test_df.dropna().shape[0]}. \
This is {(test_df.shape[0] - test_df.dropna().shape[0]) / test_df.shape[0] * 100:.2f} % of the data")

Number of rows with missing values in train set: 555199. This is 27.76 % of the data
Number of rows with missing values in test set: 543120. This is 27.16 % of the data


In [7]:
cat_cols = train_df.select_dtypes(include=["object"]).columns

print("Amount of Nan for each categorical feature:")
print(train_df[cat_cols].isna().sum())
print()
print("Number of unique values in categorical features: ")
print(train_df[cat_cols].nunique())

Amount of Nan for each categorical feature:
pre_preg_diabetes        1903
gest_diabetes            1903
pre_preg_hypertension    1903
gest_hypertension        1903
prev_preterm_birth       1903
infertility_treatment    1903
gonorrhea                5160
syphilis                 5160
chlamydia                5160
hepatitis_b              5160
hepatitis_c              5160
labor_induction          1450
labor_augmentation       1133
steroids                 1133
antibiotics              1133
chorioamnionitis         1133
anesthesia               1133
dtype: int64

Number of unique values in categorical features: 
pre_preg_diabetes        2
gest_diabetes            2
pre_preg_hypertension    2
gest_hypertension        2
prev_preterm_birth       2
infertility_treatment    2
gonorrhea                2
syphilis                 2
chlamydia                2
hepatitis_b              2
hepatitis_c              2
labor_induction          2
labor_augmentation       2
steroids                 2
anti

Can be seen that all the object columns have only 2 values (True or False) hence, in order to use XGBOOST we need to transform this into numbers and keep the nan values. For this, I transformed all the objects to float and validated that all nan values did not remove.  

In [8]:
train_df[cat_cols]  = train_df[cat_cols].astype("float")
test_df[cat_cols]   = test_df[cat_cols].astype("float")

print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 36 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   birth_year             int64  
 1   birth_month            int64  
 2   birth_time             float64
 3   birth_place            float64
 4   mother_age             int64  
 5   marital_status         float64
 6   mother_education       float64
 7   father_age             float64
 8   father_education       float64
 9   interval_llb           float64
 10  cigarettes             float64
 11  mother_height          float64
 12  mother_bmi             float64
 13  pre_preg_weight        float64
 14  delivery_weight        float64
 15  pre_preg_diabetes      float64
 16  gest_diabetes          float64
 17  pre_preg_hypertension  float64
 18  gest_hypertension      float64
 19  prev_preterm_birth     float64
 20  infertility_treatment  float64
 21  prev_cesarian          int64  
 22  gonorrhea         

In [9]:
print("Amount of Nan for each categorical feature:")
print(train_df[cat_cols].isna().sum())

print("Amount of Nan for each categorical feature in test set:")
print(test_df[cat_cols].isna().sum())

Amount of Nan for each categorical feature:
pre_preg_diabetes        1903
gest_diabetes            1903
pre_preg_hypertension    1903
gest_hypertension        1903
prev_preterm_birth       1903
infertility_treatment    1903
gonorrhea                5160
syphilis                 5160
chlamydia                5160
hepatitis_b              5160
hepatitis_c              5160
labor_induction          1450
labor_augmentation       1133
steroids                 1133
antibiotics              1133
chorioamnionitis         1133
anesthesia               1133
dtype: int64
Amount of Nan for each categorical feature in test set:
pre_preg_diabetes        2115
gest_diabetes            2115
pre_preg_hypertension    2115
gest_hypertension        2115
prev_preterm_birth       2115
infertility_treatment    2115
gonorrhea                5689
syphilis                 5689
chlamydia                5689
hepatitis_b              5689
hepatitis_c              5689
labor_induction          1152
labor_augmentatio

In addition, on the EDA phase I saw that multiple integer columns should also be categorical, for example: year, month etc. Therefore I added to the categorical features list more columns (can be seen below).

In [10]:
print(train_df.iloc[:, ~train_df.columns.isin(cat_cols)].nunique())
print(train_df.iloc[:, ~train_df.columns.isin(cat_cols)].head())

birth_year             5
birth_month           12
birth_time          1440
birth_place            7
mother_age            39
marital_status         2
mother_education       8
father_age            76
father_education       8
interval_llb         299
cigarettes            61
mother_height         45
mother_bmi           560
pre_preg_weight      301
delivery_weight      301
prev_cesarian          3
apgar5                11
plurality              5
Female                 2
dtype: int64
   birth_year  birth_month  birth_time  birth_place  mother_age  marital_status  mother_education  father_age  father_education  interval_llb  cigarettes  mother_height  \
0        2017            5      1705.0          1.0          20             NaN               5.0        20.0               4.0         888.0         0.0           64.0   
1        2017           10       442.0          1.0          38             1.0               5.0        61.0               7.0         107.0         0.0           61.0

In [11]:
int_as_cat = ["birth_year", "birth_month", "marital_status", "mother_education", "father_education", "Female", "plurality", "apgar5", "prev_cesarian", "birth_place"]
cat_cols   = list(cat_cols) + int_as_cat
print(train_df.iloc[:, ~train_df.columns.isin(cat_cols)].head())

   birth_time  mother_age  father_age  interval_llb  cigarettes  mother_height  mother_bmi  pre_preg_weight  delivery_weight
0      1705.0          20        20.0         888.0         0.0           64.0        33.0            192.0            203.0
1       442.0          38        61.0         107.0         0.0           61.0        20.8            110.0            133.0
2      1453.0          29        29.0         888.0         0.0           63.0        23.9            135.0            162.0
3       859.0          28        29.0         888.0         0.0           67.0        29.3            187.0            222.0
4       817.0          24         NaN          36.0         0.0           68.0        38.0            250.0            251.0


### Auxiliary functions


In [12]:
def get_data_and_labels(dataframe):
  """
  Split target column from the rest of the data
  
  Args:
    dataframe (pd.Dataframe): data
  
  Returns:
    data (pd.Dataframe): data columns
    lables (np.array): labels
  """

  data    = dataframe.copy()
  labels  = data.pop("apgar5")
  assert ("apgar5" in data) == False
  return data, labels

In [13]:
def cross_validation_training(train_data, model, imputer):
  """
  Repeated stratified kfold cross validation

  Args:
    train_data (pd.Dataframe):
    model (xgboost):
    imputer (object):
  
  Returns:
    mean_rmse (float)
  """
  # define modeling pipeline
  if imputer is None:
    pipeline = Pipeline(steps = [('m', model)])
  else:
    pipeline = Pipeline(steps = [('i', imputer), ('m', model)])
  
  # split labels from train_data
  X, y = get_data_and_labels(train_data)

  # define model evaluation
  cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)

  # evaluate model
  scores = -1 * cross_val_score(pipeline, X, y, 
                                scoring = "neg_root_mean_squared_error", 
                                cv = cv, 
                                error_score = 'raise'
                                )
  
  print('Mean RMSE: %.3f (%.3f)' % (mean(scores), std(scores)))
  
  mean_rmse = mean(scores)
  return mean_rmse

In [14]:
def full_train_and_test(train_data, test_data, model, imputer):
  """
  Train XGBoost model on the whole dataset, and evaluate on the test set.

  Args:
    train_data (dataframe)
    test_data  (dataframe)
    model      (object)
    imputer    (object)

  Returns:
    tuple: (rmse_train, rmse_test)
  """
  
  # define modeling pipeline
  if imputer is None:
    pipeline = Pipeline(steps = [('m', model)], verbose=True)
  else:
    pipeline = Pipeline(steps = [('i', imputer), ('m', model)])
  
  # split labels from train_data and test data
  X_train, y_train = get_data_and_labels(train_data)
  X_test, y_test   = get_data_and_labels(test_data)

  # train model on training data
  pipeline.fit(X_train, y_train)

  # evaluate model on train and test data
  y_train_pred = pipeline.predict(X_train)
  y_test_pred  = pipeline.predict(X_test)

  # calculate the rmse of train and test sets
  rmse_train   = mean_squared_error(y_train, y_train_pred, squared = False)
  rmse_test    = mean_squared_error(y_test, y_test_pred, squared = False)

  print('RMSE train: %.3f, RMSE test: %.3f' % (rmse_train, rmse_test))
  return (rmse_train, rmse_test)

In [15]:
def objective(space, data, imputer):
  """
  Objective function we want to minimize. In this case we want to minimize the RMSE.

  Args:
    space   (dict): dictionary of hyperparameter values
    data    (pd.Dataframe): training data 
    imputer (object) 
  """

  clf = xgb.XGBRegressor(
                  n_estimators      = space['n_estimators'], 
                  max_depth         = int(space['max_depth']), 
                  gamma             = space['gamma'],
                  reg_alpha         = int(space['reg_alpha']),
                  min_child_weight  = int(space['min_child_weight']),
                  colsample_bytree  = int(space['colsample_bytree']),
                  tree_method       = "gpu_hist", 
                  )
  
  rmse = cross_validation_training(data, clf, imputer)
  
  return {'loss': rmse, 'status': STATUS_OK }

In [16]:
# dictionary of XGBoost hyperparameters that should be tuned  
space = {'max_depth':         hp.quniform("max_depth", 3, 18, 1),
        'gamma':              hp.uniform ('gamma', 1, 9),
        'reg_alpha' :         hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda' :        hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree' :  hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight' :  hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators':       180,
        'seed':               0
    }

In [17]:
def run_optimization(train_data, max_iter=10, imputer=None):
  """
  Use bayesian optimization with hyperopt to tune the hyperparameters of the XGBoost model.

  Args:
    train_data (dataframe)
    max_iter   (int): number of iteration to run with different values
    imputer    (object)

  Returns:
    best_hyperparams (dict): the hyperparameters which got the best results.
  """
  
  trials            = Trials()

  # wrap the objective function to add more input parameters
  fmin_objective    = partial(objective, 
                           data = train_data, 
                           imputer = imputer
                           )
  
  # run optimization
  best_hyperparams  = fmin(fn = fmin_objective,
                          space = space,
                          algo = tpe.suggest,
                          max_evals = max_iter,
                          trials = trials
                           )
  
  # print different values used
  print("hyperparameter values tested:")
  for trial in trials:
    print(trial['misc']['vals'])

  return best_hyperparams

In [18]:
# dictionary to save the results
results = {
    "strategy": [],
    "rmse_train": [],
    "rmse_test": []
}

# Imputation Methods

### 1. No imputation

In [19]:
train_data = train_df.copy()
test_data  = test_df.copy()

train_rows = train_df.shape[0]
test_rows  = test_df.shape[0]

print("Train has %d rows, and Test has %d rows" % (train_rows, test_rows))

# remove rows with missing label
train_data  = train_data.dropna(subset=["apgar5"])
test_data   = test_data.dropna(subset=["apgar5"])

new_train_rows  = train_data.shape[0]
new_test_rows   = test_data.shape[0]

print("Train has %d rows, and Test has %d rows after removing nan observations" % (new_train_rows, new_test_rows))

train_percent = (1 - new_train_rows / train_rows) * 100
test_percent  = (1 - new_test_rows / test_rows) * 100

print("Train percent removed: %.2f, Test percent removed:  %.2f" % (train_percent, test_percent))

Train has 2000000 rows, and Test has 2000000 rows
Train has 1992138 rows, and Test has 1991820 rows after removing nan observations
Train percent removed: 0.39, Test percent removed:  0.41


In [20]:
best_hyperparams = run_optimization(train_data, max_iter = 5)
best_hyperparams["max_depth"] = int(best_hyperparams["max_depth"])

Mean RMSE: 0.806 (0.000)
Mean RMSE: 0.806 (0.000)
Mean RMSE: 0.806 (0.000)
Mean RMSE: 0.806 (0.000)
Mean RMSE: 0.806 (0.000)
100%|██████████| 5/5 [04:46<00:00, 57.28s/it, best loss: 0.8061049912895687]
hyperparameter values tested:
{'colsample_bytree': [0.8819374383473955], 'gamma': [8.798774251417795], 'max_depth': [9.0], 'min_child_weight': [5.0], 'reg_alpha': [106.0], 'reg_lambda': [0.9337186873233942]}
{'colsample_bytree': [0.984006653266161], 'gamma': [7.2714027171303535], 'max_depth': [15.0], 'min_child_weight': [8.0], 'reg_alpha': [146.0], 'reg_lambda': [0.060644806111325056]}
{'colsample_bytree': [0.7942081724022483], 'gamma': [6.483893876534975], 'max_depth': [13.0], 'min_child_weight': [6.0], 'reg_alpha': [171.0], 'reg_lambda': [0.8905775954447839]}
{'colsample_bytree': [0.9626004287890905], 'gamma': [5.54363237282062], 'max_depth': [15.0], 'min_child_weight': [9.0], 'reg_alpha': [99.0], 'reg_lambda': [0.49461851684470604]}
{'colsample_bytree': [0.8699091284566327], 'gamma': 

In [21]:
clf   = xgb.sklearn.XGBRegressor(
    tree_method = "gpu_hist", 
    **best_hyperparams
 )

train_score, test_score = full_train_and_test(train_data, test_data, clf, None)

results["rmse_train"].append(train_score)
results["rmse_test"].append(test_score)
results["strategy"].append("No imputation")

[Pipeline] ................. (step 1 of 1) Processing m, total=   8.4s
RMSE train: 0.798, RMSE test: 0.799


### 2. Drop rows with missing train_data

In [22]:
train_data  = train_df.copy()
test_data   = test_df.copy()

train_data  = train_data.dropna()
test_data   = test_data.dropna()

# validate that no more missing values
assert train_data.isna().sum().sum()  == 0
assert test_data.isna().sum().sum()   == 0

In [23]:
best_hyperparams = run_optimization(train_data, max_iter = 5)
best_hyperparams["max_depth"] = int(best_hyperparams["max_depth"])

Mean RMSE: 0.769 (0.000)
Mean RMSE: 0.769 (0.000)
Mean RMSE: 0.769 (0.000)
Mean RMSE: 0.769 (0.000)
Mean RMSE: 0.769 (0.000)
100%|██████████| 5/5 [03:05<00:00, 37.06s/it, best loss: 0.7685453039260338]
hyperparameter values tested:
{'colsample_bytree': [0.6816935225125191], 'gamma': [2.3530317249155575], 'max_depth': [7.0], 'min_child_weight': [3.0], 'reg_alpha': [54.0], 'reg_lambda': [0.6759086758600674]}
{'colsample_bytree': [0.5250487643754203], 'gamma': [3.9792037378795797], 'max_depth': [16.0], 'min_child_weight': [5.0], 'reg_alpha': [160.0], 'reg_lambda': [0.16267151740961527]}
{'colsample_bytree': [0.8825472808870545], 'gamma': [4.400889331551983], 'max_depth': [7.0], 'min_child_weight': [4.0], 'reg_alpha': [63.0], 'reg_lambda': [0.16421765474436045]}
{'colsample_bytree': [0.820748338957688], 'gamma': [1.489200104759992], 'max_depth': [5.0], 'min_child_weight': [10.0], 'reg_alpha': [68.0], 'reg_lambda': [0.6723541787444508]}
{'colsample_bytree': [0.6938236243065641], 'gamma': [7

In [24]:
clf   = xgb.sklearn.XGBRegressor(
    tree_method = "gpu_hist", 
    **best_hyperparams
 )

train_score, test_score = full_train_and_test(train_data, test_data, clf, None)

results["rmse_train"].append(train_score)
results["rmse_test"].append(test_score)
results["strategy"].append("Drop rows")

[Pipeline] ................. (step 1 of 1) Processing m, total=   5.9s
RMSE train: 0.762, RMSE test: 0.761


### 3. Mean for continous, mode (most frequent) for categorical

In [25]:
train_data = train_df.copy()
test_data  = test_df.copy()

# calculate mean for continous and mode for categorical only on the train set
train_con_mean = train_data.iloc[:, ~train_data.columns.isin(cat_cols)].mean()
train_cat_mode =  train_data.iloc[:, train_data.columns.isin(cat_cols)].mode().iloc[0]

# fill train set 
train_data.iloc[:, ~train_data.columns.isin(cat_cols)] = train_data.iloc[:, ~train_data.columns.isin(cat_cols)].fillna(train_con_mean)
train_data.iloc[:, train_data.columns.isin(cat_cols)] = train_data.iloc[:, train_data.columns.isin(cat_cols)].fillna(train_cat_mode)

# fill test set based on train set
test_data.iloc[:, ~test_data.columns.isin(cat_cols)] = test_data.iloc[:, ~test_data.columns.isin(cat_cols)].fillna(train_con_mean)
test_data.iloc[:, test_data.columns.isin(cat_cols)] = test_data.iloc[:, test_data.columns.isin(cat_cols)].fillna(train_cat_mode)

assert train_data.isna().sum().sum() == 0
assert test_data.isna().sum().sum()  == 0

In [26]:
best_hyperparams = run_optimization(train_data, max_iter = 5)
best_hyperparams["max_depth"] = int(best_hyperparams["max_depth"])

Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
100%|██████████| 5/5 [04:31<00:00, 54.25s/it, best loss: 0.8048680971853616]
hyperparameter values tested:
{'colsample_bytree': [0.606354636889247], 'gamma': [4.088860313315708], 'max_depth': [16.0], 'min_child_weight': [5.0], 'reg_alpha': [164.0], 'reg_lambda': [0.02605314349040977]}
{'colsample_bytree': [0.6451216624639198], 'gamma': [5.037120888381173], 'max_depth': [4.0], 'min_child_weight': [6.0], 'reg_alpha': [160.0], 'reg_lambda': [0.40767742413659147]}
{'colsample_bytree': [0.7636746656723454], 'gamma': [5.515774925772813], 'max_depth': [14.0], 'min_child_weight': [1.0], 'reg_alpha': [171.0], 'reg_lambda': [0.25400479023864175]}
{'colsample_bytree': [0.6683348898446977], 'gamma': [6.716010290766364], 'max_depth': [8.0], 'min_child_weight': [3.0], 'reg_alpha': [75.0], 'reg_lambda': [0.10667315932414756]}
{'colsample_bytree': [0.7474013559307646], 'gamma': 

In [27]:
clf   = xgb.sklearn.XGBRegressor(
    tree_method = "gpu_hist", 
    **best_hyperparams
 )

train_score, test_score = full_train_and_test(train_data, test_data, clf, None)

results["rmse_train"].append(train_score)
results["rmse_test"].append(test_score)
results["strategy"].append("Mean & Mode")

[Pipeline] ................. (step 1 of 1) Processing m, total=   8.4s
RMSE train: 0.799, RMSE test: 0.798


### 4. Median for continous, mode (most frequent) for categorical

In [28]:
train_data = train_df.copy()
test_data  = test_df.copy()

# calculate median for continous and mode for categorical only on the train set
train_con_median = train_data.iloc[:, ~train_data.columns.isin(cat_cols)].median()
train_cat_mode =  train_data.iloc[:, train_data.columns.isin(cat_cols)].mode().iloc[0]

# fill train
train_data.iloc[:, ~train_data.columns.isin(cat_cols)] = train_data.iloc[:, ~train_data.columns.isin(cat_cols)].fillna(train_con_median)
train_data.iloc[:, train_data.columns.isin(cat_cols)] = train_data.iloc[:, train_data.columns.isin(cat_cols)].fillna(train_cat_mode)

# fill test based on train
test_data.iloc[:, ~test_data.columns.isin(cat_cols)] = test_data.iloc[:, ~test_data.columns.isin(cat_cols)].fillna(train_con_median)
test_data.iloc[:, test_data.columns.isin(cat_cols)] = test_data.iloc[:, test_data.columns.isin(cat_cols)].fillna(train_cat_mode)

assert train_data.isna().sum().sum() == 0
assert test_data.isna().sum().sum()  == 0

In [29]:
best_hyperparams = run_optimization(train_data, max_iter = 5)
best_hyperparams["max_depth"] = int(best_hyperparams["max_depth"])

Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
Mean RMSE: 0.805 (0.000)
100%|██████████| 5/5 [04:47<00:00, 57.57s/it, best loss: 0.804944449463564]
hyperparameter values tested:
{'colsample_bytree': [0.5526238130458375], 'gamma': [3.5031571203351692], 'max_depth': [16.0], 'min_child_weight': [2.0], 'reg_alpha': [150.0], 'reg_lambda': [0.5557449654199064]}
{'colsample_bytree': [0.5893982803801165], 'gamma': [8.63274052657396], 'max_depth': [13.0], 'min_child_weight': [8.0], 'reg_alpha': [71.0], 'reg_lambda': [0.24361167492016733]}
{'colsample_bytree': [0.7739606829237153], 'gamma': [6.340247902739706], 'max_depth': [12.0], 'min_child_weight': [0.0], 'reg_alpha': [145.0], 'reg_lambda': [0.19984112887126848]}
{'colsample_bytree': [0.9825973737933368], 'gamma': [4.455423960156235], 'max_depth': [3.0], 'min_child_weight': [3.0], 'reg_alpha': [69.0], 'reg_lambda': [0.7851383176193107]}
{'colsample_bytree': [0.6117604225523685], 'gamma': [3

In [30]:
clf   = xgb.sklearn.XGBRegressor(
    tree_method = "gpu_hist", 
    **best_hyperparams
 )

train_score, test_score = full_train_and_test(train_data, test_data, clf, None)

results["rmse_train"].append(train_score)
results["rmse_test"].append(test_score)
results["strategy"].append("Median & Mode")

[Pipeline] ................. (step 1 of 1) Processing m, total=   8.3s
RMSE train: 0.797, RMSE test: 0.798


### 5. KNN Imputation

For the next two imputation methods I did not used CV and Hyperparameters optimiziation because the training time took too long. So, I just trained the models and showed the results.

In [41]:
train_data = train_df.copy()
test_data  = test_df.copy()

clf = xgb.sklearn.XGBRegressor(tree_method = "gpu_hist")

# Categorical features cannot have an average hance I used 1 neighbor here
imputer_cat = KNNImputer(n_neighbors = 1)
imputer_cat.fit(train_data[cat_cols].iloc[:50000])
train_data[cat_cols] = imputer_cat.transform(train_data[cat_cols])
test_data[cat_cols]  = imputer_cat.transform(test_data[cat_cols])

# Continous features can have an average distance hence I used more than 1 neighbor
imputer_con     = KNNImputer(n_neighbors = 8, weights="uniform")

# get only continous data from train and test
con_data_train  = train_data.iloc[:, ~train_data.columns.isin(cat_cols)]
con_data_test   = test_data.iloc[:, ~test_data.columns.isin(cat_cols)]

# fit on train based on the first 50,000 rows and then transform train and test
imputer_con.fit(con_data_train.iloc[:50000])
train_data.iloc[:, ~train_data.columns.isin(cat_cols)] = imputer_con.transform(con_data_train)
test_data.iloc[:, ~test_data.columns.isin(cat_cols)]   = imputer_con.transform(con_data_test)

# evaluate model
train_score, test_score = full_train_and_test(train_data, test_data, clf, None)

results["rmse_train"].append(train_score)
results["rmse_test"].append(test_score)
results["strategy"].append("KNN method (k=8})")

[Pipeline] ................. (step 1 of 1) Processing m, total=   4.3s
RMSE train: 0.792, RMSE test: 0.801


### 6. Iterative Imputation

In [42]:
train_data = train_df.copy()
test_data  = test_df.copy()

clf  = xgb.XGBRegressor(tree_method = "gpu_hist")

# init
imputer = IterativeImputer(max_iter=10)

# fit on train based on the first 50,000 rows and then transform train and test
imputer.fit(train_data.iloc[:50000])
train_data[train_data.columns] = imputer.transform(train_data)
test_data[test_data.columns]   = imputer.transform(test_data)

train_score, test_score = full_train_and_test(train_data, test_data, clf, None)

results["rmse_train"].append(train_score)
results["rmse_test"].append(test_score)
results["strategy"].append("Iterative imputer")



[Pipeline] ................. (step 1 of 1) Processing m, total=   4.4s
RMSE train: 0.787, RMSE test: 0.795


### Comparison

In [48]:
comparison = pd.DataFrame(results).set_index("strategy").rename(columns = {"rmse_train": "RMSE Train", "rmse_test": "RMSE Test"})
comparison = comparison.round(3)

# Highlighting the minimum values of last 2 columns
comparison.style.highlight_min(color = 'green', axis = 0)

Unnamed: 0_level_0,RMSE Train,RMSE Test
strategy,Unnamed: 1_level_1,Unnamed: 2_level_1
No imputation,0.798,0.799
Drop rows,0.762,0.761
Mean & Mode,0.799,0.798
Median & Mode,0.797,0.798
KNN method (k=8}),0.792,0.801
Iterative imputer,0.787,0.795
