In [3]:
# Statistics
import pandas as pd
import numpy as np
import math as mt

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data Preprocessing - Standardization, Encoding, Imputation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Data Preprocessing - Feature Engineering
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

# Data Preprocessing - ML Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# ML - Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# ML - Evaluation
from sklearn.model_selection import cross_val_score

# ML - Tuning
from sklearn.model_selection import GridSearchCV
import optuna

In [4]:
# Read train dataset
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')
sample_submission = pd.read_csv('../input/30-days-of-ml/sample_submission.csv')

In [5]:
%%time
# target encoding
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
test_data = test_data[useful_features]

for col in cat_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
        X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
        feat = X_train.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        #print(feat)
        X_valid.loc[:, f"tar_enc_{col}"] = X_valid[col].map(feat)
        temp_df.append(X_valid)
        if temp_test_feat is None:
            temp_test_feat = test_data[col].map(feat)
        else:
            temp_test_feat += test_data[col].map(feat)
    temp_test_feat /= 5
    test_data.loc[:, f"tar_enc_{col}"] = temp_test_feat
    train_data = pd.concat(temp_df)
    
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if col.startswith("cat")]
test_data = test_data[useful_features]

CPU times: user 6.06 s, sys: 730 ms, total: 6.79 s
Wall time: 6.82 s


In [8]:
%%time

def run(trial):
    fold = 0
    # Hyperparameters for Optuna
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    
    # Target Encoding
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    #X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]

    # Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    #X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform

    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=7000, # we have early_stopping_rounds
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(X_train, y_train, 
              early_stopping_rounds=300, 
              eval_set=[(X_valid, y_valid)], verbose=1000)
    preds_valid = model.predict(X_valid)
    #test_preds = model.predict(X_test)
    #final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    return rmse

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [9]:
%%time
study = optuna.create_study(direction='minimize')
study.optimize(run, n_trials=5) # set n_triasl

[32m[I 2021-08-25 02:23:43,173][0m A new study created in memory with name: no-name-85e08a86-39bc-42cf-8002-3a7aae7a9600[0m


[0]	validation_0-rmse:7.44694
[1000]	validation_0-rmse:0.71986
[2000]	validation_0-rmse:0.71900
[2671]	validation_0-rmse:0.71903


[32m[I 2021-08-25 02:24:00,181][0m Trial 0 finished with value: 0.7189675700875123 and parameters: {'learning_rate': 0.04322192958897396, 'reg_lambda': 2.0712359017603326e-07, 'reg_alpha': 43.117837539469534, 'subsample': 0.3907645562736698, 'colsample_bytree': 0.2329778388025192, 'max_depth': 6}. Best is trial 0 with value: 0.7189675700875123.[0m


[0]	validation_0-rmse:7.69807
[1000]	validation_0-rmse:0.73009
[2000]	validation_0-rmse:0.72562
[3000]	validation_0-rmse:0.72336
[4000]	validation_0-rmse:0.72187
[5000]	validation_0-rmse:0.72079
[6000]	validation_0-rmse:0.72001
[6999]	validation_0-rmse:0.71943


[32m[I 2021-08-25 02:24:22,223][0m Trial 1 finished with value: 0.7194267651939508 and parameters: {'learning_rate': 0.010611982159756318, 'reg_lambda': 1.4024820263697298e-06, 'reg_alpha': 2.4850247913612537e-07, 'subsample': 0.7808792881961223, 'colsample_bytree': 0.19900067122025827, 'max_depth': 3}. Best is trial 0 with value: 0.7189675700875123.[0m


[0]	validation_0-rmse:6.43055
[1000]	validation_0-rmse:0.72598
[2000]	validation_0-rmse:0.72330
[3000]	validation_0-rmse:0.72210
[4000]	validation_0-rmse:0.72148
[5000]	validation_0-rmse:0.72099
[5908]	validation_0-rmse:0.72067


[32m[I 2021-08-25 02:24:35,086][0m Trial 2 finished with value: 0.7205383847882317 and parameters: {'learning_rate': 0.17531046557770022, 'reg_lambda': 0.00011200018229513348, 'reg_alpha': 6.296378980490636, 'subsample': 0.12074293373810374, 'colsample_bytree': 0.2172649214085463, 'max_depth': 1}. Best is trial 0 with value: 0.7189675700875123.[0m


[0]	validation_0-rmse:7.20646
[1000]	validation_0-rmse:0.72995
[2000]	validation_0-rmse:0.72747
[3000]	validation_0-rmse:0.72599
[4000]	validation_0-rmse:0.72497
[5000]	validation_0-rmse:0.72418
[6000]	validation_0-rmse:0.72346
[6999]	validation_0-rmse:0.72289


[32m[I 2021-08-25 02:24:50,898][0m Trial 3 finished with value: 0.7228869231654257 and parameters: {'learning_rate': 0.07443694940201989, 'reg_lambda': 3.12715674497084e-07, 'reg_alpha': 29.396597265086125, 'subsample': 0.5355849263649969, 'colsample_bytree': 0.4697687412835221, 'max_depth': 1}. Best is trial 0 with value: 0.7189675700875123.[0m


[0]	validation_0-rmse:6.89455
[1000]	validation_0-rmse:0.72786
[2000]	validation_0-rmse:0.72519
[3000]	validation_0-rmse:0.72366
[4000]	validation_0-rmse:0.72263
[5000]	validation_0-rmse:0.72191
[6000]	validation_0-rmse:0.72140
[6999]	validation_0-rmse:0.72115


[32m[I 2021-08-25 02:25:06,795][0m Trial 4 finished with value: 0.7210970603521394 and parameters: {'learning_rate': 0.11495813555984329, 'reg_lambda': 0.00047121305240065055, 'reg_alpha': 3.332074101256634e-05, 'subsample': 0.4724938177882402, 'colsample_bytree': 0.6755355165449973, 'max_depth': 1}. Best is trial 0 with value: 0.7189675700875123.[0m


CPU times: user 1min 23s, sys: 994 ms, total: 1min 24s
Wall time: 1min 23s


In [10]:
study.best_params

{'learning_rate': 0.04322192958897396,
 'reg_lambda': 2.0712359017603326e-07,
 'reg_alpha': 43.117837539469534,
 'subsample': 0.3907645562736698,
 'colsample_bytree': 0.2329778388025192,
 'max_depth': 6}

In [None]:
%%time
# With Standardization + Normalization
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform

    # Preprocessing - Standardization
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols]) # Q. The last transform
    
    # Preprocessing - Normalizatino
    normalizer = Normalizer()
    X_train[num_cols] = normalizer.fit_transform(X_train[num_cols])
    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])
    X_test[num_cols] = normalizer.transform(X_test[num_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# With Standardization
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Preprocessing - Standardization
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Log transformation + Tuning
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

for col in num_cols:
    train_data[col] = np.log1p(train_data[col])
    test_data[col] = np.log1p(test_data[col])

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# polynomial features + Tuning
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

poly = PolynomialFeatures(degree=2, 
                          interaction_only=True, # If true, only interaction features are produced: features that are products of at most degree distinct input features (so not x[1] ** 2, x[0] * x[2] ** 3, etc.).
                          include_bias=False)
train_poly = poly.fit_transform(train_data[num_cols])
test_poly = poly.fit_transform(test_data[num_cols])

df_train_poly = pd.DataFrame(train_poly, columns=[f"poly_{i}" for i in range(train_poly.shape[1])])
df_test_poly = pd.DataFrame(test_poly, columns=[f"poly_{i}" for i in range(test_poly.shape[1])])

train_data = pd.concat([train_data, df_train_poly], axis=1)
test_data = pd.concat([test_data, df_test_poly], axis=1)

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
test_data

In [None]:
%%time
# One-Hot Encoding + Ordinal Encoding + Tuning
# pd.cut 
# Model Tuning + drop cat2
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
oe_cols = ['cat9']
ohe_cols = cat_cols
ohe_cols.remove('cat9')
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[oe_cols] = ordinal_encoder.fit_transform(X_train[oe_cols])
    X_valid[oe_cols] = ordinal_encoder.transform(X_valid[oe_cols])
    X_test[oe_cols] = ordinal_encoder.transform(X_test[oe_cols]) # Q. The last transform
    
    # Preprocessing - One-Hot Encoding
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
    X_train_ohe = ohe.fit_transform(X_train[ohe_cols])
    X_valid_ohe = ohe.transform(X_valid[ohe_cols])
    X_test_ohe = ohe.transform(X_test[ohe_cols]) # Q. The last transform
    
    X_train_ohe = pd.DataFrame(X_train_ohe, columns=[f"ohe_{i}" for i in range(X_train_ohe.shape[1])])
    X_valid_ohe = pd.DataFrame(X_valid_ohe, columns=[f"ohe_{i}" for i in range(X_valid_ohe.shape[1])])
    X_test_ohe = pd.DataFrame(X_test_ohe, columns=[f"ohe_{i}" for i in range(X_test_ohe.shape[1])])
    
    X_train = pd.concat([X_train.drop(columns=ohe_cols), X_train_ohe], axis=1)
    X_valid = pd.concat([X_valid.drop(columns=ohe_cols), X_valid_ohe], axis=1)
    X_test = pd.concat([X_test.drop(columns=ohe_cols), X_test_ohe], axis=1)

    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# Model Tuning + drop cat2
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold", "cat2")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    #model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# Model Tuning + drop cat2, cat6
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold", "cat2", "cat6")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    #model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# Tuning + Standardization
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Preprocessing - Standardization
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    #model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Only Model Tuning
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Export submission.csv
preds = np.mean(np.column_stack(final_predictions), axis=1)
preds = pd.DataFrame({'id': sample_submission.id, 'target': preds})
preds.to_csv('submission.csv', index=False)