In [1]:
# Statistics
import pandas as pd
import numpy as np
import math as mt

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data Preprocessing - Standardization, Encoding, Imputation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Data Preprocessing - Feature Engineering
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

# Data Preprocessing - ML Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# ML - Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# ML - Evaluation
from sklearn.model_selection import cross_val_score

# ML - Tuning
from sklearn.model_selection import GridSearchCV

In [20]:
# Load the fold-annotated training set, the competition test set and the
# sample submission template (in that order).
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/30days-folds/train_folds.csv',
        '../input/30-days-of-ml/test.csv',
        '../input/30-days-of-ml/sample_submission.csv',
    )
)

In [21]:
%%time
# Out-of-fold target encoding of the categorical columns.
# For every categorical column, each validation fold receives the per-category mean
# of `target` computed on the other four folds; the test set receives the average
# of the five per-fold mappings.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") keeps previously created "tar_enc_cat*" columns out of cat_cols,
# so re-running this cell does not target-encode its own output.
cat_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() gives test_data its own storage so the column assignments below do not
# write into a slice of the original frame (SettingWithCopyWarning).
test_data = test_data[useful_features].copy()

final_predictions = []
scores = []

for col in cat_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
        X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
        # Category -> mean target, learned only from the training folds.
        # NOTE(review): a category absent from the training folds maps to NaN.
        feat = X_train.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        X_valid.loc[:, f"tar_enc_{col}"] = X_valid[col].map(feat)
        temp_df.append(X_valid)
        if temp_test_feat is None:
            temp_test_feat = test_data[col].map(feat)
        else:
            temp_test_feat += test_data[col].map(feat)
    # Average the five per-fold encodings for the test set
    temp_test_feat /= 5
    test_data.loc[:, f"tar_enc_{col}"] = temp_test_feat
    # ignore_index=True: every fold frame was re-indexed from 0, so a plain concat
    # would leave duplicate index labels in train_data.
    train_data = pd.concat(temp_df, ignore_index=True)

CPU times: user 6.54 s, sys: 454 ms, total: 7 s
Wall time: 7 s


In [22]:
# Preview the first rows of train_data, which now include the tar_enc_* columns
train_data.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,tar_enc_cat0,tar_enc_cat1,tar_enc_cat2,tar_enc_cat3,tar_enc_cat4,tar_enc_cat5,tar_enc_cat6,tar_enc_cat7,tar_enc_cat8,tar_enc_cat9
0,1,B,B,B,C,B,B,A,E,C,...,8.245979,8.20385,8.22478,8.236717,8.240572,8.229516,8.240567,8.240285,8.280709,8.249782
1,8,B,A,A,A,B,D,A,E,C,...,8.245979,8.276689,8.244491,8.274495,8.240572,8.250754,8.240567,8.240285,8.280709,8.259165
2,13,A,B,A,C,B,B,A,E,A,...,8.23897,8.20385,8.244491,8.236717,8.240572,8.229516,8.240567,8.240285,8.230681,8.249782
3,14,B,B,A,C,B,D,A,E,C,...,8.245979,8.20385,8.244491,8.236717,8.240572,8.250754,8.240567,8.240285,8.280709,8.234356
4,25,B,B,A,C,B,D,A,E,C,...,8.245979,8.20385,8.244491,8.236717,8.240572,8.250754,8.240567,8.240285,8.280709,8.259165


In [23]:
# Rebuild the feature lists from the current train_data columns; only columns whose
# name begins with "cat" count as categorical, and test_data is reduced to the
# feature columns in the same order.
useful_features = [name for name in train_data.columns if name not in {"id", "target", "kfold"}]
cat_cols = list(filter(lambda name: name.startswith("cat"), useful_features))
test_data = test_data.loc[:, useful_features]

In [24]:
%%time
# Baseline 5-fold XGBoost run on the target-encoded feature set.
# Uses useful_features / cat_cols as defined in the preceding cell.

# Start from empty result lists so re-running this cell does not keep appending
# to results collected by an earlier run.
final_predictions = []
scores = []

for fold in range(5):
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    # instead of slices (avoids SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Ordinal encoding: fit on the training folds only, then reuse the fitted
    # encoder for the validation fold and the test set.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

0 0.7242269269359577
1 0.7240495957228682
2 0.7255552929019189
3 0.7307805951556532
4 0.7923764903298341
0.7393977802092464 0.026602013514808294


In [25]:
# Display X_train as left by the final iteration of the fold loop above
X_train

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,tar_enc_cat0,tar_enc_cat1,tar_enc_cat2,tar_enc_cat3,tar_enc_cat4,tar_enc_cat5,tar_enc_cat6,tar_enc_cat7,tar_enc_cat8,tar_enc_cat9
0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,4.0,2.0,13.0,...,8.245979,8.203850,8.224780,8.236717,8.240572,8.229516,8.240567,8.240285,8.280709,8.249782
1,1.0,0.0,0.0,0.0,1.0,3.0,0.0,4.0,2.0,5.0,...,8.245979,8.276689,8.244491,8.274495,8.240572,8.250754,8.240567,8.240285,8.280709,8.259165
2,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,0.0,13.0,...,8.238970,8.203850,8.244491,8.236717,8.240572,8.229516,8.240567,8.240285,8.230681,8.249782
3,1.0,1.0,0.0,2.0,1.0,3.0,0.0,4.0,2.0,6.0,...,8.245979,8.203850,8.244491,8.236717,8.240572,8.250754,8.240567,8.240285,8.280709,8.234356
4,1.0,1.0,0.0,2.0,1.0,3.0,0.0,4.0,2.0,5.0,...,8.245979,8.203850,8.244491,8.236717,8.240572,8.250754,8.240567,8.240285,8.280709,8.259165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239995,0.0,0.0,0.0,2.0,1.0,3.0,1.0,4.0,6.0,5.0,...,8.239318,8.278803,8.245484,8.237347,8.241200,8.251611,8.264177,8.240682,8.257484,8.259797
239996,0.0,1.0,0.0,2.0,1.0,3.0,0.0,4.0,0.0,5.0,...,8.239318,8.203374,8.245484,8.237347,8.241200,8.251611,8.241397,8.240682,8.230168,8.259797
239997,1.0,0.0,0.0,2.0,1.0,1.0,0.0,4.0,4.0,8.0,...,8.247711,8.278803,8.245484,8.237347,8.241200,8.230389,8.241397,8.240682,8.192147,8.223627
239998,0.0,1.0,0.0,2.0,1.0,1.0,0.0,4.0,4.0,5.0,...,8.239318,8.203374,8.245484,8.237347,8.241200,8.230389,8.241397,8.240682,8.192147,8.259797


In [None]:
%%time
# Experiment: ordinal encoding + standardization + normalization, default XGBoost.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy; the global test_data is left untouched.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Standardization (statistics come from the training folds only)
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Preprocessing - Normalization
    normalizer = Normalizer()
    X_train[num_cols] = normalizer.fit_transform(X_train[num_cols])
    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])
    X_test[num_cols] = normalizer.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: ordinal encoding + standardization, default XGBoost.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy; the global test_data is left untouched.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Standardization (statistics come from the training folds only)
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: log1p-transformed continuous features + tuned XGBoost.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

# Apply log1p to copies instead of overwriting train_data / test_data. The cell
# previously mutated the shared frames (each re-run applied log1p again), which is
# why it ended with a reminder to reset the dataframe; that is no longer needed.
train_log = train_data.copy()
test_log = test_data[useful_features].copy()
train_log[num_cols] = np.log1p(train_log[num_cols])
test_log[num_cols] = np.log1p(test_log[num_cols])

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_log[train_log.kfold != fold].reset_index(drop=True)
    X_valid = train_log[train_log.kfold == fold].reset_index(drop=True)
    X_test = test_log.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: degree-2 interaction features on the continuous columns + tuned XGBoost.
# This cell appends poly_* columns to train_data and test_data.

# Drop poly_* columns left by an earlier run so re-executing the cell does not
# duplicate them, and give both frames a fresh 0..n-1 index: train_data was built by
# concatenating per-fold frames that were each re-indexed from 0, and a column-wise
# concat needs unique, matching index labels.
train_data = train_data.loc[:, [col for col in train_data.columns if not col.startswith("poly_")]].reset_index(drop=True)
test_data = test_data.loc[:, [col for col in test_data.columns if not col.startswith("poly_")]].reset_index(drop=True)

num_cols = [col for col in train_data.columns if col.startswith('cont')]

# interaction_only=True: only products of distinct input features are generated
# (no x[i] ** 2 terms); include_bias=False: no constant column.
poly = PolynomialFeatures(degree=2,
                          interaction_only=True,
                          include_bias=False)
train_poly = poly.fit_transform(train_data[num_cols])
# transform (not fit_transform): the test set reuses the transformer fitted on train
test_poly = poly.transform(test_data[num_cols])

# Build the feature frames on the same index as the frames they are joined to
df_train_poly = pd.DataFrame(train_poly,
                             columns=[f"poly_{i}" for i in range(train_poly.shape[1])],
                             index=train_data.index)
df_test_poly = pd.DataFrame(test_poly,
                            columns=[f"poly_{i}" for i in range(test_poly.shape[1])],
                            index=test_data.index)

train_data = pd.concat([train_data, df_train_poly], axis=1)
test_data = pd.concat([test_data, df_test_poly], axis=1)

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

# train_data / test_data now carry the poly_* columns; reload them before running
# an experiment that should not see these features.
print('You need to reset dataframe!')

In [None]:
# Display test_data after the polynomial-feature cell above
test_data

In [None]:
%%time
# Experiment: one-hot encoding for every categorical column except cat9 (which is
# ordinal-encoded) + tuned XGBoost.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns are
# numeric features and must not be one-hot or ordinal encoded.
cat_cols = [col for col in useful_features if col.startswith("cat")]
oe_cols = ['cat9']
# Built as a new list so cat_cols itself is not modified
ohe_cols = [col for col in cat_cols if col not in oe_cols]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy; the global test_data is left untouched,
    # so no dataframe reset is needed after this cell.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[oe_cols] = ordinal_encoder.fit_transform(X_train[oe_cols])
    X_valid[oe_cols] = ordinal_encoder.transform(X_valid[oe_cols])
    X_test[oe_cols] = ordinal_encoder.transform(X_test[oe_cols])

    # Preprocessing - One-Hot Encoding (fit on the training folds only).
    # The encoder's default sparse output is densified with .toarray(), which avoids
    # the `sparse=` keyword that newer scikit-learn releases renamed.
    ohe = OneHotEncoder(handle_unknown="ignore")
    X_train_ohe = ohe.fit_transform(X_train[ohe_cols]).toarray()
    X_valid_ohe = ohe.transform(X_valid[ohe_cols]).toarray()
    X_test_ohe = ohe.transform(X_test[ohe_cols]).toarray()

    # Reuse each frame's own index so the column-wise concat below lines rows up
    ohe_names = [f"ohe_{i}" for i in range(X_train_ohe.shape[1])]
    X_train_ohe = pd.DataFrame(X_train_ohe, columns=ohe_names, index=X_train.index)
    X_valid_ohe = pd.DataFrame(X_valid_ohe, columns=ohe_names, index=X_valid.index)
    X_test_ohe = pd.DataFrame(X_test_ohe, columns=ohe_names, index=X_test.index)

    # Replace the raw categorical columns with their one-hot columns
    X_train = pd.concat([X_train.drop(columns=ohe_cols), X_train_ohe], axis=1)
    X_valid = pd.concat([X_valid.drop(columns=ohe_cols), X_valid_ohe], axis=1)
    X_test = pd.concat([X_test.drop(columns=ohe_cols), X_test_ohe], axis=1)

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: tuned XGBoost with the cat2 column removed from the feature set.
# NOTE(review): tar_enc_cat2 is still among the features; confirm whether it
# should be dropped together with cat2.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold", "cat2")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy instead of overwriting test_data: the
    # global frame keeps cat2, so later cells that still use it keep working and
    # no dataframe reset is needed after this cell.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: tuned XGBoost with the cat2 and cat6 columns removed from the feature set.
# NOTE(review): tar_enc_cat2 / tar_enc_cat6 are still among the features; confirm
# whether they should be dropped together with cat2 / cat6.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold", "cat2", "cat6")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy instead of overwriting test_data: the
    # global frame keeps cat2 / cat6, so later cells that still use them keep
    # working and no dataframe reset is needed after this cell.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: ordinal encoding + standardization + tuned XGBoost.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy; the global test_data is left untouched.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Standardization (statistics come from the training folds only)
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Experiment: ordinal encoding + tuned XGBoost (no other preprocessing).
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith("cat") rather than `"cat" in col`: the float tar_enc_cat* columns must
# not reach the OrdinalEncoder. Their values are computed per fold (and averaged for
# the test set), so transform() would encounter categories it never saw in fit().
cat_cols = [col for col in useful_features if col.startswith("cat")]
num_cols = [col for col in useful_features if col.startswith('cont')]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    # Select the feature columns on a copy; the global test_data is left untouched.
    X_test = test_data[useful_features].copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # .copy() so the column assignments below write to independent frames
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding (fit on the training folds only)
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    # RMSE via np.sqrt so the result does not depend on the `squared=` keyword,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))

    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
# Write submission.csv: stack the per-fold test predictions as columns, average
# them row-wise, and pair the result with the ids of the sample submission.
preds = np.column_stack(final_predictions).mean(axis=1)
preds = pd.DataFrame({'id': sample_submission.id, 'target': preds})
preds.to_csv('submission.csv', index=False)