In [1]:
# Statistics
import pandas as pd
import numpy as np
import math as mt

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data Preprocessing - Standardization, Encoding, Imputation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Data Preprocessing - Feature Engineering
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

# Data Preprocessing - ML Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# ML - Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# ML - Evaluation
from sklearn.model_selection import cross_val_score

# ML - Tuning
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the sample submission file; its `id` column is reused further down when the
# per-model test predictions and the final submission are written to CSV.
sample_submission = pd.read_csv('../input/30-days-of-ml/sample_submission.csv')

In [4]:
%%time
# Baseline: ordinal-encode the categorical columns and fit one XGBoost model per fold.
# Writes the out-of-fold predictions to train_pred_1.csv and the fold-averaged test
# predictions to test_pred_1.csv.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

# Every column except the row id, the label and the fold assignment is a model input.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
test_data = test_data[useful_features]

# The hyper-parameters are identical for every fold, so they are built once, outside the loop.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3
}

final_valid_predictions = {}  # id -> out-of-fold prediction
final_test_predictions = []   # one array of test predictions per fold
scores = []                   # validation RMSE per fold

for fold in range(5):
    # Preprocessing: rows with kfold == fold are held out, all other rows are used for fitting.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    X_valid_ids = X_valid.id.values.tolist()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Ordinal Encoding: the mapping is fitted on the training folds only and then
    # reused unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training: the held-out fold doubles as the evaluation set for early stopping.
    model = XGBRegressor(
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        **params
    )
    model.fit(X_train, y_train, early_stopping_rounds=300, eval_set=[(X_valid, y_valid)], verbose=1000)

    # Evaluation and Inference
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)

    final_valid_predictions.update(dict(zip(X_valid_ids, preds_valid)))
    final_test_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Persist the out-of-fold predictions keyed by id for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Average the per-fold test predictions row by row before saving them.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
preds = pd.DataFrame({'id': sample_submission.id, 'pred_1': preds})
preds.to_csv("test_pred_1.csv", index=False)

[0]	validation_0-rmse:7.50031
[1000]	validation_0-rmse:0.72395
[2000]	validation_0-rmse:0.72000
[3000]	validation_0-rmse:0.71849
[4000]	validation_0-rmse:0.71770
[5000]	validation_0-rmse:0.71730
[6000]	validation_0-rmse:0.71705
[7000]	validation_0-rmse:0.71693
[8000]	validation_0-rmse:0.71688
[8522]	validation_0-rmse:0.71687
0 0.7168573307459428
[0]	validation_0-rmse:7.49707
[1000]	validation_0-rmse:0.72356
[2000]	validation_0-rmse:0.71967
[3000]	validation_0-rmse:0.71825
[4000]	validation_0-rmse:0.71751
[5000]	validation_0-rmse:0.71714
[6000]	validation_0-rmse:0.71696
[7000]	validation_0-rmse:0.71688
[8000]	validation_0-rmse:0.71682
[8063]	validation_0-rmse:0.71681
1 0.7168120224667079
[0]	validation_0-rmse:7.49474
[1000]	validation_0-rmse:0.72527
[2000]	validation_0-rmse:0.72145
[3000]	validation_0-rmse:0.71996
[4000]	validation_0-rmse:0.71932
[5000]	validation_0-rmse:0.71894
[6000]	validation_0-rmse:0.71872
[7000]	validation_0-rmse:0.71863
[7931]	validation_0-rmse:0.71862
2 0.718599

In [5]:
%%time
# Model 2: polynomial (interaction) features on the continuous columns + XGBoost.
# Writes the out-of-fold predictions to train_pred_2.csv and the fold-averaged test
# predictions to test_pred_2.csv.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith("cont")]
test_data = test_data[useful_features]

# interaction_only=True: only products of distinct input features are produced
# (so not x[1] ** 2, x[0] * x[2] ** 3, etc.).
poly = PolynomialFeatures(degree=3,
                          interaction_only=True,
                          include_bias=False)
train_poly = poly.fit_transform(train_data[num_cols])
# Reuse the transformer fitted on the training data instead of re-fitting it on test.
test_poly = poly.transform(test_data[num_cols])

df_train_poly = pd.DataFrame(train_poly, columns=[f"poly_{i}" for i in range(train_poly.shape[1])])
df_test_poly = pd.DataFrame(test_poly, columns=[f"poly_{i}" for i in range(test_poly.shape[1])])

train_data = pd.concat([train_data, df_train_poly], axis=1)
test_data = pd.concat([test_data, df_test_poly], axis=1)

# Recompute the feature list so that it includes the new poly_* columns.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
test_data = test_data[useful_features]

# The hyper-parameters are identical for every fold, so they are built once, outside the loop.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3
}

final_valid_predictions = {}  # id -> out-of-fold prediction
final_test_predictions = []   # one array of test predictions per fold
scores = []                   # validation RMSE per fold

for fold in range(5):
    # Preprocessing: rows with kfold == fold are held out, all other rows are used for fitting.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    X_valid_ids = X_valid.id.values.tolist()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Ordinal Encoding: the mapping is fitted on the training folds only and then
    # reused unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training: the held-out fold doubles as the evaluation set for early stopping.
    model = XGBRegressor(
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        **params
    )
    model.fit(X_train, y_train, early_stopping_rounds=300, eval_set=[(X_valid, y_valid)], verbose=1000)

    # Evaluation and Inference
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)

    final_valid_predictions.update(dict(zip(X_valid_ids, preds_valid)))
    final_test_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Persist the out-of-fold predictions keyed by id for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

# Average the per-fold test predictions row by row before saving them.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
preds = pd.DataFrame({'id': sample_submission.id, 'pred_2': preds})
preds.to_csv("test_pred_2.csv", index=False)

[0]	validation_0-rmse:7.50030
[1000]	validation_0-rmse:0.72375
[2000]	validation_0-rmse:0.72144
[3000]	validation_0-rmse:0.72094
[4000]	validation_0-rmse:0.72078
[4261]	validation_0-rmse:0.72083
0 0.720767827951469
[0]	validation_0-rmse:7.49705
[1000]	validation_0-rmse:0.72352
[2000]	validation_0-rmse:0.72124
[3000]	validation_0-rmse:0.72064
[4000]	validation_0-rmse:0.72047
[4219]	validation_0-rmse:0.72045
1 0.7204373982348939
[0]	validation_0-rmse:7.49473
[1000]	validation_0-rmse:0.72539
[2000]	validation_0-rmse:0.72322
[3000]	validation_0-rmse:0.72267
[3851]	validation_0-rmse:0.72263
2 0.7225806933088028
[0]	validation_0-rmse:7.49707
[1000]	validation_0-rmse:0.72515
[2000]	validation_0-rmse:0.72277
[3000]	validation_0-rmse:0.72218
[3625]	validation_0-rmse:0.72220
3 0.72214507339082
[0]	validation_0-rmse:7.50251
[1000]	validation_0-rmse:0.72449
[2000]	validation_0-rmse:0.72208
[3000]	validation_0-rmse:0.72151
[3571]	validation_0-rmse:0.72143
4 0.7214106603740207
0.7214683306520013 0.0

In [6]:
%%time
# Model 3: target encoding of the categorical columns + XGBoost.
# Writes the out-of-fold predictions to train_pred_3.csv and the fold-averaged test
# predictions to test_pred_3.csv.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
# Copy so that the tar_enc_* columns added below go into an independent frame
# rather than a slice of the raw test data.
test_data = test_data[useful_features].copy()

for col in cat_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
        X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
        # Mean target per category, computed only from the rows outside this fold.
        feat = X_train.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        X_valid[f"tar_enc_{col}"] = X_valid[col].map(feat)
        temp_df.append(X_valid)
        # The test encoding accumulates the five per-fold mappings ...
        if temp_test_feat is None:
            temp_test_feat = test_data[col].map(feat)
        else:
            temp_test_feat += test_data[col].map(feat)

    # ... and is then averaged over the five folds.
    temp_test_feat /= 5
    test_data[f"tar_enc_{col}"] = temp_test_feat
    # Reassemble the training frame from the per-fold pieces; ignore_index avoids
    # the duplicate index labels left behind by the per-fold reset_index calls.
    train_data = pd.concat(temp_df, ignore_index=True)

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
# startswith keeps the numeric tar_enc_cat* columns out of the ordinal encoder.
cat_cols = [col for col in useful_features if col.startswith("cat")]
test_data = test_data[useful_features]

# The hyper-parameters are identical for every fold, so they are built once, outside the loop.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3
}

final_valid_predictions = {}  # id -> out-of-fold prediction
final_test_predictions = []   # one array of test predictions per fold
scores = []                   # validation RMSE per fold

for fold in range(5):
    # Preprocessing: rows with kfold == fold are held out, all other rows are used for fitting.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    X_valid_ids = X_valid.id.values.tolist()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Ordinal Encoding: the mapping is fitted on the training folds only and then
    # reused unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training: the held-out fold doubles as the evaluation set for early stopping.
    model = XGBRegressor(
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        **params
    )
    model.fit(X_train, y_train, early_stopping_rounds=300, eval_set=[(X_valid, y_valid)], verbose=1000)

    # Evaluation and Inference
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)

    final_valid_predictions.update(dict(zip(X_valid_ids, preds_valid)))
    final_test_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Persist the out-of-fold predictions keyed by id for the stacking step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("train_pred_3.csv", index=False)

# Average the per-fold test predictions row by row before saving them.
preds = np.mean(np.column_stack(final_test_predictions), axis=1)
preds = pd.DataFrame({'id': sample_submission.id, 'pred_3': preds})
preds.to_csv("test_pred_3.csv", index=False)

[0]	validation_0-rmse:7.50029
[1000]	validation_0-rmse:0.72337
[2000]	validation_0-rmse:0.71989
[3000]	validation_0-rmse:0.71849
[4000]	validation_0-rmse:0.71785
[5000]	validation_0-rmse:0.71751
[6000]	validation_0-rmse:0.71737
[7000]	validation_0-rmse:0.71730
[7745]	validation_0-rmse:0.71730
0 0.7172629713248143
[0]	validation_0-rmse:7.49700
[1000]	validation_0-rmse:0.72288
[2000]	validation_0-rmse:0.71932
[3000]	validation_0-rmse:0.71804
[4000]	validation_0-rmse:0.71753
[5000]	validation_0-rmse:0.71726
[6000]	validation_0-rmse:0.71708
[7000]	validation_0-rmse:0.71695
[7404]	validation_0-rmse:0.71696
1 0.7169390974696654
[0]	validation_0-rmse:7.49478
[1000]	validation_0-rmse:0.72462
[2000]	validation_0-rmse:0.72122
[3000]	validation_0-rmse:0.71988
[4000]	validation_0-rmse:0.71934
[5000]	validation_0-rmse:0.71904
[6000]	validation_0-rmse:0.71898
[6760]	validation_0-rmse:0.71897
2 0.7189471800619452
[0]	validation_0-rmse:7.49701
[1000]	validation_0-rmse:0.72454
[2000]	validation_0-rmse:

In [7]:
# Stacking inputs: reload the raw frames and attach each base model's saved predictions.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

# Out-of-fold predictions for the training rows, one file per base model.
df_train1, df_train2, df_train3 = [
    pd.read_csv(path)
    for path in ('train_pred_1.csv', 'train_pred_2.csv', 'train_pred_3.csv')
]

# Fold-averaged predictions for the test rows, one file per base model.
df_test1, df_test2, df_test3 = [
    pd.read_csv(path)
    for path in ('test_pred_1.csv', 'test_pred_2.csv', 'test_pred_3.csv')
]

# Left joins on id keep every original row and add the pred_1..pred_3 columns in order.
for oof_frame in (df_train1, df_train2, df_train3):
    train_data = train_data.merge(oof_frame, on="id", how="left")

for test_frame in (df_test1, df_test2, df_test3):
    test_data = test_data.merge(test_frame, on="id", how="left")

In [8]:
# Inspect the merged training frame: the original columns plus pred_1, pred_2 and pred_3.
train_data

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2,pred_3
0,1,B,B,B,C,B,B,A,E,C,...,0.267559,0.237281,0.377873,0.322401,0.869850,8.113634,0,8.445581,8.404875,8.448442
1,2,B,B,A,A,B,D,A,F,A,...,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2,8.374415,8.277811,8.316208
2,3,A,A,A,C,B,D,A,D,A,...,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4,8.195627,8.111172,8.139681
3,4,B,B,A,C,B,D,A,E,C,...,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253,3,8.388680,8.305612,8.374855
4,6,A,A,A,C,B,D,A,E,A,...,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260,1,8.258226,8.220263,8.256620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,B,B,A,A,B,D,A,E,A,...,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605,4,8.312504,8.330939,8.255457
299996,499996,A,B,A,C,B,B,A,E,E,...,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118,3,7.700157,7.885316,7.723053
299997,499997,B,B,A,C,B,C,A,E,G,...,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755,1,8.261337,8.171155,8.211826
299998,499998,A,B,A,C,B,B,A,E,E,...,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569,3,8.095134,8.106153,8.101049


In [9]:
# Inspect the merged test frame: the original columns plus pred_1, pred_2 and pred_3.
test_data

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont7,cont8,cont9,cont10,cont11,cont12,cont13,pred_1,pred_2,pred_3
0,0,B,B,B,C,B,B,A,E,E,...,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702,8.088900,8.009207,8.085726
1,5,A,B,A,C,B,C,A,E,C,...,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.693940,8.409900,8.292212,8.392296
2,15,B,A,A,A,B,B,A,E,D,...,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099,8.427935,8.390079,8.416870
3,16,B,B,A,C,B,D,A,E,A,...,0.644315,1.024017,0.391090,0.988340,0.411828,0.393585,0.461372,8.518767,8.511569,8.521971
4,17,B,B,A,C,B,C,A,E,C,...,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412,8.164892,8.156339,8.169494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,499987,B,A,A,C,B,D,A,E,E,...,1.028978,1.022741,0.683903,0.877273,0.532410,0.605397,0.884581,7.996167,8.063257,8.000681
199996,499990,B,A,A,C,B,B,A,E,C,...,0.359871,0.550013,0.492082,0.202295,0.416875,0.406205,0.758665,8.479431,8.399576,8.502170
199997,499991,A,B,B,C,B,B,A,E,C,...,0.317185,0.150340,0.122109,0.390524,0.334026,0.378987,0.839416,8.473834,8.430684,8.499910
199998,499994,A,A,A,C,B,D,A,D,A,...,0.901241,0.555339,0.844315,0.894193,0.794102,0.844279,0.890473,8.152552,8.163454,8.129901


In [10]:
from sklearn.linear_model import LinearRegression

# Blending step: a linear regression is fitted on the three base-model prediction
# columns, once per fold, and its test predictions are collected for averaging.
useful_features = ["pred_1", "pred_2", "pred_3"]
test_data = test_data[useful_features]

final_predictions, scores = [], []

for fold in range(5):
    # Rows assigned to this fold are held out; the remaining rows fit the blender.
    is_valid = train_data.kfold == fold
    X_train = train_data[~is_valid].reset_index(drop=True)
    X_valid = train_data[is_valid].reset_index(drop=True)
    X_test = test_data.copy()

    y_train, y_valid = X_train.target, X_valid.target

    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]

    model = LinearRegression()
    model.fit(X_train, y_train)

    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)

    # Score the blender on the held-out fold.
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7168366494082297
1 0.7168093752166348
2 0.7185861690748995
3 0.7186226645301521
4 0.7173122170066967
0.7176334150473226 0.0008128222541052264


In [11]:
# Export submission.csv
# Average the blender's per-fold test predictions row by row, then write them out
# next to the ids taken from the sample submission.
stacked_test_preds = np.column_stack(final_predictions)
preds = pd.DataFrame({'id': sample_submission.id, 'target': stacked_test_preds.mean(axis=1)})
preds.to_csv('submission.csv', index=False)

In [None]:
%%time
# With Standardization
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError when the notebook is run top to bottom.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Standardization: statistics come from the training folds only.
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# With Normalization
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError when the notebook is run top to bottom.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Normalization
    # NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm rather
    # than each column - confirm that per-row scaling is what this experiment intends.
    normalizer = Normalizer()
    X_train[num_cols] = normalizer.fit_transform(X_train[num_cols])
    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])
    X_test[num_cols] = normalizer.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# With Standardization + Normalization
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError when the notebook is run top to bottom.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Standardization: statistics come from the training folds only.
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Preprocessing - Normalization, applied on top of the standardized columns.
    # NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm rather
    # than each column - confirm that per-row scaling is what this experiment intends.
    normalizer = Normalizer()
    X_train[num_cols] = normalizer.fit_transform(X_train[num_cols])
    X_valid[num_cols] = normalizer.transform(X_valid[num_cols])
    X_test[num_cols] = normalizer.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# With Standardization (same experiment as the earlier standardization cell)
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError when the notebook is run top to bottom.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Preprocessing - Standardization: statistics come from the training folds only.
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Log transformation + Tuning
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError. Reloading also makes this cell safe to re-run, because the
# log transform below overwrites the continuous columns in place.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
# Copy so that the column assignments below write into an independent frame.
test_data = test_data[useful_features].copy()

# log1p(x) = log(1 + x), applied to every continuous column of both frames.
for col in num_cols:
    train_data[col] = np.log1p(train_data[col])
    test_data[col] = np.log1p(test_data[col])

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# train_data and test_data now hold log-transformed columns; cells that expect the raw
# values must reload them first.
print('You need to reset dataframe!')

In [None]:
%%time
# polynomial features + Tuning
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError. Reloading also keeps poly_* columns from piling up on re-runs.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

# interaction_only=True: only products of distinct input features are produced
# (so not x[1] ** 2, x[0] * x[2] ** 3, etc.).
poly = PolynomialFeatures(degree=2,
                          interaction_only=True,
                          include_bias=False)
train_poly = poly.fit_transform(train_data[num_cols])
# Reuse the transformer fitted on the training data instead of re-fitting it on test.
test_poly = poly.transform(test_data[num_cols])

df_train_poly = pd.DataFrame(train_poly, columns=[f"poly_{i}" for i in range(train_poly.shape[1])])
df_test_poly = pd.DataFrame(test_poly, columns=[f"poly_{i}" for i in range(test_poly.shape[1])])

train_data = pd.concat([train_data, df_train_poly], axis=1)
test_data = pd.concat([test_data, df_test_poly], axis=1)

# Recompute the feature list so that it includes the new poly_* columns.
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
test_data = test_data[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# train_data and test_data now carry the extra poly_* columns; cells that expect the raw
# frames must reload them first.
print('You need to reset dataframe!')

In [None]:
# Inspect the current contents of test_data.
test_data

In [None]:
%%time
# One-Hot Encoding + Ordinal Encoding + Tuning
# cat9 is ordinal-encoded; every other categorical column is one-hot encoded.
# Start from freshly loaded frames: the stacking cells add pred_* columns to train_data
# and reduce test_data to those columns, so the column selection below would otherwise
# fail with a KeyError when the notebook is run top to bottom.
train_data = pd.read_csv('../input/30days-folds/train_folds.csv')
test_data = pd.read_csv('../input/30-days-of-ml/test.csv')

useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
oe_cols = ['cat9']
# Build a separate list: aliasing cat_cols and calling .remove() would silently
# drop cat9 from cat_cols as well.
ohe_cols = [col for col in cat_cols if col not in oe_cols]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

for fold in range(5):
    # Preprocessing - Kfold: rows with kfold == fold are held out for validation.
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Explicit copies: the column assignments below must write into independent frames,
    # not into slices of the fold frames (avoids pandas' SettingWithCopyWarning).
    X_train = X_train[useful_features].copy()
    X_valid = X_valid[useful_features].copy()

    # Preprocessing - Ordinal Encoding: fitted on the training folds only, then reused
    # unchanged for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[oe_cols] = ordinal_encoder.fit_transform(X_train[oe_cols])
    X_valid[oe_cols] = ordinal_encoder.transform(X_valid[oe_cols])
    X_test[oe_cols] = ordinal_encoder.transform(X_test[oe_cols])

    # Preprocessing - One-Hot Encoding: fitted on the training folds only;
    # handle_unknown="ignore" is set for categories that were not seen during fit.
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
    X_train_ohe = ohe.fit_transform(X_train[ohe_cols])
    X_valid_ohe = ohe.transform(X_valid[ohe_cols])
    X_test_ohe = ohe.transform(X_test[ohe_cols])

    X_train_ohe = pd.DataFrame(X_train_ohe, columns=[f"ohe_{i}" for i in range(X_train_ohe.shape[1])])
    X_valid_ohe = pd.DataFrame(X_valid_ohe, columns=[f"ohe_{i}" for i in range(X_valid_ohe.shape[1])])
    X_test_ohe = pd.DataFrame(X_test_ohe, columns=[f"ohe_{i}" for i in range(X_test_ohe.shape[1])])

    # Swap the raw categorical columns for their one-hot expansion.
    X_train = pd.concat([X_train.drop(columns=ohe_cols), X_train_ohe], axis=1)
    X_valid = pd.concat([X_valid.drop(columns=ohe_cols), X_valid_ohe], axis=1)
    X_test = pd.concat([X_test.drop(columns=ohe_cols), X_test_ohe], axis=1)

    # Training
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor',
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)

    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)

    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# Model Tuning + drop cat2
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold", "cat2")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    #model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# Model Tuning + drop cat2, cat6
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold", "cat2", "cat6")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    #model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

print('You need to reset dataframe!')

In [None]:
%%time
# Tuning + Standardization
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Preprocessing - Standardization
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_valid[num_cols] = scaler.transform(X_valid[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    #model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
%%time
# Only Model Tuning
useful_features = [col for col in train_data.columns if col not in ("id", "target", "kfold")]
cat_cols = [col for col in useful_features if "cat" in col]
num_cols = [col for col in useful_features if col.startswith('cont')]
test_data = test_data[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    # Preprocessing - Kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    # Preprocessing - Ordinal Encoding
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = ordinal_encoder.transform(X_valid[cat_cols])
    X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols]) # Q. The last transform
    
    # Training
    #model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    #model = XGBRegressor(random_state=fold, n_jobs=8)
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor', 
                         learning_rate=0.1, n_estimators=1000, max_depth=3, colsample_bytree=0.3)
    model.fit(X_train, y_train)
    
    # Evaluation
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Export submission.csv
# Stack the per-fold test predictions as columns, average them row-wise, and
# write the result alongside the ids taken from sample_submission.
fold_matrix = np.column_stack(final_predictions)
preds = pd.DataFrame({'id': sample_submission.id, 'target': fold_matrix.mean(axis=1)})
preds.to_csv('submission.csv', index=False)