# 2nd round of model spot-check.
We start with the models selected in 1st round a do some basic hyperparameter-tuning on each.

My aim is to keep the best-performing model(s) for a final tuning.

In this round I does not change the preprocessing used in the 1st round.

In [21]:
import numpy as np
import pandas as pd

# pipeline utilities
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# preprocessing
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder # tuning: PolynomialFeatures
from sklearn.metrics import make_scorer, mean_squared_error

# tuning
from sklearn.model_selection import RandomizedSearchCV #, train_test_split
from scipy.stats import loguniform

# validation
from sklearn.model_selection import KFold, cross_val_score

# models
from sklearn.linear_model import ElasticNet, ElasticNetCV, BayesianRidge, ARDRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 

In [3]:
# Load data
df = pd.read_csv("../input/30-days-of-ml/train.csv", index_col=0)
df

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.400361,0.160266,0.310921,0.389470,0.267559,0.237281,0.377873,0.322401,0.869850,8.113634
2,B,B,A,A,B,D,A,F,A,O,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
3,A,A,A,C,B,D,A,D,A,F,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
4,B,B,A,C,B,D,A,E,C,K,...,0.668980,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253
6,A,A,A,C,B,D,A,E,A,N,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499993,B,B,A,A,B,D,A,E,A,I,...,0.769792,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605
499996,A,B,A,C,B,B,A,E,E,F,...,0.528056,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118
499997,B,B,A,C,B,C,A,E,G,F,...,0.688747,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755
499998,A,B,A,C,B,B,A,E,E,I,...,0.344404,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569


In [6]:
# work on a random sample to speed-up process
df_sample = df.sample(50000, random_state=123)
X = df_sample.drop(['target'], axis=1)
y = df_sample['target']

# X = df.drop(['target'], axis=1)
# y = df['target']
print(X.shape)
print(y.shape)

# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=123)
# print(X_train.shape)
# print(X_val.shape)
# print(y_train.shape)
# print(y_val.shape)

(50000, 24)
(50000,)
(37500, 24)
(12500, 24)
(37500,)
(12500,)


In [7]:
# Select numerical features
numFeatures = X.select_dtypes(exclude=['object']).columns
# Select Categorical features
catFeatures = X.select_dtypes(include=['object']).columns
print(numFeatures)
print(catFeatures)

Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')
Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9'],
      dtype='object')


In [31]:
# Pipeline for transforming categorical variables
catTransformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for scaling numerical variables
numTransformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('power_transformer', PowerTransformer()),
])

# Create the preprocessing engine
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTransformer, numFeatures),
        ('categoric', catTransformer, catFeatures),
    ]
)

## ElasticNet

In [42]:
model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .15, .50, .75, .95, .99, 1], n_alphas=500, cv=5, random_state=123, verbose=0)

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

estimator.fit(X, y)
m = estimator.named_steps['model']
print(f'alpha: {m.alpha_}')
print(f'l1_ratio: {m.l1_ratio_}')

alpha: 0.005706722247950045
l1_ratio: 0.15


In [9]:
# test result parameters with ElasticNet and cross_val_score
model = ElasticNet(alpha=0.00570672, l1_ratio=0.15)
scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)
results = cross_val_score(estimator, X, y, cv=kf, scoring=scorer)
print(f"model score: {abs(results.mean()):.4f}")

model score: 0.5444


## BayesianRidge

In [14]:
model = BayesianRidge(n_iter=1000)

param_distributions = {
    'model__alpha_1': loguniform(1e-8, 1e-4),
    'model__alpha_2': loguniform(1e-8, 1e-4),
    'model__lambda_1': loguniform(1e-8, 1e-4),
    'model__lambda_2': loguniform(1e-8, 1e-4),
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

search_br = RandomizedSearchCV(
    estimator=estimator, 
    param_distributions=param_distributions,
    n_iter=20,
    scoring=scorer,
    cv=kf,
    random_state=123,
    verbose=2
)
search_br.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   5.5s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   5.4s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   5.2s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   3.9s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   3.4s
[CV] END model__alpha_1=7.54

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('power_transformer',
                                                                                                PowerTransformer())]),
                                                                               Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'co...
                                        'model__alpha_2': <scipy.stats._distn_infrastructure.rv_frozen objec

In [18]:
print("Best params")
print(search_br.best_params_)
print("\nScore")
print(f'{search_br.best_score_:.4f}')

Best params
{'model__alpha_1': 7.758424358283755e-06, 'model__alpha_2': 1.9581035528392234e-07, 'model__lambda_1': 2.7999780223990073e-07, 'model__lambda_2': 8.185645330667258e-08}

Score
-0.5445


## ARDRegression

In [19]:
model = ARDRegression(n_iter=1000)

param_distributions = {
    'model__alpha_1': loguniform(1e-8, 1e-4),
    'model__alpha_2': loguniform(1e-8, 1e-4),
    'model__lambda_1': loguniform(1e-8, 1e-4),
    'model__lambda_2': loguniform(1e-8, 1e-4),
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

search_ardr = RandomizedSearchCV(
    estimator=estimator, 
    param_distributions=param_distributions,
    n_iter=20,
    scoring=scorer,
    cv=kf,
    random_state=123,
    verbose=2
)
search_ardr.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   4.8s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   7.1s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   6.4s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   5.2s
[CV] END model__alpha_1=6.107686573608479e-06, model__alpha_2=1.39494581986112e-07, model__lambda_1=8.079896808319466e-08, model__lambda_2=1.6042020845182473e-06; total time=   5.4s
[CV] END model__alpha_1=7.54

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('power_transformer',
                                                                                                PowerTransformer())]),
                                                                               Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'co...
                                        'model__alpha_2': <scipy.stats._distn_infrastructure.rv_frozen objec

In [20]:
print("Best params")
print(search_ardr.best_params_)
print("\nScore")
print(f'{search_ardr.best_score_:.4f}')

Best params
{'model__alpha_1': 5.982625838323249e-08, 'model__alpha_2': 1.9490717640641528e-06, 'model__lambda_1': 2.4146270773370442e-08, 'model__lambda_2': 3.477821637156514e-05}

Score
-0.5447


## Support Vector Regressor with linear kernel

In [34]:
model = LinearSVR(dual=False, loss='squared_epsilon_insensitive', random_state=123)

param_distributions = {
    'model__epsilon': [0, 0.01, 0.1, 0.5, 1, 2],
    'model__C': [1, 10, 100, 1000, 10000],
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

search_lsvr = RandomizedSearchCV(
    estimator=estimator, 
    param_distributions=param_distributions,
    n_iter=20,
    scoring=scorer,
    cv=kf,
    random_state=123,
    verbose=2
)
search_lsvr.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ...................model__C=10, model__epsilon=0.01; total time=   3.4s
[CV] END ...................model__C=10, model__epsilon=0.01; total time=   2.7s
[CV] END ...................model__C=10, model__epsilon=0.01; total time=   3.0s
[CV] END ...................model__C=10, model__epsilon=0.01; total time=   2.7s
[CV] END ...................model__C=10, model__epsilon=0.01; total time=   2.6s
[CV] END ...................model__C=10000, model__epsilon=2; total time=   2.5s
[CV] END ...................model__C=10000, model__epsilon=2; total time=   2.4s
[CV] END ...................model__C=10000, model__epsilon=2; total time=   2.5s
[CV] END ...................model__C=10000, model__epsilon=2; total time=   2.4s
[CV] END ...................model__C=10000, model__epsilon=2; total time=   3.0s
[CV] END .......................model__C=1, model__epsilon=2; total time=   2.3s
[CV] END .......................model__C=1, mod

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('power_transformer',
                                                                                                PowerTransformer())]),
                                                                               Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'co...
                                                                               Index(['cat0', 'cat1', 'cat2'

In [35]:
print("Best params")
print(search_lsvr.best_params_)
print("\nScore")
print(f'{search_lsvr.best_score_:.4f}')

Best params
{'model__epsilon': 0, 'model__C': 100}

Score
-0.5449


## RandomForestRegressor

In [38]:
model = RandomForestRegressor(random_state=123)

param_distributions = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [3, 5, 10, 15],
    'model__min_samples_leaf': [0.001, 0.01, 0.1],
    'model__max_features': ['sqrt', 'log2'],
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

search_rfr = RandomizedSearchCV(
    estimator=estimator, 
    param_distributions=param_distributions,
    n_iter=20,
    scoring=scorer,
    cv=kf,
    random_state=123,
    verbose=2
)
search_rfr.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END model__max_depth=15, model__max_features=log2, model__min_samples_leaf=0.001, model__n_estimators=100; total time=  20.9s
[CV] END model__max_depth=15, model__max_features=log2, model__min_samples_leaf=0.001, model__n_estimators=100; total time=  23.0s
[CV] END model__max_depth=15, model__max_features=log2, model__min_samples_leaf=0.001, model__n_estimators=100; total time=  30.5s
[CV] END model__max_depth=15, model__max_features=log2, model__min_samples_leaf=0.001, model__n_estimators=100; total time=  20.5s
[CV] END model__max_depth=15, model__max_features=log2, model__min_samples_leaf=0.001, model__n_estimators=100; total time=  20.5s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_leaf=0.1, model__n_estimators=500; total time=  32.5s
[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_leaf=0.1, model__n_estimators=500; total time=  37.6s
[CV] END model__max_depth=5

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('power_transformer',
                                                                                                PowerTransformer())]),
                                                                               Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'co...
       'cat9'],
      dtype='object'))])),
                                             ('model',
          

In [39]:
print("Best params")
print(search_rfr.best_params_)
print("\nScore")
print(f'{search_rfr.best_score_:.4f}')

Best params
{'model__n_estimators': 100, 'model__min_samples_leaf': 0.001, 'model__max_features': 'sqrt', 'model__max_depth': 15}

Score
-0.5396


## GradientBoostingRegressor

In [42]:
model = GradientBoostingRegressor(random_state=123)

param_distributions = {
    'model__loss': ['ls', 'lad', 'huber', 'quantile'],
    'model__learning_rate': [0.01, 0.1],
    'model__n_estimators': [100, 300, 500],
    'model__subsample': [0.5, 0.8, 1],
    'model__max_depth': [3, 7, 12],
    'model__min_samples_leaf': [0.001, 0.01, 0.1],
    'model__max_features': ['sqrt', 'log2'],
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

search_gbr = RandomizedSearchCV(
    estimator=estimator, 
    param_distributions=param_distributions,
    n_iter=20,
    scoring=scorer,
    cv=kf,
    random_state=123,
    verbose=2
)
search_gbr.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END model__learning_rate=0.01, model__loss=quantile, model__max_depth=3, model__max_features=sqrt, model__min_samples_leaf=0.01, model__n_estimators=300, model__subsample=0.8; total time=  35.8s
[CV] END model__learning_rate=0.01, model__loss=quantile, model__max_depth=3, model__max_features=sqrt, model__min_samples_leaf=0.01, model__n_estimators=300, model__subsample=0.8; total time=  33.4s
[CV] END model__learning_rate=0.01, model__loss=quantile, model__max_depth=3, model__max_features=sqrt, model__min_samples_leaf=0.01, model__n_estimators=300, model__subsample=0.8; total time=  32.6s
[CV] END model__learning_rate=0.01, model__loss=quantile, model__max_depth=3, model__max_features=sqrt, model__min_samples_leaf=0.01, model__n_estimators=300, model__subsample=0.8; total time=  37.5s
[CV] END model__learning_rate=0.01, model__loss=quantile, model__max_depth=3, model__max_features=sqrt, model__min_samples_leaf=0.01, mode

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('power_transformer',
                                                                                                PowerTransformer())]),
                                                                               Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'co...
                   param_distributions={'model__learning_rate': [0.01, 0.1],
                               

In [43]:
print("Best params")
print(search_gbr.best_params_)
print("\nScore")
print(f'{search_gbr.best_score_:.4f}')

Best params
{'model__subsample': 1, 'model__n_estimators': 300, 'model__min_samples_leaf': 0.001, 'model__max_features': 'log2', 'model__max_depth': 3, 'model__loss': 'huber', 'model__learning_rate': 0.1}

Score
-0.5327


## XGBRegressor

In [47]:
model = XGBRegressor(random_state=123)

param_distributions = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [3, 7, 12],
    'model__learning_rate': [0.01, 0.1, 0.5, 0.9],    
    'model__subsample': [0.3, 0.5, 0.9],
    'model__colsample_bytree': [0.1, 0.3, 0.6, 1],
    'model__booster': ['gbtree'],
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5)

estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

search_xgbr = RandomizedSearchCV(
    estimator=estimator, 
    param_distributions=param_distributions,
    n_iter=20,
    scoring=scorer,
    cv=kf,
    random_state=123,
    verbose=2
)
search_xgbr.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END model__booster=gbtree, model__colsample_bytree=0.1, model__learning_rate=0.01, model__max_depth=7, model__n_estimators=300, model__subsample=0.5; total time=  25.7s
[CV] END model__booster=gbtree, model__colsample_bytree=0.1, model__learning_rate=0.01, model__max_depth=7, model__n_estimators=300, model__subsample=0.5; total time=  25.1s
[CV] END model__booster=gbtree, model__colsample_bytree=0.1, model__learning_rate=0.01, model__max_depth=7, model__n_estimators=300, model__subsample=0.5; total time=  23.2s
[CV] END model__booster=gbtree, model__colsample_bytree=0.1, model__learning_rate=0.01, model__max_depth=7, model__n_estimators=300, model__subsample=0.5; total time=  23.6s
[CV] END model__booster=gbtree, model__colsample_bytree=0.1, model__learning_rate=0.01, model__max_depth=7, model__n_estimators=300, model__subsample=0.5; total time=  23.0s
[CV] END model__booster=gbtree, model__colsample_bytree=0.6, model__

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('power_transformer',
                                                                                                PowerTransformer())]),
                                                                               Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'co...
                                                           verbosity=None))]),
                   n_iter=20,

In [48]:
print("Best params")
print(search_xgbr.best_params_)
print("\nScore")
print(f'{search_xgbr.best_score_:.4f}')

Best params
{'model__subsample': 0.3, 'model__n_estimators': 500, 'model__max_depth': 3, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.6, 'model__booster': 'gbtree'}

Score
-0.5307


|model                      | score
|-----------------------    |--------
|XGBRegressor               | 0.5307
|GradientBoostingRegressor  | 0.5327
|RandomForestRegressor      | 0.5396
|ElasticNet                 | 0.5444
|BayesianRidge              | 0.5445
|ARDRegression              | 0.5447
|LinearSVR                  | 0.5449

We keep XGBoost for further tuning.

## Make submission using winner of this round: XGBRegressor

In [49]:
# load test data
test_df = pd.read_csv("../input/30-days-of-ml/test.csv", index_col=0)
# split dataset
X_train = df.drop(['target'], axis=1)
y_train = df['target']
X_test = test_df
# train model
model_params = {
    'subsample': 0.3, 
    'n_estimators': 500, 
    'max_depth': 3, 
    'learning_rate': 0.1, 
    'colsample_bytree': 0.6, 
    'booster': 'gbtree'
}
model = XGBRegressor(random_state=123, **model_params)
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)
estimator.fit(X_train, y_train)
# predict
y_pred = estimator.predict(X_test)
# save the predictions to a CSV file
output = pd.DataFrame({'Id': X_test.index,
                       'target': y_pred})
output.to_csv('submission.csv', index=False)

Parameters: { "model__booster", "model__colsample_bytree", "model__learning_rate", "model__max_depth", "model__n_estimators", "model__subsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




KeyboardInterrupt: 