In [18]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
import pandas as pd

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample = pd.read_csv('../input/sample_submission.csv')

In [4]:
from sklearn import model_selection

if __name__ == "__main__":
    df = train
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    kf = model_selection.KFold(n_splits=5)
    for fold, (trn_, val_) in enumerate(kf.split(X=df)):
        df.loc[val_, "kfold"] = fold
    df.to_csv("../input/train_folds.csv", index=False)

In [7]:
import xgboost as xgb
import optuna
from optuna import Trial
from sklearn import metrics

In [25]:
def fit_xgb(trial, xtrain, ytrain, xvalid, yvalid):
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [150,200,250,300]),
        "subsample": trial.suggest_discrete_uniform("subsample", 0.6,1,0.1),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree", 0.6,1,0.1),
        "eta": trial.suggest_loguniform("eta",1e-2,0.1),
        "gamma": trial.suggest_loguniform("gamma",0.05,1),
        "max_depth": trial.suggest_categorical("max_depth",[5,7,9,11,13]),
        "min_child_weight": trial.suggest_int("min_child_weight",5,11),
        "random_state": 0,
    }

    model = xgb.XGBRegressor(**params)
    model.fit(xtrain, ytrain.reshape(-1,))
    
    y_valid_pred = model.predict(xvalid)
    
    log = {
        "train_rmse": metrics.mean_squared_error(
            ytrain, 
            model.predict(xtrain), 
            squared=False,
        ),
        "valid_rmse": metrics.mean_squared_error(
            yvalid, 
            y_valid_pred, 
            squared=False,
        ),
    }
    
    return model, log

In [50]:
def objective(trial):
    df = pd.read_csv("../input/train_folds.csv")
    rmse = 0
    for fold in range(5):
        df_train = df[df.kfold != fold].reset_index(drop=True)
        df_valid = df[df.kfold == fold].reset_index(drop=True)
    
        xtrain = df_train.drop(["id", "target", "kfold"], axis=1).values
        ytrain = df_train["target"].values
        xvalid = df_valid.drop(["id", "target", "kfold"], axis=1).values
        yvalid = df_valid["target"].values
        
        model, log = fit_xgb(trial, xtrain, ytrain, xvalid, yvalid)
        rmse += log["valid_rmse"]/5
        
    return rmse

In [51]:
study = optuna.create_study(
    direction="minimize",
    study_name="xgboost optimization",
)
study.optimize(
    objective, 
    n_trials=5,
)

[32m[I 2021-01-31 07:48:25,881][0m A new study created in memory with name: xgboost optimization[0m
[32m[I 2021-01-31 07:52:11,069][0m Trial 0 finished with value: 0.7030742873732294 and parameters: {'n_estimators': 250, 'subsample': 0.6, 'colsample_bytree': 0.7, 'eta': 0.06642891619379418, 'gamma': 0.9744323589241212, 'max_depth': 13, 'min_child_weight': 11}. Best is trial 0 with value: 0.7030742873732294.[0m
[32m[I 2021-01-31 07:54:58,637][0m Trial 1 finished with value: 0.7017351571788163 and parameters: {'n_estimators': 200, 'subsample': 0.6, 'colsample_bytree': 1.0, 'eta': 0.06872193183222874, 'gamma': 0.2954236553191469, 'max_depth': 11, 'min_child_weight': 9}. Best is trial 1 with value: 0.7017351571788163.[0m
[32m[I 2021-01-31 07:57:06,512][0m Trial 2 finished with value: 0.7005584079729205 and parameters: {'n_estimators': 200, 'subsample': 1.0, 'colsample_bytree': 0.7, 'eta': 0.08416026958022642, 'gamma': 0.07269558322171425, 'max_depth': 7, 'min_child_weight': 10}.

In [52]:
history = study.trials_dataframe()
history.sort_values(by="value", ascending=True)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_eta,params_gamma,params_max_depth,params_min_child_weight,params_n_estimators,params_subsample,state
4,4,0.699326,2021-01-31 07:58:16.329139,2021-01-31 08:02:29.001764,00:04:12.672625,0.8,0.039579,0.156162,9,10,300,1.0,COMPLETE
2,2,0.700558,2021-01-31 07:54:58.637890,2021-01-31 07:57:06.511855,00:02:07.873965,0.7,0.08416,0.072696,7,10,200,1.0,COMPLETE
1,1,0.701735,2021-01-31 07:52:11.070449,2021-01-31 07:54:58.636781,00:02:47.566332,1.0,0.068722,0.295424,11,9,200,0.6,COMPLETE
0,0,0.703074,2021-01-31 07:48:25.882488,2021-01-31 07:52:11.069266,00:03:45.186778,0.7,0.066429,0.974432,13,11,250,0.6,COMPLETE
3,3,0.706707,2021-01-31 07:57:06.512990,2021-01-31 07:58:16.327929,00:01:09.814939,0.9,0.052293,0.06407,5,6,150,0.9,COMPLETE


In [53]:
study.best_params

{'n_estimators': 300,
 'subsample': 1.0,
 'colsample_bytree': 0.8,
 'eta': 0.03957918605044537,
 'gamma': 0.15616223333330972,
 'max_depth': 9,
 'min_child_weight': 10}

In [54]:
def run(fold):
    df = pd.read_csv("../input/train_folds.csv")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    xtrain = df_train.drop(["id", "target", "kfold"], axis=1).values
    ytrain = df_train["target"].values
    xvalid = df_valid.drop(["id", "target", "kfold"], axis=1).values
    yvalid = df_valid["target"].values
    xtest = test.drop(["id"], axis=1).values
    
    model = xgb.XGBRegressor(**(study.best_params))
    model.fit(xtrain, ytrain)
    y_valid_pred = model.predict(xvalid)
    rmse = metrics.mean_squared_error(
        yvalid, 
        y_valid_pred, 
        squared=False,
    )
    print(f"{fold}, {rmse}")
    
    preds = model.predict(xtest)
    
    preds_df = pd.DataFrame()
    preds_df["id"] = test.loc[:,"id"]
    preds_df["target"] = preds
    print(preds_df.shape)
    print(preds_df.head(3))

if __name__ == "__main__":
    for j in range(5):
        run(j)

0, 0.7004395880221562
(200000, 2)
   id    target
0   0  7.913198
1   2  7.848672
2   6  7.905887
1, 0.6983342455275313
(200000, 2)
   id    target
0   0  8.011919
1   2  7.863559
2   6  7.909718
2, 0.7003803751488813
(200000, 2)
   id    target
0   0  7.934600
1   2  7.861133
2   6  7.950120
3, 0.7008556299591323
(200000, 2)
   id    target
0   0  7.933747
1   2  7.836162
2   6  7.933005
4, 0.6966185831272997
(200000, 2)
   id    target
0   0  8.010509
1   2  7.876707
2   6  7.934344


In [None]:
### end of code ###