In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn import preprocessing
import optuna

# Step 2: Load the data

Next, we'll load the training and test data.  

We set `index_col=0` in the code cell below to use the `id` column to index the DataFrame.  (*If you're not sure how this works, try temporarily removing `index_col=0` and see how it changes the result.*)

In [2]:
# Load the training data
train = pd.read_csv("../input/train-kfolds/train-folds.csv")
test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission_data = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
# Preview the data
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,1
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,3
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,1
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,4


In [3]:
# Separate target from features
features = [col for col in train.columns if col not in ('id', 'target', 'kfold')]
object_cols = [col for col in features if 'cat' in col]

ordinal_encoder = OrdinalEncoder()

xtest = test[features]
xtest = xtest.copy()
xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])

In [4]:
'''def run(trial):
    #optimize in one fold
    fold = 0
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[features]
    xvalid = xvalid[features]

    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])

    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.8, log=True)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 0.6)
    max_depth = trial.suggest_int('max_depth', 1, 9)
    subsample = trial.suggest_float('subsample', 0.1, 0.6)
    reg_lambda = trial.suggest_float('reg_lambda', 1e-5, 100.0)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-5, 100.0)
    alpha = trial.suggest_int('alpha', 0, 100)

    model = XGBRegressor(random_state = 0, 
                         alpha=alpha,
                         n_estimators=200, 
                         tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor',
                         learning_rate = learning_rate,
                         colsample_bytree = colsample_bytree,
                         max_depth = max_depth,
                         subsample = subsample,
                         reg_lambda = reg_lambda,
                         reg_alpha = reg_alpha)

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    return rmse

study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=5000)
'''

'def run(trial):\n    #optimize in one fold\n    fold = 0\n    xtrain = train[train.kfold != fold].reset_index(drop=True)\n    xvalid = train[train.kfold == fold].reset_index(drop=True)\n\n    ytrain = xtrain.target\n    yvalid = xvalid.target\n\n    xtrain = xtrain[features]\n    xvalid = xvalid[features]\n\n    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])\n    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])\n\n    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.8, log=True)\n    colsample_bytree = trial.suggest_float(\'colsample_bytree\', 0.1, 0.6)\n    max_depth = trial.suggest_int(\'max_depth\', 1, 9)\n    subsample = trial.suggest_float(\'subsample\', 0.1, 0.6)\n    reg_lambda = trial.suggest_float(\'reg_lambda\', 1e-5, 100.0)\n    reg_alpha = trial.suggest_float(\'reg_alpha\', 1e-5, 100.0)\n    alpha = trial.suggest_int(\'alpha\', 0, 100)\n\n    model = XGBRegressor(random_state = 0, \n                         alpha

In [5]:
#study.best_params

In [6]:
final_preds = []
for fold in range(5):
    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    
    xtrain[object_cols]= ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    
    best_params = {'learning_rate': 0.34090767065203226,
                     'colsample_bytree': 0.12289350813119115,
                     'max_depth': 7,
                     'subsample': 0.5899332396539119,
                     'reg_lambda': 5.830490094721956,
                     'reg_alpha': 49.68136144185203,
                     'alpha': 30
                  }
    model = XGBRegressor(objective='reg:linear',colsample_bytree= 0.12289350813119115,
                         learning_rate=0.34090767065203226,max_depth=7, alpha=30,
                         n_estimators=200, 
                         reg_lambda=5.830490094721956,
                         reg_alpha=49.68136144185203,
                         #tree_method='gpu_hist',
                         #gpu_id=0, 
                         #predictor='gpu_predictor'
                        )

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)
    print(fold, mean_squared_error(yvalid, preds_valid, squared=False))

preds = np.mean(np.column_stack(final_preds), axis = 1)   
# Save predictions to a CSV file
submission_data.target = preds
submission_data.to_csv('submission.csv', index=False)



  "because it will generate extra copies and increase " +
  "because it will generate extra copies and increase " +


0 0.7193955701643651


  "because it will generate extra copies and increase " +


1 0.716597414342513


  "because it will generate extra copies and increase " +


2 0.7214180451512023


  "because it will generate extra copies and increase " +


3 0.7149565582076789


  "because it will generate extra copies and increase " +


4 0.7157135763204374
