In [9]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from tqdm import tqdm
import optuna

from lightgbm import LGBMRegressor
import gc

In [10]:
# Read the competition's train and test tables; the first CSV column becomes the row index
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/tabular-playground-series-nov-2021/train.csv",
        "../input/tabular-playground-series-nov-2021/test.csv",
    )
)

# Display the first rows of the training table
train.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [11]:
# Separate the label column from the predictor columns
features = train.drop(columns=['target'])
y = train['target']

# Work on a copy so that `features` itself is left untouched
X = features.copy()
X.shape

(600000, 100)

In [12]:
def create_pipeline(**kvargs):
    """Build an unfitted gradient-boosting regressor of the requested kind.

    The keyword ``typ`` (required; a missing key raises ``KeyError``) selects
    the library and is removed from the keyword arguments; everything else is
    forwarded to the estimator's constructor.

    * ``'xgboost'`` -> ``XGBRegressor`` with ``n_jobs=3``, GPU histogram tree
      method on device 0 and ``eval_metric="auc"``.
    * ``'lgbm'``    -> ``LGBMRegressor`` with ``n_jobs=3``.
    * anything else -> ``CatBoostRegressor`` with only the forwarded arguments.

    Despite the name, no preprocessing pipeline is wrapped around the
    estimator: the bare model is returned.
    """
    model_kind = kvargs.pop('typ')

    if model_kind == 'xgboost':
        return XGBRegressor(
            **kvargs,
            n_jobs=3,
            tree_method='gpu_hist',
            gpu_id=0,
            eval_metric="auc",
        )

    if model_kind == 'lgbm':
        return LGBMRegressor(**kvargs, n_jobs=3)

    # Every other value falls back to CatBoost.
    return CatBoostRegressor(**kvargs)

In [13]:
def objective(trial, X, y, typ, n_splits=10, stopping_rounds=400, random_state=42):
    """Optuna objective: mean validation ROC AUC over a shuffled K-fold split.

    Hyper-parameters are sampled from ``trial`` (the search space depends on
    ``typ``), one model per fold is built with ``create_pipeline`` and fitted
    with early stopping on that fold's validation part, and the ROC AUC of the
    validation predictions is averaged over the folds.

    Parameters
    ----------
    trial : optuna trial
        Source of the sampled hyper-parameters (``suggest_float`` /
        ``suggest_int``).
    X, y : objects supporting ``.iloc`` row selection
        Features and target; ``y`` is scored with ``roc_auc_score``.
    typ : str
        ``'xgboost'`` or ``'lgbm'``; any other value uses the CatBoost search
        space, matching the fallback in ``create_pipeline``.
    n_splits : int, default 10
        Number of cross-validation folds.
    stopping_rounds : int, default 400
        Early-stopping patience passed to ``fit``.
    random_state : int or None, default 42
        Seed for the fold shuffle. A fixed seed makes every trial see the same
        folds, so trial scores are directly comparable and runs are
        reproducible. Pass ``None`` for a fresh random split on every call.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC values (to be maximised).
    """
    # Parameter names and ranges are kept exactly as before so that trials
    # stored in an already existing study remain compatible.
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.99)
    params = {'typ': typ}
    if typ == 'xgboost':
        params['booster'] = 'gbtree'
        params['n_estimators'] = trial.suggest_int("n_estimators", 5000, 20000)
        params['gamma'] = trial.suggest_float('gamma', 0, 100)
        params['max_depth'] = trial.suggest_int('max_depth', 1, 20)
        params['min_child_weight'] = trial.suggest_float('min_child_weight', 0, 100)
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
        params['lambda'] = trial.suggest_float('lambda', 1, 5)
        params['alpha'] = trial.suggest_float('alpha', 0, 10)
        fit_verbose = False
    elif typ == 'lgbm':
        params['n_estimators'] = trial.suggest_int("n_estimators", 500, 5000)
        params['max_depth'] = trial.suggest_int('max_depth', 2, 10)
        params['num_leaves'] = trial.suggest_int('num_leaves', 2, 100)
        params['reg_alpha'] = trial.suggest_float('reg_alpha', 0, 10)
        params['reg_lambda'] = trial.suggest_float('reg_lambda', 0, 10)
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 50, 1000)
        fit_verbose = -1
    else:
        params['iterations'] = trial.suggest_int("iterations", 500, 5000)
        params['depth'] = trial.suggest_int("depth", 3, 15)
        params['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 0.01, 100)
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
        # Silence the fallback model for every typ that lands in this branch,
        # not only for the literal string 'catboost'.
        fit_verbose = 0

    fold_scores = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # `total` lets tqdm draw a real progress bar; split() is a generator.
    for trn_idx, val_idx in tqdm(kf.split(X, y), total=n_splits):
        x_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        x_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        model = create_pipeline(learning_rate=learning_rate, **params)
        # NOTE(review): the validation fold drives early stopping and is also
        # the fold being scored, so the reported AUC is somewhat optimistic.
        model.fit(
            x_train,
            y_train,
            early_stopping_rounds=stopping_rounds,
            eval_set=[(x_valid, y_valid)],
            verbose=fit_verbose,
        )
        fold_scores.append(roc_auc_score(y_valid, model.predict(x_valid)))

    return sum(fold_scores) / len(fold_scores)

In [14]:
# Open the Optuna study for the XGBoost search. Trials are stored through a
# SQLite URL built from the study name, and a study that already exists under
# this name is loaded instead of being created again. The direction is
# "maximize" because the objective returns a mean ROC AUC.
study_name = 'tps-nov1-xgboost'
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///' + study_name,
    direction="maximize",
    load_if_exists=True,
)

[32m[I 2021-11-02 20:53:12,361][0m Using an existing study with name 'tps-nov1-xgboost' instead of creating a new one.[0m


In [15]:
# Show the best trial currently recorded in the study.
print(study.best_trial)

FrozenTrial(number=10, values=[0.7454465657544245], datetime_start=datetime.datetime(2021, 11, 2, 20, 13, 32, 443646), datetime_complete=datetime.datetime(2021, 11, 2, 20, 27, 25, 903705), params={'alpha': 6.482393802888533, 'gamma': 1.8562932390258453, 'lambda': 4.967845354444342, 'learning_rate': 0.012098807842026327, 'max_depth': 8, 'min_child_weight': 3.6969211150894807, 'n_estimators': 8506, 'subsample': 0.889127343194831}, distributions={'alpha': UniformDistribution(high=10.0, low=0.0), 'gamma': UniformDistribution(high=100.0, low=0.0), 'lambda': UniformDistribution(high=5.0, low=1.0), 'learning_rate': UniformDistribution(high=0.99, low=0.001), 'max_depth': IntUniformDistribution(high=20, low=1, step=1), 'min_child_weight': UniformDistribution(high=100.0, low=0.0), 'n_estimators': IntUniformDistribution(high=20000, low=5000, step=1), 'subsample': UniformDistribution(high=1.0, low=0.1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=11, state=TrialState.COMPLETE

In [17]:
# Run 100 more trials of the XGBoost search on the full training data, then
# report the best trial found in the study.
study.optimize(
    lambda trial: objective(trial, X, y, typ="xgboost"),
    n_trials=100,
)
print(study.best_trial)

10it [25:07, 150.77s/it]
[32m[I 2021-11-02 22:05:53,269][0m Trial 15 finished with value: 0.7464657422654262 and parameters: {'learning_rate': 0.011870884764912487, 'n_estimators': 14978, 'gamma': 0.6819548681630319, 'max_depth': 8, 'min_child_weight': 2.1684000269261805, 'subsample': 0.7939542781013894, 'lambda': 4.936709154436781, 'alpha': 5.484807051208907}. Best is trial 15 with value: 0.7464657422654262.[0m
10it [02:22, 14.23s/it]
[32m[I 2021-11-02 22:08:15,728][0m Trial 16 finished with value: 0.7349340912979542 and parameters: {'learning_rate': 0.15008704336699233, 'n_estimators': 15207, 'gamma': 13.250541895241614, 'max_depth': 7, 'min_child_weight': 13.732948809790058, 'subsample': 0.7873421980126064, 'lambda': 1.6867728569171985, 'alpha': 2.959534907761798}. Best is trial 15 with value: 0.7464657422654262.[0m
10it [02:37, 15.71s/it]
[32m[I 2021-11-02 22:10:52,980][0m Trial 17 finished with value: 0.7232875198179161 and parameters: {'learning_rate': 0.03037824214348290

FrozenTrial(number=73, values=[0.7474571691653014], datetime_start=datetime.datetime(2021, 11, 3, 3, 28, 38, 384132), datetime_complete=datetime.datetime(2021, 11, 3, 3, 34, 56, 356600), params={'alpha': 6.585083548901454, 'gamma': 0.010815026816252349, 'lambda': 3.7699082658342555, 'learning_rate': 0.05935377531038842, 'max_depth': 1, 'min_child_weight': 44.52085850052923, 'n_estimators': 11303, 'subsample': 0.9220131617216115}, distributions={'alpha': UniformDistribution(high=10.0, low=0.0), 'gamma': UniformDistribution(high=100.0, low=0.0), 'lambda': UniformDistribution(high=5.0, low=1.0), 'learning_rate': UniformDistribution(high=0.99, low=0.001), 'max_depth': IntUniformDistribution(high=20, low=1, step=1), 'min_child_weight': UniformDistribution(high=100.0, low=0.0), 'n_estimators': IntUniformDistribution(high=20000, low=5000, step=1), 'subsample': UniformDistribution(high=1.0, low=0.1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=74, state=TrialState.COMPLET

In [None]:
# Print the current local date and time.
from datetime import datetime
print(datetime.now())