In [5]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from tqdm import tqdm
import optuna

from lightgbm import LGBMRegressor

In [6]:
# Read the competition CSVs; column 0 (the row id) becomes the index of each frame
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
    )
)

# Show the first rows of the training frame
train.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,1,0,0,0,0,0,0,0,1
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,1,0,0,0,0,0,0,0,1
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,0,0,1,1,0,0,0,0,1
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0,0,0,1,0,0,0,0,1
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0,1,1,0,1,0,0,1,0,1


In [7]:
# Separate the predictors from the label column
features = train.drop(columns=['target'])
y = train['target']

# Tune on a copy so that `features` itself is never modified
X = features.copy()
X.shape

(1000000, 285)

In [8]:
def create_pipeline(**kvargs):
    """Build an unfitted gradient-boosting regressor chosen by the ``typ`` keyword.

    ``typ`` is removed from the keyword arguments (a missing ``typ`` raises
    ``KeyError``); everything else is forwarded unchanged to the model
    constructor.

    * ``typ == 'xgboost'`` -> ``XGBRegressor`` (GPU histogram tree method, AUC eval metric)
    * ``typ == 'lgbm'``    -> ``LGBMRegressor`` (GPU device, AUC metric)
    * any other value      -> ``CatBoostRegressor`` (GPU task type, RMSE loss)

    Despite the name, the bare model is returned; no preprocessing pipeline
    is wrapped around it.
    """
    model_kind = kvargs.pop('typ')

    if model_kind == 'xgboost':
        return XGBRegressor(**kvargs, n_jobs=3, tree_method='gpu_hist', gpu_id=0, eval_metric="auc")

    if model_kind == 'lgbm':
        return LGBMRegressor(**kvargs, n_jobs=3, device='gpu', metric="auc")

    # Every other value falls through to CatBoost
    return CatBoostRegressor(**kvargs, task_type="GPU", loss_function="RMSE")

In [9]:
def objective(trial, X, y, typ, n_splits=5, stopping_rounds=400, random_state=None):
    """Optuna objective: mean K-fold ROC AUC of a boosted model with sampled hyper-parameters.

    Parameters
    ----------
    trial
        Object providing ``suggest_float`` / ``suggest_int`` (an Optuna trial
        in this notebook).
    X, y
        Features and target; both are sliced positionally with ``.iloc``.
    typ : str
        ``'xgboost'`` or ``'lgbm'``; any other value is treated as CatBoost,
        matching the fallback in ``create_pipeline``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    stopping_rounds : int, default 400
        Passed to ``fit`` as ``early_stopping_rounds``.
    random_state : int or None, default None
        Seed for the K-fold shuffle. ``None`` keeps the previous behaviour
        (a different split on every call); pass an int to make trials
        comparable on identical folds.

    Returns
    -------
    float
        Average of the per-fold ``roc_auc_score`` values.

    Notes
    -----
    Each validation fold is used both for early stopping (``eval_set``) and
    for scoring, so the returned AUC is optimistically biased.
    """
    learning_rate = trial.suggest_float("learning_rate", 0.03, 0.5)

    # Search space per model family; `fit_verbose` is the "silent" value each
    # library expects for the `verbose` argument of `fit`.
    params = {'typ': typ}
    if typ == 'xgboost':
        params['booster'] = 'gbtree'
        params['n_estimators'] = trial.suggest_int("n_estimators", 5000, 20000)
        params['gamma'] = trial.suggest_float('gamma', 0, 100)
        params['max_depth'] = trial.suggest_int('max_depth', 1, 10)
        params['min_child_weight'] = trial.suggest_float('min_child_weight', 0, 10)
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
        params['lambda'] = trial.suggest_float('lambda', 1, 5)
        params['alpha'] = trial.suggest_float('alpha', 0, 10)
        fit_verbose = False
    elif typ == 'lgbm':
        params['n_estimators'] = trial.suggest_int("n_estimators", 500, 5000)
        params['max_depth'] = trial.suggest_int('max_depth', 2, 10)
        params['num_leaves'] = trial.suggest_int('num_leaves', 2, 100)
        params['reg_alpha'] = trial.suggest_float('reg_alpha', 0, 10)
        params['reg_lambda'] = trial.suggest_float('reg_lambda', 0, 10)
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 50, 1000)
        fit_verbose = -1
    else:
        # CatBoost: same fallback rule as create_pipeline, so the verbosity
        # flag is chosen here rather than by a separate `typ == 'catboost'` test.
        params['iterations'] = trial.suggest_int("iterations", 500, 5000)
        params['depth'] = trial.suggest_int("depth", 3, 15)
        params['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 0.01, 100)
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
        fit_verbose = 0

    fold_scores = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for trn_idx, val_idx in tqdm(kf.split(X, y), total=n_splits):
        x_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        x_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        # A fresh model per fold; create_pipeline pops 'typ' from its own
        # kwargs copy, so `params` is left intact for the next fold.
        model = create_pipeline(learning_rate=learning_rate, **params)
        model.fit(
            x_train,
            y_train,
            early_stopping_rounds=stopping_rounds,
            eval_set=[(x_valid, y_valid)],
            verbose=fit_verbose,
        )
        fold_scores.append(roc_auc_score(y_valid, model.predict(x_valid)))

    return sum(fold_scores) / len(fold_scores)

In [10]:
# The study name is also the stem of the SQLite file that stores the trials.
study_name = 'tps-oct3-catboost'

# load_if_exists=True reuses a study of this name already present in the
# storage instead of raising; direction="maximize" because the objective
# returns an AUC.
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///' + study_name + '.db',
    direction="maximize",
    load_if_exists=True,
)

[32m[I 2021-10-25 22:41:53,812][0m Using an existing study with name 'tps-oct3-catboost' instead of creating a new one.[0m


In [15]:
# Show the best trial recorded so far in the (possibly pre-existing) study.
# NOTE(review): presumably this fails when the loaded study has no completed
# trial yet (e.g. a brand-new .db file) — confirm before relying on Run All.
print(study.best_trial)

FrozenTrial(number=4, values=[0.8559625967288815], datetime_start=datetime.datetime(2021, 10, 23, 18, 38, 48, 939141), datetime_complete=datetime.datetime(2021, 10, 23, 18, 39, 54, 635582), params={'bagging_temperature': 1.8823650330102293, 'depth': 6, 'iterations': 4270, 'l2_leaf_reg': 30.370611316816202, 'learning_rate': 0.1443062034332685}, distributions={'bagging_temperature': UniformDistribution(high=10.0, low=0.0), 'depth': IntUniformDistribution(high=15, low=3, step=1), 'iterations': IntUniformDistribution(high=5000, low=500, step=1), 'l2_leaf_reg': UniformDistribution(high=100.0, low=0.01), 'learning_rate': UniformDistribution(high=0.5, low=0.03)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=5, state=TrialState.COMPLETE, value=None)


In [16]:
def catboost_objective(trial):
    """Bind the notebook-level X and y and the CatBoost model type to `objective`."""
    return objective(trial, X, y, "catboost")


# Add up to 100 trials to the study, then report the best one found
study.optimize(catboost_objective, n_trials=100)
print(study.best_trial)

5it [01:19, 15.97s/it]om logger is already specified. Specify more than one logger at same time is not thread safe.
[32m[I 2021-10-23 20:52:53,848][0m Trial 10 finished with value: 0.8555235937737173 and parameters: {'learning_rate': 0.03702505794236759, 'iterations': 1805, 'depth': 5, 'l2_leaf_reg': 22.93190966999958, 'bagging_temperature': 2.1402332674620075}. Best is trial 4 with value: 0.8559625967288815.[0m
5it [00:44,  8.95s/it]
[32m[I 2021-10-23 20:53:38,816][0m Trial 11 finished with value: 0.8554972169391195 and parameters: {'learning_rate': 0.36157599648187966, 'iterations': 4031, 'depth': 3, 'l2_leaf_reg': 27.565969596115437, 'bagging_temperature': 3.058242807083027}. Best is trial 4 with value: 0.8559625967288815.[0m
5it [04:33, 54.64s/it]
[32m[I 2021-10-23 20:58:12,162][0m Trial 12 finished with value: 0.8516086760744301 and parameters: {'learning_rate': 0.14984318143905795, 'iterations': 4953, 'depth': 7, 'l2_leaf_reg': 99.2393561805421, 'bagging_temperature': 9.6

KeyboardInterrupt: 