In [1]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from tqdm import tqdm
import optuna

from lightgbm import LGBMRegressor

In [2]:
# Load the training data
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv", index_col=0)
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv", index_col=0)

# Preview the data
train.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [3]:
y = train['target']
features = train.drop(['target'], axis=1)

X = features.copy()
X.shape

(600000, 100)

In [4]:
def create_pipeline(**kvargs):
    typ = kvargs.pop('typ')
    if typ == 'xgboost':
        model = XGBRegressor(**kvargs, n_jobs=3, tree_method='gpu_hist', gpu_id=0, eval_metric="auc")
    elif typ == 'lgbm':
        model = LGBMRegressor(**kvargs, n_jobs=3, device='gpu', metric = "auc")
    else:
        model = CatBoostRegressor(**kvargs, task_type="GPU", loss_function="RMSE")
    
    #pipeline = Pipeline(steps=[
    #    ('preprocessor', preprocessor),
    #    ('model', model)
    #])
    return model

In [5]:
def objective(trial, X, y, typ):
    #n_estimators = trial.suggest_int("n_estimators", 500, 5000)
    learning_rate = trial.suggest_float("learning_rate", 0.03, 0.5)
    stopping_rounds = 400
    params={'typ':typ}
    if typ == 'xgboost':
        params['booster']='gbtree'
        params['n_estimators'] = trial.suggest_int("n_estimators", 5000, 20000)
        params['gamma'] = trial.suggest_float('gamma', 0, 100)
        params['max_depth'] = trial.suggest_int('max_depth', 1, 10)
        params['min_child_weight']=trial.suggest_float('min_child_weight', 0, 10)
        params['subsample']=trial.suggest_float('subsample', 0.1, 1)
        params['lambda']=trial.suggest_float('lambda', 1, 5)
        params['alpha']=trial.suggest_float('alpha', 0, 10)
    elif typ=='lgbm':
        params['n_estimators'] = trial.suggest_int("n_estimators", 500, 5000)
        params['max_depth']=trial.suggest_int('max_depth', 2, 10)
        params['num_leaves']=trial.suggest_int('num_leaves', 2, 100)
        params['reg_alpha']=trial.suggest_float('reg_alpha', 0, 10)
        params['reg_lambda']=trial.suggest_float('reg_lambda', 0, 10)
        params['min_data_in_leaf']=trial.suggest_int('min_data_in_leaf', 50, 1000)
    else:
        params['iterations'] = trial.suggest_int("iterations", 500, 10000)
        params['depth'] = trial.suggest_int("depth", 3, 15)
        params['l2_leaf_reg']=trial.suggest_float('l2_leaf_reg', 0.01, 100)
        params['bagging_temperature']=trial.suggest_float('bagging_temperature', 0, 10)
    #score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    val_pred = np.zeros(len(y))

    mse = []
    spl = 10
    kf = KFold(n_splits=spl, shuffle=True)
    for trn_idx, val_idx in tqdm(kf.split(X,y)):
        x_train_idx = X.iloc[trn_idx]
        y_train_idx = y.iloc[trn_idx]
        x_valid_idx = X.iloc[val_idx]
        y_valid_idx = y.iloc[val_idx]

        model = create_pipeline(learning_rate=learning_rate, **params)
        verbose = -1
        if typ == 'xgboost':
            verbose = False
        if typ == 'catboost':
            verbose = 0
        model.fit(x_train_idx, y_train_idx, early_stopping_rounds = stopping_rounds, eval_set=[(x_valid_idx, y_valid_idx)], verbose = verbose)
        mse.append(roc_auc_score(y_valid_idx, model.predict(x_valid_idx)))
    
    accuracy = sum(mse)/spl
    return accuracy

In [6]:
study_name = 'tps-nov2-catboost'  # Unique identifier of the study.
study = optuna.create_study(study_name=study_name, storage='sqlite:///'+study_name+'.db', direction="maximize", load_if_exists=True)

[32m[I 2021-11-03 14:34:40,775][0m A new study created in RDB with name: tps-nov2-catboost[0m


In [7]:
print(study.best_trial)

ValueError: Record does not exist.

In [8]:
study.optimize(lambda trial: objective(trial, X, y, "catboost"),  n_trials=100)
print(study.best_trial)

10it [01:17,  7.76s/it]
[32m[I 2021-11-03 14:36:18,630][0m Trial 0 finished with value: 0.7366972329136144 and parameters: {'learning_rate': 0.2804374089423294, 'iterations': 911, 'depth': 10, 'l2_leaf_reg': 1.3615213224086675, 'bagging_temperature': 2.6382729132406944}. Best is trial 0 with value: 0.7366972329136144.[0m
10it [05:21, 32.10s/it]
[32m[I 2021-11-03 14:41:39,715][0m Trial 1 finished with value: 0.7472400288714074 and parameters: {'learning_rate': 0.06325556363732907, 'iterations': 9521, 'depth': 3, 'l2_leaf_reg': 57.91357941609552, 'bagging_temperature': 9.591178298981774}. Best is trial 1 with value: 0.7472400288714074.[0m
10it [07:43, 46.36s/it]
[32m[I 2021-11-03 14:49:23,359][0m Trial 2 finished with value: 0.7226780103579472 and parameters: {'learning_rate': 0.12319250141804054, 'iterations': 865, 'depth': 13, 'l2_leaf_reg': 41.641907665610695, 'bagging_temperature': 8.581173647123963}. Best is trial 1 with value: 0.7472400288714074.[0m
10it [03:19, 19.96s/it]

FrozenTrial(number=98, values=[0.7486647440823471], datetime_start=datetime.datetime(2021, 11, 4, 0, 9, 33, 942015), datetime_complete=datetime.datetime(2021, 11, 4, 0, 20, 15, 610064), params={'bagging_temperature': 4.021848378181608, 'depth': 7, 'iterations': 9560, 'l2_leaf_reg': 98.16696275126748, 'learning_rate': 0.031190673582301456}, distributions={'bagging_temperature': UniformDistribution(high=10.0, low=0.0), 'depth': IntUniformDistribution(high=15, low=3, step=1), 'iterations': IntUniformDistribution(high=10000, low=500, step=1), 'l2_leaf_reg': UniformDistribution(high=100.0, low=0.01), 'learning_rate': UniformDistribution(high=0.5, low=0.03)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=99, state=TrialState.COMPLETE, value=None)


In [9]:
from datetime import datetime
print(datetime.now())

2021-11-04 00:32:48.886629
