# Module

In [1]:
!chmod 600 ~/.kaggle/kaggle.json

In [59]:
%%time

import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.metrics import *
from sklearn.ensemble import RandomForestRegressor

import optuna
from xgboost import XGBRegressor, callback
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import ExtraTreesRegressor

import warnings

warnings.filterwarnings('ignore')

CPU times: user 212 µs, sys: 1e+03 ns, total: 213 µs
Wall time: 217 µs


In [3]:
SEED=2024

np.random.seed(SEED)
random.seed(SEED)

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Func

In [4]:
%%time

def load_data():    
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')    
    all_df = pd.concat([train, test], sort=False).reset_index(drop=True)
    return train, test, all_df

def fill_nan_values(df):
    num_cols = [col for col in df.select_dtypes(exclude='object').columns if col != 'Premium Amount']
    cat_cols = df.select_dtypes(include='object').columns
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in cat_cols:
        df[col] = df[col].fillna('missing')
    return df
    
def skewed(df, all_df):
    pt = PowerTransformer(method='yeo-johnson')
    pt.fit(df[['Annual Income']])
    all_df['transformed_Annual_Income'] = pt.transform(all_df[['Annual Income']])
    # all_df['log_Annual_Income'] = np.log1p(all_df['Annual Income'])
    return all_df
    
def date(df):
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
    df['Year'] = df['Policy Start Date'].dt.year
    df['Day'] = df['Policy Start Date'].dt.day
    df['Month'] = df['Policy Start Date'].dt.month
    df['Month_name'] = df['Policy Start Date'].dt.month_name()
    df['Day_of_week'] = df['Policy Start Date'].dt.day_name()
    df['Week'] = df['Policy Start Date'].dt.isocalendar().week
    df['Year_sin'] = np.sin(2 * np.pi * df['Year'])
    df['Year_cos'] = np.cos(2 * np.pi * df['Year'])
    min_year = df['Year'].min()
    max_year = df['Year'].max()
    df['Year_sin'] = np.sin(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Year_cos'] = np.cos(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12) 
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)  
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)
    df['Group']=(df['Year']-2020)*48+df['Month']*4+df['Day']//7    
    df.drop('Policy Start Date', axis=1, inplace=True)
    return df

def get_nan_cols(df):
    nan_cols = ['Marital Status', 'Customer Feedback', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
    for col in nan_cols:
        col_name = col + '_NA'
        df[col_name] = df[col].isnull().astype(int)
    return df

def get_encoding(df):
    def encode_ordinal(df):
        educ = {"High School":0, "Bachelor's":1, "Master's":2, "PhD":3}
        policy = {'Basic':0, 'Comprehensive':1, 'Premium':2}
        exerc = {'Rarely':0, 'Daily':1, 'Weekly':2, 'Monthly': 3}
        # feedback = {'Poor':0, 'Average':1, 'Good':2}

        df['Education Level'] = df['Education Level'].map(educ)
        df['Policy Type'] = df['Policy Type'].map(policy)
        df['Exercise Frequency'] = df['Exercise Frequency'].map(exerc)
        # df['Customer Feedback'] = df['Customer Feedback'].map(feedback)
        df['Gender'] = df['Gender'].map({'Male':0, 'Female':1})
        df['Smoking Status'] = df['Smoking Status'].map({'Yes':1, 'No':0})
        return df
    
    def target_encoder(df):
        train = df[~df['Premium Amount'].isnull()]
        test = df[df['Premium Amount'].isnull()]
        encoder = TargetEncoder()
        categorical_cols = ['Marital Status', 'Customer Feedback']
        train[categorical_cols] = encoder.fit_transform(train[categorical_cols], train['Premium Amount'])
        test[categorical_cols] = encoder.transform(test[categorical_cols])
        df = pd.concat([train, test], sort=False).reset_index(drop=True)
        return df

    def one_hot_dummies(df, categorical):
        oh = pd.get_dummies(df[categorical])
        df = df.drop(categorical, axis=1)
        return pd.concat([df, oh], axis=1)
        return df

    df = encode_ordinal(df)
    df = target_encoder(df)

    categorical_features = df.select_dtypes(include='object').columns
    df = one_hot_dummies(df, categorical_features)
    return df

def add_new_features(df):
    df['Income_Dependents Ratio'] = df['Annual Income'] / (df['Number of Dependents'].fillna(0) + 1)
    df['Income_per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['CreditScore_InsuranceDuration'] = df['Credit Score'] * df['Insurance Duration']
    df['Health_Risk_Score'] = df['Smoking Status'].apply(lambda x: 1 if x == 'Smoker' else 0) + \
                                df['Exercise Frequency'].apply(lambda x: 1 if x == 'Low' else (0.5 if x == 'Medium' else 0)) + \
                                (100 - df['Health Score']) / 20
    df['Credit_Health_Score'] = df['Credit Score'] * df['Health Score']
    df['Health_Age_Interaction'] = df['Health Score'] * df['Age']

    df['contract_length'] = pd.cut(
        df["Insurance Duration"].fillna(99),  
        bins=[-float('inf'), 1, 3, float('inf')],  
        labels=[0, 1, 2]  
    ).astype(int)

    df['Age_Income'] = df['Age'] * df['Annual Income']

    # df["Annual_Income_Health_Score_Ratio"] = df["Health Score"] / df["Annual Income"]
    # df["Annual_Income_Age_Ratio"] = df["Annual Income"] / df["Age"]
    # df["Credit_Age"] = df["Credit Score"] / df["Age"]
    # df["Vehicle_Age_Insurance_Duration"] = df["Vehicle Age"] / df["Insurance Duration"]
    return df

def prep():
    train, test, all_df = load_data()

    all_df = skewed(train, all_df)
    all_df = date(all_df)
    all_df = get_nan_cols(all_df)
    # all_df = fill_nan_values(all_df)
    all_df = get_encoding(all_df)
    all_df = add_new_features(all_df)

    del all_df['Annual Income']
    
    train = all_df[~all_df['Premium Amount'].isnull()]
    test = all_df[all_df['Premium Amount'].isnull()]
    train.drop('id', axis=1, inplace=True)
    test.drop(['id', 'Premium Amount'], axis=1, inplace=True)
    return train, test, all_df

train, test, all_df = prep()

CPU times: user 10.3 s, sys: 3.52 s, total: 13.8 s
Wall time: 14 s


In [5]:
train.head()

Unnamed: 0,Age,Gender,Marital Status,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Premium Amount,transformed_Annual_Income,Year,Day,Month,Week,Year_sin,Year_cos,Month_sin,Month_cos,Day_sin,Day_cos,Group,Marital Status_NA,Customer Feedback_NA,Health Score_NA,Previous Claims_NA,Vehicle Age_NA,Credit Score_NA,Insurance Duration_NA,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Location_Rural,Location_Suburban,Location_Urban,Property Type_Apartment,Property Type_Condo,Property Type_House,Month_name_April,Month_name_August,Month_name_December,Month_name_February,Month_name_January,Month_name_July,Month_name_June,Month_name_March,Month_name_May,Month_name_November,Month_name_October,Month_name_September,Day_of_week_Friday,Day_of_week_Monday,Day_of_week_Saturday,Day_of_week_Sunday,Day_of_week_Thursday,Day_of_week_Tuesday,Day_of_week_Wednesday,Income_Dependents Ratio,Income_per_Dependent,CreditScore_InsuranceDuration,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,contract_length,Age_Income
0,19.0,1,1099.844389,1.0,1,22.598761,2,2.0,17.0,372.0,5.0,1098.892745,0,2,2869.0,-0.596487,2023,23,12,51,-0.9510565,0.309017,-2.449294e-16,1.0,-0.998717,-0.050649,195,0,0,0,0,0,0,0,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,5024.5,5024.5,1860.0,3.870062,8406.73897,429.376453,2,190931.0
1,39.0,1,1100.625116,3.0,2,15.569731,1,1.0,12.0,694.0,2.0,1094.350977,1,3,1483.0,0.336563,2023,12,6,24,-0.9510565,0.309017,1.224647e-16,-1.0,0.651372,-0.758758,169,0,0,0,0,0,0,0,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,7919.5,7919.5,1388.0,4.221513,10805.393307,607.219509,1,1235442.0
2,23.0,0,1100.625116,3.0,0,47.177549,2,1.0,14.0,,3.0,1096.284299,1,2,567.0,0.140781,2023,30,9,39,-0.9510565,0.309017,-1.0,-1.83697e-16,-0.201299,0.97953,184,0,0,0,0,0,1,0,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,6400.5,6400.5,,2.641123,,1085.083634,1,588846.0
3,21.0,0,1099.844389,2.0,1,10.938144,0,1.0,0.0,367.0,1.0,1098.892745,1,1,765.0,2.088459,2024,12,6,24,-2.449294e-16,1.0,1.224647e-16,-1.0,0.651372,-0.758758,217,0,0,0,0,0,0,0,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,47285.0,47285.0,367.0,4.453093,4014.298906,229.701027,0,2978955.0
4,21.0,0,1101.735535,1.0,1,20.376094,2,0.0,8.0,598.0,4.0,1098.892745,1,2,2022.0,0.555622,2021,1,12,48,0.5877853,-0.809017,-2.449294e-16,1.0,0.201299,0.97953,96,0,0,0,0,0,0,0,False,True,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,19825.5,19825.5,2392.0,3.981195,12184.903989,427.897966,2,832671.0


In [6]:
train.shape, test.shape

((1200000, 70), (800000, 69))

# Model

In [7]:
x = train.drop('Premium Amount', axis=1)
y = train['Premium Amount']

y_log = np.log1p(y)

n_splits=10
folds = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

## LGBM

In [14]:
%%time

def objective(trial):
    params = {
        'n_estimators': 300,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 5, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'max_depth': trial.suggest_int('max_depth', -1, 12),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-4, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-4, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.001, 0.1),
        'n_jobs': -1,
        'verbose': -1
    }

    model = LGBMRegressor(**params)
    scores = []

    for train_idx, val_idx in folds.split(x):
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

        model.fit(
            x_train, y_train, 
            eval_set=[(x_val, y_val)],
            eval_metric='rmse',
            callbacks=[
                early_stopping(50),
                log_evaluation(10)
            ])
        preds = model.predict(x_val)
        score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
        scores.append(score)
    return np.mean(scores)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

best_params = study.best_params
best_params

[I 2024-12-28 16:47:29,181] A new study created in memory with name: no-name-38400792-9876-446e-a259-4ac03953508a


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.271348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594118
Training until validation scores don't improve for 50 rounds
[10]	valid_0's rmse: 1.08974	valid_0's l2: 1.18753
[20]	valid_0's rmse: 1.08235	valid_0's l2: 1.17147
[30]	valid_0's rmse: 1.07676	valid_0's l2: 1.15941
[40]	valid_0's rmse: 1.07185	valid_0's l2: 1.14885
[50]	valid_0's rmse: 1.06727	valid_0's l2: 1.13905
[60]	valid_0's rmse: 1.06365	valid_0's l2: 1.13136
[70]	valid_0's rmse: 1.06109	valid_0's l2: 1.12592
[80]	valid_0's rmse: 1.05904	valid_0's l2: 1.12156
[90]	valid_0's rmse: 1.05738	valid_0's l2: 1.11805
[100]	valid_0's rmse: 1.0558	valid_0's l2: 1.1147
[110]	valid_0's rmse: 1.05436	valid_0's l2: 1.11167
[120]	valid_0's rmse: 1.05331	vali

[I 2024-12-28 16:56:42,259] Trial 0 finished with value: 1.0461393922523388 and parameters: {'num_leaves': 63, 'learning_rate': 0.011545358131479038, 'feature_fraction': 0.7360534870795294, 'bagging_fraction': 0.8871424229429511, 'bagging_freq': 6, 'min_data_in_leaf': 90, 'max_depth': 10, 'lambda_l1': 5.193695206707215, 'lambda_l2': 2.2186911725807583, 'min_gain_to_split': 0.0341959596504357}. Best is trial 0 with value: 1.0461393922523388.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594118
Training until validation scores don't improve for 50 rounds
[10]	valid_0's rmse: 1.0794	valid_0's l2: 1.16511
[20]	valid_0's rmse: 1.06805	valid_0's l2: 1.14073
[30]	valid_0's rmse: 1.06258	valid_0's l2: 1.12908
[40]	valid_0's rmse: 1.05914	valid_0's l2: 1.12177
[50]	valid_0's rmse: 1.05691	valid_0's l2: 1.11705
[60]	valid_0's rmse: 1.05551	valid_0's l2: 1.11411
[70]	valid_0's rmse: 1.05455	valid_0's l2: 1.11208
[80]	valid_0's rmse: 1.05389	valid_0's l2: 1.11069
[90]	valid_0's rmse: 1.05329	valid_0's l2: 1.10942
[100]	valid_0's rmse: 1.05275	valid_0's l2: 1.10828
[110]	valid_0's rmse:

[I 2024-12-28 17:01:35,932] Trial 1 finished with value: 1.0488494218182223 and parameters: {'num_leaves': 150, 'learning_rate': 0.04978406590673057, 'feature_fraction': 0.765524087434254, 'bagging_fraction': 0.9272980749913328, 'bagging_freq': 6, 'min_data_in_leaf': 94, 'max_depth': 4, 'lambda_l1': 2.5614039556619006, 'lambda_l2': 8.559151831040943, 'min_gain_to_split': 0.02186555663726193}. Best is trial 0 with value: 1.0461393922523388.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594118
Training until validation scores don't improve for 50 rounds
[10]	valid_0's rmse: 1.06353	valid_0's l2: 1.13109
[20]	valid_0's rmse: 1.05278	valid_0's l2: 1.10835
[30]	valid_0's rmse: 1.04922	valid_0's l2: 1.10087
[40]	valid_0's rmse: 1.04802	valid_0's l2: 1.09835
[50]	valid_0's rmse: 1.04775	valid_0's l2: 1.09777
[60]	valid_0's rmse: 1.04757	valid_0's l2: 1.09741
[70]	valid_0's rmse: 1.04747	valid_0's l2: 1.0972
[80]	valid_0's rmse: 1.04746	valid_0's l2: 1.09717
[90]	valid_0's rmse: 1.04746	valid_0's l2: 1.09718
[100]	valid_0's rmse: 1.04744	valid_0's l2: 1.09714
[110]	valid_0's rmse:

[I 2024-12-28 17:04:19,868] Trial 2 finished with value: 1.0458146078786552 and parameters: {'num_leaves': 111, 'learning_rate': 0.0583357228522494, 'feature_fraction': 0.9607103509286345, 'bagging_fraction': 0.752032361770083, 'bagging_freq': 11, 'min_data_in_leaf': 74, 'max_depth': -1, 'lambda_l1': 4.094647896963407, 'lambda_l2': 7.310837913970018, 'min_gain_to_split': 0.004067559169870071}. Best is trial 2 with value: 1.0458146078786552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594118
Training until validation scores don't improve for 50 rounds
[10]	valid_0's rmse: 1.07451	valid_0's l2: 1.15457
[20]	valid_0's rmse: 1.06031	valid_0's l2: 1.12425
[30]	valid_0's rmse: 1.05443	valid_0's l2: 1.11181
[40]	valid_0's rmse: 1.05119	valid_0's l2: 1.105
[50]	valid_0's rmse: 1.04961	valid_0's l2: 1.10168
[60]	valid_0's rmse: 1.04868	valid_0's l2: 1.09974
[70]	valid_0's rmse: 1.04832	valid_0's l2: 1.09897
[80]	valid_0's rmse: 1.04808	valid_0's l2: 1.09847
[90]	valid_0's rmse: 1.04798	valid_0's l2: 1.09827
[100]	valid_0's rmse: 1.04788	valid_0's l2: 1.09805
[110]	valid_0's rmse: 1.04777	valid_0's l2: 1.09783
[120]	valid_0's rmse: 1.04772	vali

[I 2024-12-28 17:07:04,050] Trial 3 finished with value: 1.0462737369093333 and parameters: {'num_leaves': 72, 'learning_rate': 0.05654675329766405, 'feature_fraction': 0.6443110825700927, 'bagging_fraction': 0.7769490720569636, 'bagging_freq': 5, 'min_data_in_leaf': 99, 'max_depth': 8, 'lambda_l1': 7.800886963078863, 'lambda_l2': 3.1060384844175113, 'min_gain_to_split': 0.06575527160914063}. Best is trial 2 with value: 1.0458146078786552.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594118
Training until validation scores don't improve for 50 rounds
[10]	valid_0's rmse: 1.07035	valid_0's l2: 1.14565
[20]	valid_0's rmse: 1.0583	valid_0's l2: 1.12
[30]	valid_0's rmse: 1.05431	valid_0's l2: 1.11158
[40]	valid_0's rmse: 1.05232	valid_0's l2: 1.10738
[50]	valid_0's rmse: 1.05113	valid_0's l2: 1.10486
[60]	valid_0's rmse: 1.05057	valid_0's l2: 1.10369
[70]	valid_0's rmse: 1.05027	valid_0's l2: 1.10307
[80]	valid_0's rmse: 1.05007	valid_0's l2: 1.10264
[90]	valid_0's rmse: 1.04999	valid_0's l2: 1.10248
[100]	valid_0's rmse: 1.04986	valid_0's l2: 1.1022
[110]	valid_0's rmse: 1.04975	valid_0's l2: 1.10197
[120]	valid_0's rmse: 1.0497	valid_0'

[I 2024-12-28 17:10:58,000] Trial 4 finished with value: 1.0475037794017117 and parameters: {'num_leaves': 172, 'learning_rate': 0.08337512233816616, 'feature_fraction': 0.7189856255201384, 'bagging_fraction': 0.9059229877595116, 'bagging_freq': 11, 'min_data_in_leaf': 74, 'max_depth': 5, 'lambda_l1': 2.0212296735934263, 'lambda_l2': 9.342999252031051, 'min_gain_to_split': 0.06538007298915599}. Best is trial 2 with value: 1.0458146078786552.


Best Params:  {'num_leaves': 111, 'learning_rate': 0.0583357228522494, 'feature_fraction': 0.9607103509286345, 'bagging_fraction': 0.752032361770083, 'bagging_freq': 11, 'min_data_in_leaf': 74, 'max_depth': -1, 'lambda_l1': 4.094647896963407, 'lambda_l2': 7.310837913970018, 'min_gain_to_split': 0.004067559169870071}


In [16]:
best_params

{'num_leaves': 111,
 'learning_rate': 0.0583357228522494,
 'feature_fraction': 0.9607103509286345,
 'bagging_fraction': 0.752032361770083,
 'bagging_freq': 11,
 'min_data_in_leaf': 74,
 'max_depth': -1,
 'lambda_l1': 4.094647896963407,
 'lambda_l2': 7.310837913970018,
 'min_gain_to_split': 0.004067559169870071,
 'n_estimators': 500}

In [8]:
%%time
best_params = {
 'n_estimators': 500,
 'boosting_type': 'gbdt',
 'num_leaves': 111,
 'learning_rate': 0.0583357228522494,
 'feature_fraction': 0.9607103509286345,
 'bagging_fraction': 0.752032361770083,
 'bagging_freq': 11,
 'min_data_in_leaf': 74,
 'max_depth': -1,
 'lambda_l1': 4.094647896963407,
 'lambda_l2': 7.310837913970018,
 'min_gain_to_split': 0.004067559169870071,
 'n_jobs': -1
}
# best_params['n_estimators'] = 500

folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)
models_lgb = []
lgbm_OOF = np.zeros(len(x))
lgbm_preds = np.zeros(len(test))

for train_idx, val_idx in folds_train.split(x):
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    model = LGBMRegressor(**best_params)
    model.fit(
        x_train, y_train, 
        eval_set=[(x_val, y_val)],
        eval_metric='rmse',
        callbacks=[
            early_stopping(100),
            log_evaluation(50)
        ])

    lgbm_OOF[val_idx] += model.predict(x_val)
    lgbm_preds += model.predict(test) / folds_train.n_splits
    models_lgb.append(model)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594502
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 1.04741	valid_0's l2: 1.09706
[100]	valid_0's rmse: 1.04715	valid_0's l2: 1.09651
[150]	valid_0's rmse: 1.04713	valid_0's l2: 1.09648
[200]	valid_0's rmse: 1.04724	valid_0's l2: 1.09671
Early stopping, best iteration is:
[143]	valid_0's rmse: 1.0471	valid_0's l2: 1.09643
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

In [9]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, lgbm_OOF)))

Validation RMSE: 1.046063550028496


## CatBoost

In [10]:
%%time
def objective(trial):
    params = {
        "iterations": 300,
        "loss_function": "RMSE",
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1),
        "depth": trial.suggest_int("depth", 3, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 1e-3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.6, 1.0),
        "verbose": 50,
        "random_seed": SEED,
    }

    scores = []
    for train_idx, val_idx in folds.split(x):
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

        model = CatBoostRegressor(**params)
        model.fit(
            x_train, y_train,
            eval_set=(x_val, y_val),
            early_stopping_rounds=50,
        )
        preds = model.predict(x_val)
        score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
        scores.append(score)

    return np.mean(scores)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

best_params = study.best_params
best_params

[I 2024-12-30 02:08:57,120] A new study created in memory with name: no-name-6dea679d-329e-4a6b-b81b-b7ff4c73cbfb


0:	learn: 1.0911111	test: 1.0932301	best: 1.0932301 (0)	total: 126ms	remaining: 37.8s
50:	learn: 1.0567770	test: 1.0593995	best: 1.0593995 (50)	total: 4.27s	remaining: 20.9s
100:	learn: 1.0541226	test: 1.0572598	best: 1.0572551 (97)	total: 8.21s	remaining: 16.2s
150:	learn: 1.0524674	test: 1.0560781	best: 1.0560781 (150)	total: 12.1s	remaining: 11.9s
200:	learn: 1.0480310	test: 1.0518803	best: 1.0518803 (200)	total: 16s	remaining: 7.87s
250:	learn: 1.0461714	test: 1.0503686	best: 1.0503684 (248)	total: 19.8s	remaining: 3.87s
299:	learn: 1.0450795	test: 1.0499891	best: 1.0499791 (290)	total: 23.6s	remaining: 0us

bestTest = 1.049979137
bestIteration = 290

Shrink model to first 291 iterations.
0:	learn: 1.0904211	test: 1.0915492	best: 1.0915492 (0)	total: 75.3ms	remaining: 22.5s
50:	learn: 1.0563225	test: 1.0575210	best: 1.0575210 (50)	total: 3.96s	remaining: 19.4s
100:	learn: 1.0539681	test: 1.0554944	best: 1.0554944 (100)	total: 7.81s	remaining: 15.4s
150:	learn: 1.0527456	test: 1.054

[I 2024-12-30 02:12:57,007] Trial 0 finished with value: 1.0480160822640519 and parameters: {'learning_rate': 0.09295757892732069, 'depth': 7, 'l2_leaf_reg': 2.8780706448862734, 'bagging_temperature': 0.12215801350190825, 'random_strength': 8.553048856390589, 'border_count': 232, 'colsample_bylevel': 0.7252465177667906}. Best is trial 0 with value: 1.0480160822640519.


299:	learn: 1.0455391	test: 1.0437171	best: 1.0437100 (288)	total: 23s	remaining: 0us

bestTest = 1.043709967
bestIteration = 288

Shrink model to first 289 iterations.
0:	learn: 1.0947406	test: 1.0969063	best: 1.0969063 (0)	total: 178ms	remaining: 53.3s
50:	learn: 1.0657976	test: 1.0682849	best: 1.0682849 (50)	total: 8.85s	remaining: 43.2s
100:	learn: 1.0572683	test: 1.0600650	best: 1.0600650 (100)	total: 17.7s	remaining: 34.9s
150:	learn: 1.0539717	test: 1.0570805	best: 1.0570805 (150)	total: 26.4s	remaining: 26.1s
200:	learn: 1.0524890	test: 1.0558650	best: 1.0558650 (200)	total: 34.9s	remaining: 17.2s
250:	learn: 1.0514249	test: 1.0551176	best: 1.0551176 (250)	total: 43.7s	remaining: 8.53s
299:	learn: 1.0506899	test: 1.0547194	best: 1.0547190 (298)	total: 52.2s	remaining: 0us

bestTest = 1.054718992
bestIteration = 298

Shrink model to first 299 iterations.
0:	learn: 1.0946076	test: 1.0957147	best: 1.0957147 (0)	total: 173ms	remaining: 51.9s
50:	learn: 1.0663000	test: 1.0675925	bes

[I 2024-12-30 02:21:46,929] Trial 1 finished with value: 1.0529940236112476 and parameters: {'learning_rate': 0.014917130970782408, 'depth': 10, 'l2_leaf_reg': 8.271686963712773, 'bagging_temperature': 0.5701170987085561, 'random_strength': 7.562661893867209, 'border_count': 161, 'colsample_bylevel': 0.834925769328227}. Best is trial 0 with value: 1.0480160822640519.


299:	learn: 1.0512611	test: 1.0478481	best: 1.0478481 (299)	total: 51.2s	remaining: 0us

bestTest = 1.047848092
bestIteration = 299

0:	learn: 1.0933735	test: 1.0955114	best: 1.0955114 (0)	total: 48.2ms	remaining: 14.4s
50:	learn: 1.0661028	test: 1.0683104	best: 1.0683104 (50)	total: 2.28s	remaining: 11.1s
100:	learn: 1.0601073	test: 1.0623068	best: 1.0623068 (100)	total: 4.51s	remaining: 8.89s
150:	learn: 1.0574496	test: 1.0596601	best: 1.0596601 (150)	total: 6.75s	remaining: 6.66s
200:	learn: 1.0560917	test: 1.0583268	best: 1.0583268 (200)	total: 9s	remaining: 4.43s
250:	learn: 1.0551901	test: 1.0574444	best: 1.0574444 (250)	total: 11.2s	remaining: 2.19s
299:	learn: 1.0545722	test: 1.0568705	best: 1.0568705 (299)	total: 13.4s	remaining: 0us

bestTest = 1.056870485
bestIteration = 299

0:	learn: 1.0942862	test: 1.0954020	best: 1.0954020 (0)	total: 45.2ms	remaining: 13.5s
50:	learn: 1.0665731	test: 1.0673199	best: 1.0673199 (50)	total: 2.36s	remaining: 11.5s
100:	learn: 1.0606948	test:

[I 2024-12-30 02:24:12,660] Trial 2 finished with value: 1.0551769615201745 and parameters: {'learning_rate': 0.0631435155104922, 'depth': 3, 'l2_leaf_reg': 4.019517717494051, 'bagging_temperature': 0.5013490282326326, 'random_strength': 1.2761914871686129, 'border_count': 177, 'colsample_bylevel': 0.8671692491214528}. Best is trial 0 with value: 1.0480160822640519.


299:	learn: 1.0551681	test: 1.0505239	best: 1.0505239 (299)	total: 13.6s	remaining: 0us

bestTest = 1.050523904
bestIteration = 299

0:	learn: 1.0918078	test: 1.0939852	best: 1.0939852 (0)	total: 83.7ms	remaining: 25s
50:	learn: 1.0559996	test: 1.0587502	best: 1.0587502 (50)	total: 4.04s	remaining: 19.7s
100:	learn: 1.0533669	test: 1.0565895	best: 1.0565895 (100)	total: 8.19s	remaining: 16.1s
150:	learn: 1.0522020	test: 1.0558874	best: 1.0558874 (150)	total: 12.3s	remaining: 12.2s
200:	learn: 1.0508284	test: 1.0550424	best: 1.0550424 (200)	total: 16.6s	remaining: 8.2s
250:	learn: 1.0480750	test: 1.0527261	best: 1.0527261 (250)	total: 20.8s	remaining: 4.05s
299:	learn: 1.0459324	test: 1.0509369	best: 1.0509369 (299)	total: 24.8s	remaining: 0us

bestTest = 1.050936928
bestIteration = 299

0:	learn: 1.0926252	test: 1.0936712	best: 1.0936712 (0)	total: 76.4ms	remaining: 22.9s
50:	learn: 1.0562566	test: 1.0577647	best: 1.0577647 (50)	total: 4.12s	remaining: 20.1s
100:	learn: 1.0534140	test:

[I 2024-12-30 02:28:29,145] Trial 3 finished with value: 1.0493802203167084 and parameters: {'learning_rate': 0.06510612382492671, 'depth': 8, 'l2_leaf_reg': 5.355296383632152, 'bagging_temperature': 0.912371056975767, 'random_strength': 8.786294895215402, 'border_count': 33, 'colsample_bylevel': 0.8678960932922013}. Best is trial 0 with value: 1.0480160822640519.


299:	learn: 1.0467699	test: 1.0447949	best: 1.0447949 (299)	total: 24.6s	remaining: 0us

bestTest = 1.044794947
bestIteration = 299

0:	learn: 1.0953134	test: 1.0974597	best: 1.0974597 (0)	total: 53.9ms	remaining: 16.1s
50:	learn: 1.0844665	test: 1.0866692	best: 1.0866692 (50)	total: 2.71s	remaining: 13.2s
100:	learn: 1.0773586	test: 1.0796414	best: 1.0796414 (100)	total: 5.46s	remaining: 10.8s
150:	learn: 1.0727755	test: 1.0751030	best: 1.0751030 (150)	total: 8.09s	remaining: 7.99s
200:	learn: 1.0695133	test: 1.0718441	best: 1.0718441 (200)	total: 10.7s	remaining: 5.27s
250:	learn: 1.0670555	test: 1.0694014	best: 1.0694014 (250)	total: 13.3s	remaining: 2.59s
299:	learn: 1.0651496	test: 1.0674894	best: 1.0674894 (299)	total: 15.8s	remaining: 0us

bestTest = 1.067489387
bestIteration = 299

0:	learn: 1.0954762	test: 1.0965781	best: 1.0965781 (0)	total: 52.4ms	remaining: 15.7s
50:	learn: 1.0844465	test: 1.0855577	best: 1.0855577 (50)	total: 2.71s	remaining: 13.2s
100:	learn: 1.0774307	te

[I 2024-12-30 02:31:18,484] Trial 4 finished with value: 1.0655147145292492 and parameters: {'learning_rate': 0.006538206495847513, 'depth': 4, 'l2_leaf_reg': 5.4364933737594034, 'bagging_temperature': 0.5158379780286786, 'random_strength': 1.328936342686393, 'border_count': 188, 'colsample_bylevel': 0.7161842450866712}. Best is trial 0 with value: 1.0480160822640519.


299:	learn: 1.0658669	test: 1.0599809	best: 1.0599809 (299)	total: 16s	remaining: 0us

bestTest = 1.059980866
bestIteration = 299

CPU times: user 1h 18min 58s, sys: 56.7 s, total: 1h 19min 54s
Wall time: 22min 21s


{'learning_rate': 0.09295757892732069,
 'depth': 7,
 'l2_leaf_reg': 2.8780706448862734,
 'bagging_temperature': 0.12215801350190825,
 'random_strength': 8.553048856390589,
 'border_count': 232,
 'colsample_bylevel': 0.7252465177667906}

In [12]:
%%time

# best_params = {
#  'iterations': 500,
#  'loss_function': 'RMSE',
#  'learning_rate': 0.09295757892732069,
#  'depth': 7,
#  'l2_leaf_reg': 2.8780706448862734,
#  'bagging_temperature': 0.12215801350190825,
#  'random_strength': 8.553048856390589,
#  'border_count': 232,
#  'colsample_bylevel': 0.7252465177667906
#  'verbose': 50,
#  'random_seed': SEED,
# }

best_params['iterations'] = 500

folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)
models_cat = []
cat_OOF = np.zeros(len(x))
cat_preds = np.zeros(len(test))

for train_idx, val_idx in folds_train.split(x):
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    model = CatBoostRegressor(
        **best_params,
        loss_function="RMSE",
        random_seed=SEED,
    )

    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        early_stopping_rounds=100,
    )

    cat_OOF[val_idx] += model.predict(x_val)
    cat_preds += model.predict(test) / folds_train.n_splits
    models_cat.append(model)

0:	learn: 1.0909730	test: 1.0927722	best: 1.0927722 (0)	total: 79.7ms	remaining: 39.8s
1:	learn: 1.0869154	test: 1.0887949	best: 1.0887949 (1)	total: 154ms	remaining: 38.3s
2:	learn: 1.0847814	test: 1.0866689	best: 1.0866689 (2)	total: 232ms	remaining: 38.4s
3:	learn: 1.0825110	test: 1.0843626	best: 1.0843626 (3)	total: 306ms	remaining: 37.9s
4:	learn: 1.0807883	test: 1.0826877	best: 1.0826877 (4)	total: 372ms	remaining: 36.9s
5:	learn: 1.0785916	test: 1.0805079	best: 1.0805079 (5)	total: 450ms	remaining: 37.1s
6:	learn: 1.0774473	test: 1.0793406	best: 1.0793406 (6)	total: 525ms	remaining: 37s
7:	learn: 1.0756996	test: 1.0776127	best: 1.0776127 (7)	total: 595ms	remaining: 36.6s
8:	learn: 1.0730098	test: 1.0749406	best: 1.0749406 (8)	total: 681ms	remaining: 37.1s
9:	learn: 1.0712695	test: 1.0732165	best: 1.0732165 (9)	total: 752ms	remaining: 36.8s
10:	learn: 1.0696438	test: 1.0715697	best: 1.0715697 (10)	total: 817ms	remaining: 36.3s
11:	learn: 1.0681611	test: 1.0700968	best: 1.0700968 

In [13]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, cat_OOF)))

Validation RMSE: 1.0476448992876886


## XGBoost

In [37]:
from xgboost import set_config

# 전역 설정 초기화
set_config(verbosity=3)  # 허용 범위 내 값 설정

In [38]:
%%time

def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "booster": "gbtree",
        "eta": trial.suggest_float("eta", 1e-4, 1e-1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-4, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-4, 10.0, log=True),
        "gamma": trial.suggest_float("gamma", 0.001, 0.1),
        "seed": SEED,
        "verbosity": 3
    }

    scores = []
    for train_idx, val_idx in folds.split(x):
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

        model = XGBRegressor(**params)
        # early_stop = callback.EarlyStopping(rounds=50, metric_name='rmse', save_best=True)
        model.fit(
            x_train, y_train,
            eval_set=[(x_val, y_val)],
            # callbacks=[early_stop],
            verbose=True
        )
        preds = model.predict(x_val)
        score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
        scores.append(score)

    return np.mean(scores)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

best_params = study.best_params
best_params

[I 2024-12-30 03:12:55,980] A new study created in memory with name: no-name-3228d5a3-f4bd-45c6-bdc4-1a00c5ef9d48


[03:12:59] AllReduce: 0.015108s, 1 calls @ 15108us

[03:12:59] MakeCuts: 0.015175s, 1 calls @ 15175us

[03:13:00] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09568
[1]	validation_0-rmse:1.09415
[2]	validation_0-rmse:1.09185
[3]	validation_0-rmse:1.09017
[4]	validation_0-rmse:1.08802
[5]	validation_0-rmse:1.08650
[6]	validation_0-rmse:1.08507
[7]	validation_0-rmse:1.08316
[8]	validation_0-rmse:1.08224
[9]	validation_0-rmse:1.08047
[10]	validation_0-rmse:1.07929
[11]	validation_0-rmse:1.07804
[12]	validation_0-rmse:1.07649
[13]	validation_0-rmse:1.07500
[14]	validation_0-rmse:1.07386
[15]	validation_0-rmse:1.07251
[16]	validation_0-rmse:1.07123
[17]	validation_0-rmse:1.07037
[18]	validation_0-rmse:1.06920
[19]	validation_0-rmse:1.06834
[20]	validation_0-rmse:1.06728
[21]	validation_0-rmse:1.06636
[22]	validation_0-rmse:1.06540
[23]	validation_0-rmse:1.06468
[24]	validation_0-rmse:1.06400
[25]	validation_0-rmse:1.06367
[26]	validation_0-rmse:1.063

[I 2024-12-30 03:16:46,509] Trial 0 finished with value: 1.0469537261948294 and parameters: {'eta': 0.026406022486331556, 'max_depth': 8, 'min_child_weight': 9, 'subsample': 0.9450644777657322, 'colsample_bytree': 0.7330569749023635, 'lambda': 0.3878173140428721, 'alpha': 3.8929025064300062, 'gamma': 0.0934054158792672}. Best is trial 0 with value: 1.0469537261948294.


[03:16:46] Configure: 0.000518s, 1 calls @ 518us

[03:16:50] AllReduce: 0.017215s, 1 calls @ 17215us

[03:16:50] MakeCuts: 0.021886s, 1 calls @ 21886us

[03:16:50] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09769
[1]	validation_0-rmse:1.09764
[2]	validation_0-rmse:1.09756
[3]	validation_0-rmse:1.09750
[4]	validation_0-rmse:1.09743
[5]	validation_0-rmse:1.09738
[6]	validation_0-rmse:1.09731
[7]	validation_0-rmse:1.09727
[8]	validation_0-rmse:1.09723
[9]	validation_0-rmse:1.09715
[10]	validation_0-rmse:1.09709
[11]	validation_0-rmse:1.09704
[12]	validation_0-rmse:1.09696
[13]	validation_0-rmse:1.09689
[14]	validation_0-rmse:1.09682
[15]	validation_0-rmse:1.09674
[16]	validation_0-rmse:1.09667
[17]	validation_0-rmse:1.09661
[18]	validation_0-rmse:1.09654
[19]	validation_0-rmse:1.09650
[20]	validation_0-rmse:1.09642
[21]	validation_0-rmse:1.09636
[22]	validation_0-rmse:1.09629
[23]	validation_0-rmse:1.09623
[24]	validation_0-rmse:1.09620
[25]	vali

[I 2024-12-30 03:21:29,977] Trial 1 finished with value: 1.0903526492334725 and parameters: {'eta': 0.0007821906687756904, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.7404741775336107, 'colsample_bytree': 0.6291037700288417, 'lambda': 0.00020983373566859162, 'alpha': 0.0001701872699645133, 'gamma': 0.08012141001929707}. Best is trial 0 with value: 1.0469537261948294.


[03:21:29] Configure: 0.000481s, 1 calls @ 481us

[03:21:33] AllReduce: 0.030719s, 1 calls @ 30719us

[03:21:33] MakeCuts: 0.030786s, 1 calls @ 30786us

[03:21:34] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09748
[1]	validation_0-rmse:1.09726
[2]	validation_0-rmse:1.09694
[3]	validation_0-rmse:1.09669
[4]	validation_0-rmse:1.09636
[5]	validation_0-rmse:1.09615
[6]	validation_0-rmse:1.09589
[7]	validation_0-rmse:1.09561
[8]	validation_0-rmse:1.09538
[9]	validation_0-rmse:1.09507
[10]	validation_0-rmse:1.09481
[11]	validation_0-rmse:1.09454
[12]	validation_0-rmse:1.09423
[13]	validation_0-rmse:1.09392
[14]	validation_0-rmse:1.09364
[15]	validation_0-rmse:1.09334
[16]	validation_0-rmse:1.09304
[17]	validation_0-rmse:1.09279
[18]	validation_0-rmse:1.09249
[19]	validation_0-rmse:1.09223
[20]	validation_0-rmse:1.09194
[21]	validation_0-rmse:1.09168
[22]	validation_0-rmse:1.09142
[23]	validation_0-rmse:1.09119
[24]	validation_0-rmse:1.09099
[25]	vali

[I 2024-12-30 03:26:15,873] Trial 2 finished with value: 1.0747899992949665 and parameters: {'eta': 0.003337888840491961, 'max_depth': 10, 'min_child_weight': 10, 'subsample': 0.6228821663268581, 'colsample_bytree': 0.7153822568376784, 'lambda': 0.9044607326121704, 'alpha': 0.004155627069841659, 'gamma': 0.035435874598110755}. Best is trial 0 with value: 1.0469537261948294.


[03:26:15] Configure: 0.000517s, 1 calls @ 517us

[03:26:19] AllReduce: 0.015057s, 1 calls @ 15057us

[03:26:19] MakeCuts: 0.015122s, 1 calls @ 15122us

[03:26:20] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09747
[1]	validation_0-rmse:1.09719
[2]	validation_0-rmse:1.09691
[3]	validation_0-rmse:1.09663
[4]	validation_0-rmse:1.09635
[5]	validation_0-rmse:1.09608
[6]	validation_0-rmse:1.09581
[7]	validation_0-rmse:1.09554
[8]	validation_0-rmse:1.09534
[9]	validation_0-rmse:1.09507
[10]	validation_0-rmse:1.09480
[11]	validation_0-rmse:1.09454
[12]	validation_0-rmse:1.09427
[13]	validation_0-rmse:1.09401
[14]	validation_0-rmse:1.09375
[15]	validation_0-rmse:1.09349
[16]	validation_0-rmse:1.09323
[17]	validation_0-rmse:1.09297
[18]	validation_0-rmse:1.09271
[19]	validation_0-rmse:1.09246
[20]	validation_0-rmse:1.09220
[21]	validation_0-rmse:1.09195
[22]	validation_0-rmse:1.09170
[23]	validation_0-rmse:1.09145
[24]	validation_0-rmse:1.09125
[25]	vali

[I 2024-12-30 03:32:32,073] Trial 3 finished with value: 1.0745707707631293 and parameters: {'eta': 0.0028527628850404043, 'max_depth': 12, 'min_child_weight': 7, 'subsample': 0.710315160171412, 'colsample_bytree': 0.9348366372910019, 'lambda': 0.6410884573441809, 'alpha': 9.620068802865976, 'gamma': 0.060579989781957604}. Best is trial 0 with value: 1.0469537261948294.


[03:32:32] Configure: 0.000473s, 1 calls @ 473us

[03:32:35] AllReduce: 0.015028s, 1 calls @ 15028us

[03:32:35] MakeCuts: 0.015093s, 1 calls @ 15093us

[03:32:36] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09771
[1]	validation_0-rmse:1.09767
[2]	validation_0-rmse:1.09761
[3]	validation_0-rmse:1.09757
[4]	validation_0-rmse:1.09752
[5]	validation_0-rmse:1.09748
[6]	validation_0-rmse:1.09743
[7]	validation_0-rmse:1.09740
[8]	validation_0-rmse:1.09736
[9]	validation_0-rmse:1.09731
[10]	validation_0-rmse:1.09726
[11]	validation_0-rmse:1.09722
[12]	validation_0-rmse:1.09717
[13]	validation_0-rmse:1.09712
[14]	validation_0-rmse:1.09707
[15]	validation_0-rmse:1.09701
[16]	validation_0-rmse:1.09696
[17]	validation_0-rmse:1.09691
[18]	validation_0-rmse:1.09686
[19]	validation_0-rmse:1.09683
[20]	validation_0-rmse:1.09677
[21]	validation_0-rmse:1.09672
[22]	validation_0-rmse:1.09668
[23]	validation_0-rmse:1.09663
[24]	validation_0-rmse:1.09661
[25]	vali

[I 2024-12-30 03:37:30,681] Trial 4 finished with value: 1.091701645169003 and parameters: {'eta': 0.000559850383908714, 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.9618600119370613, 'colsample_bytree': 0.6512818772026936, 'lambda': 0.2740624046731456, 'alpha': 0.008646094198398457, 'gamma': 0.007848814369602032}. Best is trial 0 with value: 1.0469537261948294.


[03:37:30] Configure: 0.000483s, 1 calls @ 483us

CPU times: user 1h 24min 58s, sys: 18.9 s, total: 1h 25min 17s
Wall time: 24min 34s


{'eta': 0.026406022486331556,
 'max_depth': 8,
 'min_child_weight': 9,
 'subsample': 0.9450644777657322,
 'colsample_bytree': 0.7330569749023635,
 'lambda': 0.3878173140428721,
 'alpha': 3.8929025064300062,
 'gamma': 0.0934054158792672}

In [41]:
%%time
# best_params = {
#  "objective": "reg:squarederror",
#  "eval_metric": "rmse",
#  'booster': 'gbtree',
#  'eta': 0.026406022486331556,
#  'max_depth': 8,
#  'min_child_weight': 9,
#  'subsample': 0.9450644777657322,
#  'colsample_bytree': 0.7330569749023635,
#  'lambda': 0.3878173140428721,
#  'alpha': 3.8929025064300062,
#  'gamma': 0.0934054158792672
#  "seed": SEED,
#  "verbosity": 3
#     }

best_params['iterations'] = 500

folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)
models_cat = []
xgb_OOF = np.zeros(len(x))
xgb_preds = np.zeros(len(test))

for train_idx, val_idx in folds_train.split(x):
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    model = XGBRegressor(
        **best_params,
        loss_function="RMSE",
        random_seed=SEED,
    )

    model.fit(
        x_train, y_train,
        eval_set=[(x_val, y_val)],
        # early_stopping_rounds=100,
        verbose=True
    )

    xgb_OOF[val_idx] += model.predict(x_val)
    xgb_preds += model.predict(test) / folds_train.n_splits
    models_cat.append(model)

[03:40:13] AllReduce: 0.019716s, 1 calls @ 19716us

[03:40:13] MakeCuts: 0.019913s, 1 calls @ 19913us

[03:40:13] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09591
[1]	validation_0-rmse:1.09411
[2]	validation_0-rmse:1.09174
[3]	validation_0-rmse:1.08949
[4]	validation_0-rmse:1.08790
[5]	validation_0-rmse:1.08628
[6]	validation_0-rmse:1.08431
[7]	validation_0-rmse:1.08296
[8]	validation_0-rmse:1.08224
[9]	validation_0-rmse:1.08079
[10]	validation_0-rmse:1.07954
[11]	validation_0-rmse:1.07790
[12]	validation_0-rmse:1.07678
[13]	validation_0-rmse:1.07532
[14]	validation_0-rmse:1.07394
[15]	validation_0-rmse:1.07310
[16]	validation_0-rmse:1.07255
[17]	validation_0-rmse:1.07152
[18]	validation_0-rmse:1.07029
[19]	validation_0-rmse:1.06942
[20]	validation_0-rmse:1.06885
[21]	validation_0-rmse:1.06833
[22]	validation_0-rmse:1.06757
[23]	validation_0-rmse:1.06683
[24]	validation_0-rmse:1.06605
[25]	validation_0-rmse:1.06516
[26]	validation_0-rmse:1.064

In [42]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, xgb_OOF)))

Validation RMSE: 1.04718116937329


## ExtraTrees

In [51]:
def prep_nan():
    train, test, all_df = load_data()

    all_df = skewed(train, all_df)
    all_df = date(all_df)
    all_df = get_nan_cols(all_df)
    all_df = fill_nan_values(all_df)
    all_df = get_encoding(all_df)
    all_df = add_new_features(all_df)

    del all_df['Annual Income']
    
    train = all_df[~all_df['Premium Amount'].isnull()]
    test = all_df[all_df['Premium Amount'].isnull()]
    train.drop('id', axis=1, inplace=True)
    test.drop(['id', 'Premium Amount'], axis=1, inplace=True)
    return train, test, all_df

train_nan, test_nan, all_df_nan = prep_nan()

In [52]:
x_nan = train_nan.drop('Premium Amount', axis=1)
y_nan = train_nan['Premium Amount']

y_log_nan = np.log1p(y_nan)

In [60]:
%%time

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150, step=50), 
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_features": trial.suggest_float("max_features", 0.4, 0.9),  
        "bootstrap": False,  
        "random_state": SEED,
        "n_jobs": -1
    }

    scores = []

    # x_sample = x_nan.sample(frac=0.5, random_state=SEED)
    # y_sample = y_log_nan.loc[x_sample.index]
    folds_opt = KFold(n_splits=3, shuffle=True, random_state=SEED)

    # for train_idx, val_idx in folds_opt.split(x_nan):
    for fold, (train_idx, val_idx) in tqdm(enumerate(folds_opt.split(x_nan)), total=folds_opt.get_n_splits()):
        x_train, x_val = x_nan.iloc[train_idx], x_nan.iloc[val_idx]
        y_train, y_val = y_log_nan.iloc[train_idx], y_log_nan.iloc[val_idx]

        model = ExtraTreesRegressor(**params)
        model.fit(x_train, y_train)

        preds = model.predict(x_val)
        score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
        scores.append(score)
        print(f"Fold {fold + 1} RMSE: {score:.4f}")

    return np.mean(scores)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

best_params = study.best_params
best_params

[I 2024-12-30 07:35:14,171] A new study created in memory with name: no-name-c2016616-4a18-4ff3-8adf-d69e70c93537
 33%|███▎      | 1/3 [05:40<11:20, 340.25s/it]

Fold 1 RMSE: 1.0589


 67%|██████▋   | 2/3 [11:21<05:40, 340.74s/it]

Fold 2 RMSE: 1.0574


100%|██████████| 3/3 [16:43<00:00, 334.47s/it]
[I 2024-12-30 07:51:57,580] Trial 0 finished with value: 1.056574060530059 and parameters: {'n_estimators': 150, 'max_depth': 13, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 0.7460208372574245}. Best is trial 0 with value: 1.056574060530059.


Fold 3 RMSE: 1.0534


 33%|███▎      | 1/3 [01:09<02:18, 69.18s/it]

Fold 1 RMSE: 1.0817


 67%|██████▋   | 2/3 [02:18<01:09, 69.21s/it]

Fold 2 RMSE: 1.0807


100%|██████████| 3/3 [03:27<00:00, 69.18s/it]
[I 2024-12-30 07:55:25,109] Trial 1 finished with value: 1.0797628279115008 and parameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 0.6762162348267671}. Best is trial 0 with value: 1.056574060530059.


Fold 3 RMSE: 1.0769


 33%|███▎      | 1/3 [03:08<06:16, 188.29s/it]

Fold 1 RMSE: 1.0713


 67%|██████▋   | 2/3 [06:18<03:09, 189.35s/it]

Fold 2 RMSE: 1.0707


100%|██████████| 3/3 [09:32<00:00, 190.97s/it]
[I 2024-12-30 08:04:58,035] Trial 2 finished with value: 1.069260026995196 and parameters: {'n_estimators': 150, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 0.7244901537757895}. Best is trial 0 with value: 1.056574060530059.


Fold 3 RMSE: 1.0658


 33%|███▎      | 1/3 [01:26<02:53, 86.99s/it]

Fold 1 RMSE: 1.0757


 67%|██████▋   | 2/3 [02:54<01:27, 87.46s/it]

Fold 2 RMSE: 1.0740


100%|██████████| 3/3 [04:22<00:00, 87.55s/it]
[I 2024-12-30 08:09:20,697] Trial 3 finished with value: 1.0732634321916985 and parameters: {'n_estimators': 100, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 0.4804059437210617}. Best is trial 0 with value: 1.056574060530059.


Fold 3 RMSE: 1.0700


 33%|███▎      | 1/3 [02:24<04:49, 144.94s/it]

Fold 1 RMSE: 1.0640


 67%|██████▋   | 2/3 [04:48<02:24, 144.17s/it]

Fold 2 RMSE: 1.0616


100%|██████████| 3/3 [07:11<00:00, 143.81s/it]
[I 2024-12-30 08:16:32,120] Trial 4 finished with value: 1.0612393062040797 and parameters: {'n_estimators': 100, 'max_depth': 14, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 0.4217175031911396}. Best is trial 0 with value: 1.056574060530059.


Fold 3 RMSE: 1.0581
CPU times: user 2h 25min 45s, sys: 17.1 s, total: 2h 26min 2s
Wall time: 41min 17s


{'n_estimators': 150,
 'max_depth': 13,
 'min_samples_split': 4,
 'min_samples_leaf': 2,
 'max_features': 0.7460208372574245}

In [62]:
%%time

# params = {
#     "n_estimators": 200,
#     "max_depth": 13,
#     "min_samples_split": 4,
#     "min_samples_leaf": 2,
#     "max_features": 0.7460208372574245,
#     "bootstrap": False,  
#     "random_state": SEED,
#     "n_jobs": -1
# }

best_params['n_estimators'] = 200

folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)
et_OOF = np.zeros(len(x)) 
et_preds = np.zeros(len(test)) 
models_et = []

for train_idx, val_idx in folds_train.split(x_nan):
    x_train, x_val = x_nan.iloc[train_idx], x_nan.iloc[val_idx]
    y_train, y_val = y_log_nan.iloc[train_idx], y_log_nan.iloc[val_idx]

    model = ExtraTreesRegressor(
        **best_params,
        random_state=SEED,
        n_jobs=-1
    )

    model.fit(x_train, y_train)

    et_OOF[val_idx] += model.predict(x_val)
    et_preds += model.predict(test_nan) / folds_train.n_splits
    models_et.append(model)

CPU times: user 2h 49min 20s, sys: 20 s, total: 2h 49min 40s
Wall time: 49min 7s


In [63]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log_nan, et_OOF)))

Validation RMSE: 1.0564255745197302


In [None]:
# %%time
# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 100, 200, step=50),
#         "max_depth": trial.suggest_int("max_depth", 3, 20),
#         "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
#         "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
#         "max_features": trial.suggest_float("max_features", 0.6, 1.0),
#         "bootstrap": True,
#         "random_state": SEED,
#     }

#     scores = []
    
#     x_sample = x_nan.sample(frac=0.5, random_state=SEED)
#     y_sample = y_log_nan.loc[x_sample.index]
#     folds_rf = KFold(n_splits=3, shuffle=True, random_state=SEED)

#     for train_idx, val_idx in folds_rf.split(x_sample):
#         x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
#         y_train, y_val = y_sample.iloc[train_idx], y_sample.iloc[val_idx]

#         model = RandomForestRegressor(**params)
#         model.fit(x_train, y_train)

#         preds = model.predict(x_val)
#         score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
#         scores.append(score)

#     return np.mean(scores)

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=5)

# best_params = study.best_params
# best_params

In [None]:
# %%time

# # best_params = {
# #     "n_estimators": 500,
# #     "max_depth": trial.suggest_int("max_depth", 3, 20),
# #     "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
# #     "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
# #     "max_features": trial.suggest_float("max_features", 0.6, 1.0),
# #     "bootstrap": True,
# #     "random_state": SEED,
# # }

# best_params['n_estimators'] = 500

# folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)
# rf_OOF = np.zeros(len(x)) 
# rf_preds = np.zeros(len(test)) 
# models_rf = []

# for train_idx, val_idx in folds_train.split(x_nan):
#     x_train, x_val = x_nan.iloc[train_idx], x_nan.iloc[val_idx]
#     y_train, y_val = y_log_nan.iloc[train_idx], y_log_nan.iloc[val_idx]

#     model = RandomForestRegressor(
#         **best_params,
#         random_state=SEED,
#     )

#     model.fit(x_train, y_train)

#     rf_OOF[val_idx] += model.predict(x_val)
#     rf_preds += model.predict(test) / folds_train.n_splits
#     models_rf.append(model)

In [None]:
# print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, rf_OOF)))

# Blending

In [64]:
%%time

def objective(trial):
    w1 = trial.suggest_float('w1', 0.0, 1.0)
    w2 = trial.suggest_float('w2', 0.0, 1.0)
    w3 = trial.suggest_float('w3', 0.0, 1.0)
    w4 = 1.0 - (w1 + w2 + w3)

    if w4 < 0 or w4 > 1:
        return float('inf')
    
    if w3 < 0 or w3 > 1:
        return float('inf')
    
    ensemble_vote = (w1 * lgbm_OOF) + (w2 * cat_OOF) + (w3 * xgb_OOF) + (w4 * et_OOF)
    rmse = np.sqrt(mean_squared_error(y_log, ensemble_vote))
    
    return rmse

study_vote = optuna.create_study(direction='minimize')
study_vote.optimize(objective, n_trials=100)

# 최적 가중치 및 RMSE 출력
print(f"Best Weights: {study_vote.best_params}")
print(f"Best RMSE: {study_vote.best_value:.4f}")

[I 2024-12-30 09:20:41,772] A new study created in memory with name: no-name-5b10aabb-a981-4afe-be28-f95eb6e37e6a
[I 2024-12-30 09:20:41,774] Trial 0 finished with value: inf and parameters: {'w1': 0.2037856966824516, 'w2': 0.7586672677831224, 'w3': 0.3344838666132287}. Best is trial 0 with value: inf.
[I 2024-12-30 09:20:41,776] Trial 1 finished with value: inf and parameters: {'w1': 0.897955597075071, 'w2': 0.4237314604983533, 'w3': 0.024950591118173437}. Best is trial 0 with value: inf.
[I 2024-12-30 09:20:41,794] Trial 2 finished with value: 1.0465886565065712 and parameters: {'w1': 0.21731252233180287, 'w2': 0.16264113226265475, 'w3': 0.5505217662516302}. Best is trial 2 with value: 1.0465886565065712.
[I 2024-12-30 09:20:41,795] Trial 3 finished with value: inf and parameters: {'w1': 0.521432195365425, 'w2': 0.8000810201086843, 'w3': 0.9619952172166097}. Best is trial 2 with value: 1.0465886565065712.
[I 2024-12-30 09:20:41,796] Trial 4 finished with value: inf and parameters: {'

Best Weights: {'w1': 0.8418197076926294, 'w2': 0.09519922121859947, 'w3': 0.049848592365118696}
Best RMSE: 1.0460
CPU times: user 1.31 s, sys: 55.3 ms, total: 1.37 s
Wall time: 1.35 s


In [71]:
best_weights = study_vote.best_params
best_weights['w4'] = 1 - best_weights['w1'] - best_weights['w2'] - best_weights['w3']
preds_exp = (best_weights['w1'] * lgbm_preds) + (best_weights['w2'] * cat_preds) + (best_weights['w3'] * xgb_preds) + (best_weights['w4'] * et_preds)
preds = np.expm1(preds_exp)

# Submission

In [68]:
submission = pd.read_csv('./data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [72]:
submission['Premium Amount'] = preds
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,822.524392
1,1200001,789.329706
2,1200002,793.127541
3,1200003,801.450462
4,1200004,756.95271


In [73]:
submission.to_csv('./data/04_03.csv', index=False)
!kaggle competitions submit -c playground-series-s4e12 -f "./data/04_03.csv" -m "04_03_blending4"

100%|██████████████████████████████████████| 19.8M/19.8M [00:00<00:00, 33.1MB/s]
Successfully submitted to Regression with an Insurance Dataset

Public Score : 1.04494