# Module

In [1]:
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
%%time

import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.metrics import *
from sklearn.ensemble import RandomForestRegressor

import optuna
from xgboost import XGBRegressor, callback
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor

import warnings

warnings.filterwarnings('ignore')

CPU times: user 1.02 s, sys: 738 ms, total: 1.75 s
Wall time: 1.48 s


In [3]:
SEED=2024

np.random.seed(SEED)
random.seed(SEED)

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Func

In [4]:
%%time

def load_data():    
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')    
    all_df = pd.concat([train, test], sort=False).reset_index(drop=True)
    return train, test, all_df

def fill_nan_values(df):
    num_cols = [col for col in df.select_dtypes(exclude='object').columns if col != 'Premium Amount']
    cat_cols = df.select_dtypes(include='object').columns
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in cat_cols:
        df[col] = df[col].fillna('missing')
    return df
    
def skewed(df, all_df):
    pt = PowerTransformer(method='yeo-johnson')
    pt.fit(df[['Annual Income']])
    all_df['transformed_Annual_Income'] = pt.transform(all_df[['Annual Income']])
    # all_df['log_Annual_Income'] = np.log1p(all_df['Annual Income'])
    return all_df
    
def date(df):
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
    df['Year'] = df['Policy Start Date'].dt.year
    df['Day'] = df['Policy Start Date'].dt.day
    df['Month'] = df['Policy Start Date'].dt.month
    df['Month_name'] = df['Policy Start Date'].dt.month_name()
    df['Day_of_week'] = df['Policy Start Date'].dt.day_name()
    df['Week'] = df['Policy Start Date'].dt.isocalendar().week
    df['Year_sin'] = np.sin(2 * np.pi * df['Year'])
    df['Year_cos'] = np.cos(2 * np.pi * df['Year'])
    min_year = df['Year'].min()
    max_year = df['Year'].max()
    df['Year_sin'] = np.sin(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Year_cos'] = np.cos(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12) 
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)  
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)
    df['Group']=(df['Year']-2020)*48+df['Month']*4+df['Day']//7    
    df.drop('Policy Start Date', axis=1, inplace=True)
    return df

def get_nan_cols(df):
    nan_cols = ['Marital Status', 'Customer Feedback', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
    for col in nan_cols:
        col_name = col + '_NA'
        df[col_name] = df[col].isnull().astype(int)
    return df

def get_encoding(df):
    def encode_ordinal(df):
        educ = {"High School":0, "Bachelor's":1, "Master's":2, "PhD":3}
        policy = {'Basic':0, 'Comprehensive':1, 'Premium':2}
        exerc = {'Rarely':0, 'Daily':1, 'Weekly':2, 'Monthly': 3}
        # feedback = {'Poor':0, 'Average':1, 'Good':2}

        df['Education Level'] = df['Education Level'].map(educ)
        df['Policy Type'] = df['Policy Type'].map(policy)
        df['Exercise Frequency'] = df['Exercise Frequency'].map(exerc)
        # df['Customer Feedback'] = df['Customer Feedback'].map(feedback)
        df['Gender'] = df['Gender'].map({'Male':0, 'Female':1})
        df['Smoking Status'] = df['Smoking Status'].map({'Yes':1, 'No':0})
        return df
    
    def target_encoder(df):
        train = df[~df['Premium Amount'].isnull()]
        test = df[df['Premium Amount'].isnull()]
        encoder = TargetEncoder()
        categorical_cols = ['Marital Status', 'Customer Feedback']
        train[categorical_cols] = encoder.fit_transform(train[categorical_cols], train['Premium Amount'])
        test[categorical_cols] = encoder.transform(test[categorical_cols])
        df = pd.concat([train, test], sort=False).reset_index(drop=True)
        return df

    def one_hot_dummies(df, categorical):
        oh = pd.get_dummies(df[categorical])
        df = df.drop(categorical, axis=1)
        return pd.concat([df, oh], axis=1)
        return df

    df = encode_ordinal(df)
    df = target_encoder(df)

    categorical_features = df.select_dtypes(include='object').columns
    df = one_hot_dummies(df, categorical_features)
    return df

def add_new_features(df):
    df['Income_Dependents Ratio'] = df['Annual Income'] / (df['Number of Dependents'].fillna(0) + 1)
    df['Income_per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['CreditScore_InsuranceDuration'] = df['Credit Score'] * df['Insurance Duration']
    df['Health_Risk_Score'] = df['Smoking Status'].apply(lambda x: 1 if x == 'Smoker' else 0) + \
                                df['Exercise Frequency'].apply(lambda x: 1 if x == 'Low' else (0.5 if x == 'Medium' else 0)) + \
                                (100 - df['Health Score']) / 20
    df['Credit_Health_Score'] = df['Credit Score'] * df['Health Score']
    df['Health_Age_Interaction'] = df['Health Score'] * df['Age']

    df['contract_length'] = pd.cut(
        df["Insurance Duration"].fillna(99),  
        bins=[-float('inf'), 1, 3, float('inf')],  
        labels=[0, 1, 2]  
    ).astype(int)

    df['Age_Income'] = df['Age'] * df['Annual Income']

    # df["Annual_Income_Health_Score_Ratio"] = df["Health Score"] / df["Annual Income"]
    # df["Annual_Income_Age_Ratio"] = df["Annual Income"] / df["Age"]
    # df["Credit_Age"] = df["Credit Score"] / df["Age"]
    # df["Vehicle_Age_Insurance_Duration"] = df["Vehicle Age"] / df["Insurance Duration"]
    return df

def prep():
    train, test, all_df = load_data()

    all_df = skewed(train, all_df)
    all_df = date(all_df)
    all_df = get_nan_cols(all_df)
    # all_df = fill_nan_values(all_df)
    all_df = get_encoding(all_df)
    all_df = add_new_features(all_df)

    del all_df['Annual Income']
    
    train = all_df[~all_df['Premium Amount'].isnull()]
    test = all_df[all_df['Premium Amount'].isnull()]
    train.drop('id', axis=1, inplace=True)
    test.drop(['id', 'Premium Amount'], axis=1, inplace=True)
    return train, test, all_df

def prep_nan():
    train, test, all_df = load_data()

    all_df = skewed(train, all_df)
    all_df = date(all_df)
    all_df = get_nan_cols(all_df)
    all_df = fill_nan_values(all_df)
    all_df = get_encoding(all_df)
    all_df = add_new_features(all_df)

    del all_df['Annual Income']
    
    train = all_df[~all_df['Premium Amount'].isnull()]
    test = all_df[all_df['Premium Amount'].isnull()]
    train.drop('id', axis=1, inplace=True)
    test.drop(['id', 'Premium Amount'], axis=1, inplace=True)
    return train, test, all_df

train, test, all_df = prep()

CPU times: user 9.16 s, sys: 3.17 s, total: 12.3 s
Wall time: 12.4 s


In [5]:
train.head()

Unnamed: 0,Age,Gender,Marital Status,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Premium Amount,transformed_Annual_Income,Year,Day,Month,Week,Year_sin,Year_cos,Month_sin,Month_cos,Day_sin,Day_cos,Group,Marital Status_NA,Customer Feedback_NA,Health Score_NA,Previous Claims_NA,Vehicle Age_NA,Credit Score_NA,Insurance Duration_NA,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Location_Rural,Location_Suburban,Location_Urban,Property Type_Apartment,Property Type_Condo,Property Type_House,Month_name_April,Month_name_August,Month_name_December,Month_name_February,Month_name_January,Month_name_July,Month_name_June,Month_name_March,Month_name_May,Month_name_November,Month_name_October,Month_name_September,Day_of_week_Friday,Day_of_week_Monday,Day_of_week_Saturday,Day_of_week_Sunday,Day_of_week_Thursday,Day_of_week_Tuesday,Day_of_week_Wednesday,Income_Dependents Ratio,Income_per_Dependent,CreditScore_InsuranceDuration,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,contract_length,Age_Income
0,19.0,1,1099.844389,1.0,1,22.598761,2,2.0,17.0,372.0,5.0,1098.892745,0,2,2869.0,-0.596487,2023,23,12,51,-0.9510565,0.309017,-2.449294e-16,1.0,-0.998717,-0.050649,195,0,0,0,0,0,0,0,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,5024.5,5024.5,1860.0,3.870062,8406.73897,429.376453,2,190931.0
1,39.0,1,1100.625116,3.0,2,15.569731,1,1.0,12.0,694.0,2.0,1094.350977,1,3,1483.0,0.336563,2023,12,6,24,-0.9510565,0.309017,1.224647e-16,-1.0,0.651372,-0.758758,169,0,0,0,0,0,0,0,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,7919.5,7919.5,1388.0,4.221513,10805.393307,607.219509,1,1235442.0
2,23.0,0,1100.625116,3.0,0,47.177549,2,1.0,14.0,,3.0,1096.284299,1,2,567.0,0.140781,2023,30,9,39,-0.9510565,0.309017,-1.0,-1.83697e-16,-0.201299,0.97953,184,0,0,0,0,0,1,0,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,6400.5,6400.5,,2.641123,,1085.083634,1,588846.0
3,21.0,0,1099.844389,2.0,1,10.938144,0,1.0,0.0,367.0,1.0,1098.892745,1,1,765.0,2.088459,2024,12,6,24,-2.449294e-16,1.0,1.224647e-16,-1.0,0.651372,-0.758758,217,0,0,0,0,0,0,0,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,47285.0,47285.0,367.0,4.453093,4014.298906,229.701027,0,2978955.0
4,21.0,0,1101.735535,1.0,1,20.376094,2,0.0,8.0,598.0,4.0,1098.892745,1,2,2022.0,0.555622,2021,1,12,48,0.5877853,-0.809017,-2.449294e-16,1.0,0.201299,0.97953,96,0,0,0,0,0,0,0,False,True,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,19825.5,19825.5,2392.0,3.981195,12184.903989,427.897966,2,832671.0


In [6]:
train.shape, test.shape

((1200000, 70), (800000, 69))

# Model

In [7]:
x = train.drop('Premium Amount', axis=1)
y = train['Premium Amount']

y_log = np.log1p(y)

n_splits=10
folds = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)

## LGBM

In [None]:
# %%time

# def objective(trial):
#     params = {
#         'n_estimators': 300,
#         'boosting_type': 'gbdt',
#         'num_leaves': trial.suggest_int('num_leaves', 10, 300),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 5, 12),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
#         'max_depth': trial.suggest_int('max_depth', -1, 12),
#         'lambda_l1': trial.suggest_float('lambda_l1', 1e-4, 10.0),
#         'lambda_l2': trial.suggest_float('lambda_l2', 1e-4, 10.0),
#         'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.001, 0.1),
#         'n_jobs': -1,
#         'verbose': -1
#     }

#     model = LGBMRegressor(**params)
#     scores = []

#     for train_idx, val_idx in folds.split(x):
#         x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
#         y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

#         model.fit(
#             x_train, y_train, 
#             eval_set=[(x_val, y_val)],
#             eval_metric='rmse',
#             callbacks=[
#                 early_stopping(50),
#                 log_evaluation(10)
#             ])
#         preds = model.predict(x_val)
#         score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
#         scores.append(score)
#     return np.mean(scores)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=5)

# best_params = study.best_params
# best_params

In [16]:
best_params

{'num_leaves': 111,
 'learning_rate': 0.0583357228522494,
 'feature_fraction': 0.9607103509286345,
 'bagging_fraction': 0.752032361770083,
 'bagging_freq': 11,
 'min_data_in_leaf': 74,
 'max_depth': -1,
 'lambda_l1': 4.094647896963407,
 'lambda_l2': 7.310837913970018,
 'min_gain_to_split': 0.004067559169870071,
 'n_estimators': 500}

In [12]:
%%time
best_params = {
 'n_estimators': 500,
 'boosting_type': 'gbdt',
 'num_leaves': 111,
 'learning_rate': 0.0583357228522494,
 'feature_fraction': 0.9607103509286345,
 'bagging_fraction': 0.752032361770083,
 'bagging_freq': 11,
 'min_data_in_leaf': 74,
 'max_depth': -1,
 'lambda_l1': 4.094647896963407,
 'lambda_l2': 7.310837913970018,
 'min_gain_to_split': 0.004067559169870071,
 'n_jobs': -1
}
# best_params['n_estimators'] = 500

models_lgb = []
lgbm_OOF = np.zeros(len(x))
lgbm_preds = np.zeros(len(test))

for train_idx, val_idx in folds_train.split(x):
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    model = LGBMRegressor(**best_params)
    model.fit(
        x_train, y_train, 
        eval_set=[(x_val, y_val)],
        eval_metric='rmse',
        callbacks=[
            early_stopping(100),
            log_evaluation(50)
        ])

    lgbm_OOF[val_idx] += model.predict(x_val)
    lgbm_preds += model.predict(test) / folds_train.n_splits
    models_lgb.append(model)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3171
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 67
[LightGBM] [Info] Start training from score 6.594502
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 1.04741	valid_0's l2: 1.09706
[100]	valid_0's rmse: 1.04715	valid_0's l2: 1.09651
[150]	valid_0's rmse: 1.04713	valid_0's l2: 1.09648
[200]	valid_0's rmse: 1.04724	valid_0's l2: 1.09671
Early stopping, best iteration is:
[143]	valid_0's rmse: 1.0471	valid_0's l2: 1.09643
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050776 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [

In [13]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, lgbm_OOF)))

Validation RMSE: 1.046063550028496


## CatBoost

In [None]:
# %%time
# def objective(trial):
#     params = {
#         "iterations": 300,
#         "loss_function": "RMSE",
#         "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1),
#         "depth": trial.suggest_int("depth", 3, 12),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 10.0),
#         "bagging_temperature": trial.suggest_float("bagging_temperature", 1e-3, 1.0),
#         "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0),
#         "border_count": trial.suggest_int("border_count", 32, 255),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.6, 1.0),
#         "verbose": 50,
#         "random_seed": SEED,
#     }

#     scores = []
#     for train_idx, val_idx in folds.split(x):
#         x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
#         y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

#         model = CatBoostRegressor(**params)
#         model.fit(
#             x_train, y_train,
#             eval_set=(x_val, y_val),
#             early_stopping_rounds=50,
#         )
#         preds = model.predict(x_val)
#         score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
#         scores.append(score)

#     return np.mean(scores)

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=5)

# best_params = study.best_params
# best_params

In [18]:
%%time

best_params = {
 'iterations':1000,
 'loss_function': 'RMSE',
 'learning_rate': 0.09295757892732069,
 'depth': 7,
 'l2_leaf_reg': 2.8780706448862734,
 'bagging_temperature': 0.12215801350190825,
 'random_strength': 8.553048856390589,
 'border_count': 232,
 'colsample_bylevel': 0.7252465177667906,
 'verbose': 50,
 'random_seed': SEED,
}

# best_params['iterations'] = 500

models_cat = []
cat_OOF = np.zeros(len(x))
cat_preds = np.zeros(len(test))

for train_idx, val_idx in folds_train.split(x):
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    model = CatBoostRegressor(
        **best_params,
    )

    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        early_stopping_rounds=100,
    )

    cat_OOF[val_idx] += model.predict(x_val)
    cat_preds += model.predict(test) / folds_train.n_splits
    models_cat.append(model)

0:	learn: 1.0909730	test: 1.0927722	best: 1.0927722 (0)	total: 66.2ms	remaining: 1m 6s
50:	learn: 1.0572052	test: 1.0593770	best: 1.0593770 (50)	total: 3.64s	remaining: 1m 7s
100:	learn: 1.0545232	test: 1.0571008	best: 1.0571008 (100)	total: 6.98s	remaining: 1m 2s
150:	learn: 1.0530146	test: 1.0560597	best: 1.0560597 (150)	total: 10.1s	remaining: 57s
200:	learn: 1.0477474	test: 1.0509845	best: 1.0509845 (200)	total: 13.2s	remaining: 52.6s
250:	learn: 1.0460310	test: 1.0498070	best: 1.0498070 (250)	total: 16.6s	remaining: 49.4s
300:	learn: 1.0448203	test: 1.0493980	best: 1.0493980 (300)	total: 19.9s	remaining: 46.2s
350:	learn: 1.0437929	test: 1.0491639	best: 1.0491619 (344)	total: 23.3s	remaining: 43.1s
400:	learn: 1.0427509	test: 1.0489238	best: 1.0489188 (397)	total: 26.8s	remaining: 40s
450:	learn: 1.0417748	test: 1.0487576	best: 1.0487576 (450)	total: 30.2s	remaining: 36.8s
500:	learn: 1.0409085	test: 1.0487307	best: 1.0487204 (494)	total: 33.6s	remaining: 33.5s
550:	learn: 1.04007

In [19]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, cat_OOF)))

Validation RMSE: 1.0475759976811865


## XGBoost

In [20]:
from xgboost import set_config

# 전역 설정 초기화
set_config(verbosity=3)  # 허용 범위 내 값 설정

In [None]:
# %%time

# def objective(trial):
#     params = {
#         "objective": "reg:squarederror",
#         "eval_metric": "rmse",
#         "booster": "gbtree",
#         "eta": trial.suggest_float("eta", 1e-4, 1e-1, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "lambda": trial.suggest_float("lambda", 1e-4, 10.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-4, 10.0, log=True),
#         "gamma": trial.suggest_float("gamma", 0.001, 0.1),
#         "seed": SEED,
#         "verbosity": 3
#     }

#     scores = []
#     for train_idx, val_idx in folds.split(x):
#         x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
#         y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

#         model = XGBRegressor(**params)
#         # early_stop = callback.EarlyStopping(rounds=50, metric_name='rmse', save_best=True)
#         model.fit(
#             x_train, y_train,
#             eval_set=[(x_val, y_val)],
#             # callbacks=[early_stop],
#             verbose=True
#         )
#         preds = model.predict(x_val)
#         score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
#         scores.append(score)

#     return np.mean(scores)

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=5)

# best_params = study.best_params
# best_params

In [22]:
%%time
best_params = {
 "objective": "reg:squarederror",
 "eval_metric": "rmse",
 'booster': 'gbtree',
 'eta': 0.026406022486331556,
 'max_depth': 8,
 'min_child_weight': 9,
 'subsample': 0.9450644777657322,
 'colsample_bytree': 0.7330569749023635,
 'lambda': 0.3878173140428721,
 'alpha': 3.8929025064300062,
 'gamma': 0.0934054158792672,
 "seed": SEED,
 "verbosity": 3
 }

# best_params['iterations'] = 500

folds_train = KFold(n_splits=5, shuffle=True, random_state=SEED)
models_cat = []
xgb_OOF = np.zeros(len(x))
xgb_preds = np.zeros(len(test))

for train_idx, val_idx in folds_train.split(x):
    x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
    y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

    model = XGBRegressor(
        **best_params,
        loss_function="RMSE",
        random_seed=SEED,
    )

    model.fit(
        x_train, y_train,
        eval_set=[(x_val, y_val)],
        # early_stopping_rounds=100,
        verbose=True
    )

    xgb_OOF[val_idx] += model.predict(x_val)
    xgb_preds += model.predict(test) / folds_train.n_splits
    models_cat.append(model)

[11:12:26] AllReduce: 0.016872s, 1 calls @ 16872us

[11:12:26] MakeCuts: 0.01726s, 1 calls @ 17260us

[11:12:27] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 0
[0]	validation_0-rmse:1.09523
[1]	validation_0-rmse:1.09369
[2]	validation_0-rmse:1.09140
[3]	validation_0-rmse:1.08973
[4]	validation_0-rmse:1.08757
[5]	validation_0-rmse:1.08607
[6]	validation_0-rmse:1.08466
[7]	validation_0-rmse:1.08276
[8]	validation_0-rmse:1.08182
[9]	validation_0-rmse:1.08007
[10]	validation_0-rmse:1.07885
[11]	validation_0-rmse:1.07757
[12]	validation_0-rmse:1.07603
[13]	validation_0-rmse:1.07457
[14]	validation_0-rmse:1.07342
[15]	validation_0-rmse:1.07209
[16]	validation_0-rmse:1.07083
[17]	validation_0-rmse:1.06997
[18]	validation_0-rmse:1.06882
[19]	validation_0-rmse:1.06797
[20]	validation_0-rmse:1.06692
[21]	validation_0-rmse:1.06602
[22]	validation_0-rmse:1.06506
[23]	validation_0-rmse:1.06434
[24]	validation_0-rmse:1.06367
[25]	validation_0-rmse:1.06332
[26]	validation_0-rmse:1.0628

In [23]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log, xgb_OOF)))

Validation RMSE: 1.0470646013549936


## ExtraTrees

In [24]:
train_nan, test_nan, all_df_nan = prep_nan()

In [25]:
x_nan = train_nan.drop('Premium Amount', axis=1)
y_nan = train_nan['Premium Amount']

y_log_nan = np.log1p(y_nan)

In [None]:
# %%time

# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 150, step=50), 
#         "max_depth": trial.suggest_int("max_depth", 3, 15),
#         "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
#         "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
#         "max_features": trial.suggest_float("max_features", 0.4, 0.9),  
#         "bootstrap": False,  
#         "random_state": SEED,
#         "n_jobs": -1
#     }

#     scores = []

#     # x_sample = x_nan.sample(frac=0.5, random_state=SEED)
#     # y_sample = y_log_nan.loc[x_sample.index]
#     folds_opt = KFold(n_splits=3, shuffle=True, random_state=SEED)

#     # for train_idx, val_idx in folds_opt.split(x_nan):
#     for fold, (train_idx, val_idx) in tqdm(enumerate(folds_opt.split(x_nan)), total=folds_opt.get_n_splits()):
#         x_train, x_val = x_nan.iloc[train_idx], x_nan.iloc[val_idx]
#         y_train, y_val = y_log_nan.iloc[train_idx], y_log_nan.iloc[val_idx]

#         model = ExtraTreesRegressor(**params)
#         model.fit(x_train, y_train)

#         preds = model.predict(x_val)
#         score = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(preds)))
#         scores.append(score)
#         print(f"Fold {fold + 1} RMSE: {score:.4f}")

#     return np.mean(scores)

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=5)

# best_params = study.best_params
# best_params

In [28]:
%%time

best_params = {
    "n_estimators": 200,
    "max_depth": 13,
    "min_samples_split": 4,
    "min_samples_leaf": 2,
    "max_features": 0.7460208372574245,
    "bootstrap": False,  
    "random_state": SEED,
    "n_jobs": -1
}

# best_params['n_estimators'] = 200

et_OOF = np.zeros(len(x_nan)) 
et_preds = np.zeros(len(test_nan)) 
models_et = []

for fold, (train_idx, val_idx) in tqdm(enumerate(folds_train.split(x_nan)), total=folds_train.get_n_splits()):
    x_train, x_val = x_nan.iloc[train_idx], x_nan.iloc[val_idx]
    y_train, y_val = y_log_nan.iloc[train_idx], y_log_nan.iloc[val_idx]

    model = ExtraTreesRegressor(
        **best_params,
    )

    model.fit(x_train, y_train)

    et_OOF[val_idx] += model.predict(x_val)
    et_preds += model.predict(test_nan) / folds_train.n_splits
    models_et.append(model)

100%|██████████| 5/5 [44:07<00:00, 529.47s/it]

CPU times: user 2h 44min 20s, sys: 15.6 s, total: 2h 44min 36s
Wall time: 44min 7s





In [29]:
print("Validation RMSE:", np.sqrt(mean_squared_error(y_log_nan, et_OOF)))

Validation RMSE: 1.0564255745197302


# Blending

In [64]:
%%time

def objective(trial):
    w1 = trial.suggest_float('w1', 0.0, 1.0)
    w2 = trial.suggest_float('w2', 0.0, 1.0)
    w3 = trial.suggest_float('w3', 0.0, 1.0)
    w4 = 1.0 - (w1 + w2 + w3)

    if w4 < 0 or w4 > 1:
        return float('inf')
    
    if w3 < 0 or w3 > 1:
        return float('inf')
    
    ensemble_vote = (w1 * lgbm_OOF) + (w2 * cat_OOF) + (w3 * xgb_OOF) + (w4 * et_OOF)
    rmse = np.sqrt(mean_squared_error(y_log, ensemble_vote))
    
    return rmse

study_vote = optuna.create_study(direction='minimize')
study_vote.optimize(objective, n_trials=100)

# 최적 가중치 및 RMSE 출력
print(f"Best Weights: {study_vote.best_params}")
print(f"Best RMSE: {study_vote.best_value:.4f}")

[I 2024-12-30 09:20:41,772] A new study created in memory with name: no-name-5b10aabb-a981-4afe-be28-f95eb6e37e6a
[I 2024-12-30 09:20:41,774] Trial 0 finished with value: inf and parameters: {'w1': 0.2037856966824516, 'w2': 0.7586672677831224, 'w3': 0.3344838666132287}. Best is trial 0 with value: inf.
[I 2024-12-30 09:20:41,776] Trial 1 finished with value: inf and parameters: {'w1': 0.897955597075071, 'w2': 0.4237314604983533, 'w3': 0.024950591118173437}. Best is trial 0 with value: inf.
[I 2024-12-30 09:20:41,794] Trial 2 finished with value: 1.0465886565065712 and parameters: {'w1': 0.21731252233180287, 'w2': 0.16264113226265475, 'w3': 0.5505217662516302}. Best is trial 2 with value: 1.0465886565065712.
[I 2024-12-30 09:20:41,795] Trial 3 finished with value: inf and parameters: {'w1': 0.521432195365425, 'w2': 0.8000810201086843, 'w3': 0.9619952172166097}. Best is trial 2 with value: 1.0465886565065712.
[I 2024-12-30 09:20:41,796] Trial 4 finished with value: inf and parameters: {'

Best Weights: {'w1': 0.8418197076926294, 'w2': 0.09519922121859947, 'w3': 0.049848592365118696}
Best RMSE: 1.0460
CPU times: user 1.31 s, sys: 55.3 ms, total: 1.37 s
Wall time: 1.35 s


In [71]:
best_weights = study_vote.best_params
best_weights['w4'] = 1 - best_weights['w1'] - best_weights['w2'] - best_weights['w3']
preds_exp = (best_weights['w1'] * lgbm_preds) + (best_weights['w2'] * cat_preds) + (best_weights['w3'] * xgb_preds) + (best_weights['w4'] * et_preds)
preds = np.expm1(preds_exp)

# Stacking

In [50]:
OOF_std = np.std([lgbm_OOF, xgb_OOF, cat_OOF, et_OOF], axis=0)
pred_std = np.std([lgbm_preds, xgb_preds, cat_preds, et_preds], axis=0)

OOF_mean = np.mean([lgbm_OOF, xgb_OOF, cat_OOF, et_OOF], axis=0)
OOF_min = np.min([lgbm_OOF, xgb_OOF, cat_OOF, et_OOF], axis=0)
OOF_max = np.max([lgbm_OOF, xgb_OOF, cat_OOF, et_OOF], axis=0)

pred_mean = np.mean([lgbm_preds, xgb_preds, cat_preds, et_preds], axis=0)
pred_min = np.min([lgbm_preds, xgb_preds, cat_preds, et_preds], axis=0)
pred_max = np.max([lgbm_preds, xgb_preds, cat_preds, et_preds], axis=0)

stacked_train = np.column_stack((lgbm_OOF, xgb_OOF, cat_OOF, et_OOF, train_nan['transformed_Annual_Income'], train_nan['Credit Score'], OOF_std, OOF_mean, OOF_min, OOF_max))
stacked_test = np.column_stack((lgbm_preds, xgb_preds, cat_preds, et_preds, test_nan['transformed_Annual_Income'], test_nan['Credit Score'], pred_std, pred_mean, pred_min, pred_max))

In [43]:
def objective(trial):
    alpha = trial.suggest_loguniform('alpha', 1e-3, 2)
    ridge = Ridge(alpha=alpha)
    
    score = cross_val_score(ridge, stacked_train, y_log_nan, cv=5, scoring='neg_root_mean_squared_error')
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-12-30 12:26:50,227] A new study created in memory with name: no-name-7d6bbfa1-1878-409f-8dc0-8e3046cf4ad1


[I 2024-12-30 12:26:51,019] Trial 0 finished with value: -1.045705408684945 and parameters: {'alpha': 0.0059625243249216645}. Best is trial 0 with value: -1.045705408684945.
[I 2024-12-30 12:26:51,733] Trial 1 finished with value: -1.0457054221736843 and parameters: {'alpha': 1.1594567652674188}. Best is trial 0 with value: -1.045705408684945.
[I 2024-12-30 12:26:52,461] Trial 2 finished with value: -1.0457054065333868 and parameters: {'alpha': 0.0445799810134213}. Best is trial 2 with value: -1.0457054065333868.
[I 2024-12-30 12:26:53,175] Trial 3 finished with value: -1.045705408976231 and parameters: {'alpha': 0.0010317987837480335}. Best is trial 2 with value: -1.0457054065333868.
[I 2024-12-30 12:26:53,959] Trial 4 finished with value: -1.0457054084884927 and parameters: {'alpha': 0.009324676967112495}. Best is trial 2 with value: -1.0457054065333868.
[I 2024-12-30 12:26:54,701] Trial 5 finished with value: -1.0457054045965024 and parameters: {'alpha': 0.08343844740358762}. Best i

In [44]:
best_alpha = study.best_params['alpha']
print(f"Best alpha: {best_alpha}")
print(f"Best score: {study.best_value}")

Best alpha: 0.43800046250992997
Best score: -1.0457053965327976


1. -1.0457168927031868
2. -1.045713173575535
3. -1.045716496317705
4. -1.0457053965327976

In [45]:
meta_model = Ridge(alpha=best_alpha)
meta_model.fit(stacked_train, y_log_nan)

stacked_preds = meta_model.predict(stacked_test)
print(stacked_preds[:10])

[6.66749907 6.6732041  6.67855243 6.69526801 6.63341814 6.66475219
 6.87281544 6.61364164 5.37320411 6.71519926]


In [46]:
preds = np.expm1(stacked_preds)

In [53]:
np.savez('./data/pred_oof_data.npz',
         lgbm_OOF=lgbm_OOF,
         xgb_OOF=xgb_OOF,
         cat_OOF=cat_OOF,
         et_OOF=et_OOF,
         lgbm_preds=lgbm_preds,
         xgb_preds=xgb_preds,
         cat_preds=cat_preds,
         et_preds=et_preds,
         OOF_std=OOF_std,
         OOF_mean=OOF_mean,
         OOF_min=OOF_min,
         OOF_max=OOF_max,
         pred_std=pred_std,
         pred_mean=pred_mean,
         pred_min=pred_min,
         pred_max=pred_max)

In [51]:
from sklearn.neural_network import MLPRegressor

def objective(trial):
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(64,), (128,), (64, 32)])
    alpha = trial.suggest_loguniform('alpha', 1e-4, 1e-1)
    learning_rate_init = trial.suggest_loguniform('learning_rate_init', 1e-4, 1e-1)

    mlp = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        alpha=alpha,
        learning_rate_init=learning_rate_init,
        max_iter=1000,
        # early_stopping=True,  # 조기 종료 활성화
        # n_iter_no_change=100,
        random_state=SEED
    )
    
    score = cross_val_score(mlp, stacked_train, y_log_nan, cv=5, scoring='neg_root_mean_squared_error')
    return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print(f"Best score: {study.best_value}")

[I 2024-12-30 12:35:40,986] A new study created in memory with name: no-name-4dce1a77-d95d-423a-81ee-5d0dd862fb93
[I 2024-12-30 12:44:53,988] Trial 0 finished with value: -1.093189955782216 and parameters: {'hidden_layer_sizes': (128,), 'alpha': 0.001902309883123831, 'learning_rate_init': 0.03589685347393951}. Best is trial 0 with value: -1.093189955782216.
[I 2024-12-30 12:48:31,307] Trial 1 finished with value: -1.0973197475704348 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.0355846550468324, 'learning_rate_init': 0.09652935270287906}. Best is trial 0 with value: -1.093189955782216.
[I 2024-12-30 13:03:09,342] Trial 2 finished with value: -1.0595535365866153 and parameters: {'hidden_layer_sizes': (64, 32), 'alpha': 0.002234138975871941, 'learning_rate_init': 0.0001376235781803412}. Best is trial 2 with value: -1.0595535365866153.
[I 2024-12-30 13:58:12,612] Trial 3 finished with value: -1.0595774593455944 and parameters: {'hidden_layer_sizes': (64, 32), 'alpha': 0.0004680

Best score: -1.0471217891981346


In [52]:
best_params = study.best_params
print(f"Best alpha: {best_params}")
print(f"Best score: {study.best_value}")

Best alpha: {'hidden_layer_sizes': (64, 32), 'alpha': 0.009512335506083183, 'learning_rate_init': 0.00040041326023672764}
Best score: -1.0471217891981346


- best_value가 Ridge보다 낮아 본 학습 시행 X

In [None]:
# mlp_meta = MLPRegressor(
#     **best_params, 
#     max_iter=1000, 
#     early_stopping=True, 
#     n_iter_no_change=100,
#     random_state=SEED
#     )
# mlp_meta.fit(stacked_train, y_log_nan)

# final_preds = np.expm1(mlp_meta.predict(stacked_test))

# Submission

In [47]:
submission = pd.read_csv('./data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [48]:
submission['Premium Amount'] = preds
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,785.426348
1,1200001,789.925756
2,1200002,794.167218
3,1200003,807.570608
4,1200004,759.075779


In [49]:
submission.to_csv('./data/04_03.csv', index=False)
!kaggle competitions submit -c playground-series-s4e12 -f "./data/04_03.csv" -m "04_03_stacking_01"

100%|██████████████████████████████████████| 19.8M/19.8M [00:00<00:00, 46.5MB/s]
Successfully submitted to Regression with an Insurance Dataset

Public Score : 
1. 1.04477