# Module

> LGBM
- standard : train - 1.0376 / OOF - 1.0461
- robuast : train - 1.0383 / OOF - 1.0461
- frequency encoding : train - 1.0381 / OOF - 1.0464
- cat_feature(small) : train - 1.0381 / OOF - 1.0465

In [1]:
!chmod 600 ~/.kaggle/kaggle.json

In [34]:
%%time

import random
import pandas as pd
import numpy as np
import polars as pl
from tqdm import tqdm
import category_encoders as ce
from IPython.display import clear_output

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import HistGradientBoostingRegressor

import optuna
import lightgbm as lgb
from lightgbm import early_stopping  
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

import warnings

warnings.filterwarnings('ignore')

CPU times: user 128 µs, sys: 4 µs, total: 132 µs
Wall time: 134 µs


In [37]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [4]:
SEED = 2024

np.random.seed(SEED)
random.seed(SEED)

# Func

In [67]:
%%time

def load_data():    
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')    
    all_df = pd.concat([train, test], sort=False).reset_index(drop=True)
    return train, test, all_df

def split_data(df):
    train = df[~df['Premium Amount'].isnull()]
    test = df[df['Premium Amount'].isnull()]
    return train, test

def skewed(df, all_df):
    pt = PowerTransformer(method='yeo-johnson')
    pt.fit(df[['Annual Income']])
    all_df['transformed_Annual_Income'] = pt.transform(all_df[['Annual Income']])
    return all_df

def fill_nan_values(df):
    num_cols = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Credit Score']
    cat_cols = ['Marital Status', 'Occupation', 'Customer Feedback']
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in cat_cols:
        df[col] = df[col].fillna('missing')
    return df

def date(df):
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
    df['Year'] = df['Policy Start Date'].dt.year
    df['Day'] = df['Policy Start Date'].dt.day
    df['Month'] = df['Policy Start Date'].dt.month
    df['Month_name'] = df['Policy Start Date'].dt.month_name()
    df['Day_of_week'] = df['Policy Start Date'].dt.day_name()
    df['Week'] = df['Policy Start Date'].dt.isocalendar().week
    df['Year_sin'] = np.sin(2 * np.pi * df['Year'])
    df['Year_cos'] = np.cos(2 * np.pi * df['Year'])
    min_year = df['Year'].min()
    max_year = df['Year'].max()
    df['Year_sin'] = np.sin(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Year_cos'] = np.cos(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12) 
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)  
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)
    df['Group']=(df['Year']-2020)*48+df['Month']*4+df['Day']//7    
    df.drop('Policy Start Date', axis=1, inplace=True)
    return df

def normalize(df):
    num_cols = ['Age', 'transformed_Annual_Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', \
        'Income to Dependents Ratio', 'Income_per_Dependent', 'CreditScore_InsuranceDuration', 'Health_Risk_Score', 'Credit_Health_Score', 'Health_Age_Interaction', 'contract length']
    scaler = StandardScaler()    
    df[num_cols] = scaler.fit_transform(df[num_cols])
    return df, scaler

def get_encoding(df):
    def encode_ordinal(df):
        educ = {"High School":0, "Bachelor's":1, "Master's":2, "PhD":3}
        policy = {'Basic':0, 'Comprehensive':1, 'Premium':2}
        exerc = {'Rarely':0, 'Daily':1, 'Weekly':2, 'Monthly': 3}
        feedback = {'Poor':0, 'Average':1, 'Good':2, "Unknown": 0}

        df['Education Level'] = df['Education Level'].map(educ)
        df['Policy Type'] = df['Policy Type'].map(policy)
        df['Exercise Frequency'] = df['Exercise Frequency'].map(exerc)
        df['Customer Feedback'] = df['Customer Feedback'].map(feedback)
        df['Gender'] = df['Gender'].map({'Male':0, 'Female':1})
        df['Smoking Status'] = df['Smoking Status'].map({'Yes':1, 'No':0})
        return df
    df = encode_ordinal(df)
    return df

def add_new_features(df):
    df['Income to Dependents Ratio'] = df['Annual Income'] / (df['Number of Dependents'].fillna(0) + 1)
    df['Income_per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['CreditScore_InsuranceDuration'] = df['Credit Score'] * df['Insurance Duration']
    df['Health_Risk_Score'] = df['Smoking Status'].apply(lambda x: 1 if x == 'Smoker' else 0) + \
                                df['Exercise Frequency'].apply(lambda x: 1 if x == 'Low' else (0.5 if x == 'Medium' else 0)) + \
                                (100 - df['Health Score']) / 20
    df['Credit_Health_Score'] = df['Credit Score'] * df['Health Score']
    df['Health_Age_Interaction'] = df['Health Score'] * df['Age']
    df['contract length'] = pd.cut(
        df["Insurance Duration"].fillna(99),  
        bins=[-float('inf'), 1, 3, float('inf')],  
        labels=[0, 1, 2]  
    ).astype(int)

    return df

def prep():
    train, test, all_df = load_data()
    all_df = fill_nan_values(all_df)
    all_df = skewed(train, all_df)
    train, test = split_data(all_df)
    all_df = date(all_df)
    all_df = get_encoding(all_df)
    all_df = add_new_features(all_df)
    all_df, scaler = normalize(all_df)

    all_df.drop(['Annual Income'], axis=1, inplace=True)
    
    train = all_df[~all_df['Premium Amount'].isnull()]
    test = all_df[all_df['Premium Amount'].isnull()]

    train.drop('id', axis=1, inplace=True)
    test.drop(['id', 'Premium Amount'], axis=1, inplace=True)
    
    return train, test, all_df, scaler

train, test, all_df, scaler = prep()

CPU times: user 8.95 s, sys: 1.69 s, total: 10.6 s
Wall time: 10.7 s


In [60]:
all_df.head()

Unnamed: 0,id,Age,Gender,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,transformed_Annual_Income,Year,Day,Month,Month_name,Day_of_week,Week,Year_sin,Year_cos,Month_sin,Month_cos,Day_sin,Day_cos,Group,Income to Dependents Ratio,Income_per_Dependent,CreditScore_InsuranceDuration,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,contract length
0,0,-1.648144,1,Married,-0.747146,1,Self-Employed,-0.24956,Urban,2,1.216065,1.286552,-1.566552,-0.007136,0.0,0,2,House,2869.0,-0.611507,2023,23,12,December,Saturday,51,-0.9510565,0.309017,-2.449294e-16,1.0,-0.998717,-0.050649,195,-0.497167,-0.497167,-0.644098,0.24956,-0.836328,-1.009994,0.64693
1,1,-0.159286,1,Divorced,0.734098,2,missing,-0.844005,Rural,1,-0.003025,0.42069,0.714332,-1.163606,1.0,1,3,House,1483.0,0.339158,2023,12,6,June,Monday,24,-0.9510565,0.309017,1.224647e-16,-1.0,0.651372,-0.758758,169,-0.343561,-0.343561,-0.916301,0.844005,-0.539777,-0.721079,-0.809527
2,2,-1.350373,0,Divorced,0.734098,0,Self-Employed,1.829065,Suburban,2,-0.003025,0.767034,0.013066,-0.778116,2.0,1,2,House,567.0,0.139681,2023,30,9,September,Saturday,39,-0.9510565,0.309017,-1.0,-1.83697e-16,-0.201299,0.97953,184,-0.424158,-0.424158,-0.687351,-1.829065,1.594762,0.055236,-0.809527
3,3,-1.499259,0,Married,-0.006524,1,missing,-1.235698,Rural,0,-0.003025,-1.65738,-1.601969,-1.549096,0.0,1,1,Apartment,765.0,2.124129,2024,12,6,June,Wednesday,24,-2.449294e-16,1.0,1.224647e-16,-1.0,0.651372,-0.758758,217,1.745127,1.745127,-1.505115,1.235698,-1.379374,-1.334377,-2.265985
4,4,-1.499259,0,Single,-0.747146,1,Self-Employed,-0.437531,Rural,2,-1.222116,-0.272,0.034317,-0.392626,0.0,1,2,House,2022.0,0.562353,2021,1,12,December,Wednesday,48,0.5877853,-0.809017,-2.449294e-16,1.0,0.201299,0.97953,96,0.288157,0.288157,-0.337292,0.437531,-0.369225,-1.012396,0.64693


In [61]:
train.shape, test.shape

((1200000, 40), (800000, 39))

# AbdML

In [21]:
import os
import sys

sys.path.append(os.path.abspath("../AbdML"))

from main import AbdBase

In [63]:
cat_c = [col for col in train.columns if train[col].dtype=='object']
cat_c

['Marital Status',
 'Occupation',
 'Location',
 'Property Type',
 'Month_name',
 'Day_of_week']

In [68]:
def update(df):
    
    global cat_c

    for c in cat_c:
        df[c] = df[c].fillna('None').astype('category')
                
    return df

train = update(train)
test = update(test)

## CatBoost

In [69]:
n_splits = 10

cat_base = AbdBase(train_data=train, test_data=test, target_column='Premium Amount', gpu=False,
                 problem_type="regression", metric="mse", seed=SEED,
                 n_splits=n_splits, early_stop=True, num_classes=0, cat_features = cat_c,
                 fold_type='RKF')

[31m*** AbdBase ['V_1.3'] ***

[31m *** Available Settings *** 

[31mAvailable Models: [36mLGBM, [36mCAT, [36mXGB, [36mVoting, [36mTABNET
[31mAvailable Metrics: [36mroc_auc, [36maccuracy, [36mf1, [36mprecision, [36mrecall, [36mrmse, [36mwmae, [36mrmsle, [36mmae, [36mr2, [36mmse
[31mAvailable Problem Types: [36mclassification, [36mregression
[31mAvailable Fold Types: [36mSKF, [36mKF, [36mGKF, [36mGSKF, [36mRKF
[31m
 *** Configuration *** 

[31mProblem Type Selected: [36mREGRESSION
[31mMetric Selected: [36mMSE
[31mFold Type Selected: [36mRKF
[31mCalculate Train Probabilities: [36mFalse
[31mCalculate Test Probabilities: [36mFalse
[31mEarly Stopping: [36mTrue
[31mGPU: [36mFalse


In [70]:
%%time

param_space = {
    "iterations":200,
    "learning_rate": (1e-4, 1e-1),
    "depth": (3, 12),
    "l2_leaf_reg": (1e-4, 10.0),
    "bagging_temperature": (1e-3, 1.0),
    "random_strength": (1e-3, 10.0),
    "border_count": (32, 255),
    "colsample_bylevel": (0.6, 1.0),
}

cat_study = cat_base.RUN_OPTUNA(
    MODEL_NAME="CAT",
    PARAMS=param_space,
    DIRECTION='minimize',
    TRIALS=5,
    ENABLE_PRUNER=True,              # Early termination of ineffective attempts
    PRUNER_PARAMS={'n_startup_trials': 3, 'n_warmup_steps': 3, 'interval_steps': 3},
    y_log=False
)

cat_study.best_params

[I 2024-12-27 13:54:21,410] A new study created in memory with name: no-name-495002c8-4f1e-4340-9e85-5231c76539bd
Training Folds: 100%|██████████| 10/10 [45:40<00:00, 274.06s/it]
[I 2024-12-27 14:40:02,168] Trial 0 finished with value: 727038.2209 and parameters: {'learning_rate': 0.0013292918943162175, 'depth': 12, 'l2_leaf_reg': 0.4570563099801455, 'bagging_temperature': 0.06251373574521749, 'random_strength': 0.004207988669606638, 'border_count': 66, 'colsample_bylevel': 0.6180690932801379}. Best is trial 0 with value: 727038.2209.
Training Folds: 100%|██████████| 10/10 [21:38<00:00, 129.86s/it]
[I 2024-12-27 15:01:40,931] Trial 1 finished with value: 723463.6337 and parameters: {'learning_rate': 0.0396760507705299, 'depth': 9, 'l2_leaf_reg': 0.3470266988650412, 'bagging_temperature': 0.00115279871282324, 'random_strength': 7.579479953348009, 'border_count': 218, 'colsample_bylevel': 0.6687417180293094}. Best is trial 1 with value: 723463.6337.
Training Folds: 100%|██████████| 10/10

CPU times: user 4h 47min 51s, sys: 4min 31s, total: 4h 52min 22s
Wall time: 1h 44min 45s


{'learning_rate': 0.0396760507705299,
 'depth': 9,
 'l2_leaf_reg': 0.3470266988650412,
 'bagging_temperature': 0.00115279871282324,
 'random_strength': 7.579479953348009,
 'border_count': 218,
 'colsample_bylevel': 0.6687417180293094}

In [71]:
%%time

Params = cat_study.best_params
Params['iterations'] = 500

# Params = {
#  'iterations': 500,
#  'learning_rate': 0.0396760507705299,
#  'depth': 9,
#  'l2_leaf_reg': 0.3470266988650412,
#  'bagging_temperature': 0.00115279871282324,
#  'random_strength': 7.579479953348009,
#  'border_count': 218,
#  'colsample_bylevel': 0.6687417180293094,
#  }

results_cat = cat_base.Train_ML(Params,'CAT', e_stop=100, y_log=False)

Training Folds: 100%|██████████| 10/10 [38:17<00:00, 229.71s/it]

Overall Train MSE: 700538.5335
Overall OOF MSE: 702809.6251 
CPU times: user 1h 49min 24s, sys: 2min 12s, total: 1h 51min 36s
Wall time: 38min 17s





## non log featrue

In [73]:
%%time

train['nonlog'] = results_cat[0]
test['nonlog'] = results_cat[1]

scaler_nonlog = StandardScaler()
train['nonlog'] = scaler_nonlog.fit_transform(train['nonlog'].values.reshape(-1, 1))
test['nonlog'] = scaler_nonlog.transform(test['nonlog'].values.reshape(-1, 1))

CPU times: user 16.2 ms, sys: 22.4 ms, total: 38.7 ms
Wall time: 378 ms


In [74]:
import joblib

joblib.dump((results_cat[0], results_cat[1]), './data/cat_non_loged_01.pkl')

['./data/cat_non_loged_01.pkl']

## LGBM

In [75]:
lgb_base = AbdBase(train_data=train, test_data=test, target_column='Premium Amount', gpu=False,
                 problem_type="regression", metric="rmsle", seed=SEED,
                 n_splits=n_splits, early_stop=True, num_classes=0, cat_features = cat_c,
                 fold_type='RKF')

[31m*** AbdBase ['V_1.3'] ***

[31m *** Available Settings *** 

[31mAvailable Models: [36mLGBM, [36mCAT, [36mXGB, [36mVoting, [36mTABNET
[31mAvailable Metrics: [36mroc_auc, [36maccuracy, [36mf1, [36mprecision, [36mrecall, [36mrmse, [36mwmae, [36mrmsle, [36mmae, [36mr2, [36mmse
[31mAvailable Problem Types: [36mclassification, [36mregression
[31mAvailable Fold Types: [36mSKF, [36mKF, [36mGKF, [36mGSKF, [36mRKF
[31m
 *** Configuration *** 

[31mProblem Type Selected: [36mREGRESSION
[31mMetric Selected: [36mRMSLE
[31mFold Type Selected: [36mRKF
[31mCalculate Train Probabilities: [36mFalse
[31mCalculate Test Probabilities: [36mFalse
[31mEarly Stopping: [36mTrue
[31mGPU: [36mFalse


In [44]:
# %%time

# param_space = {
#     'estimators': 200,
#     'boosting_type': 'gbdt',
#     'num_leaves': (10, 300),                        
#     'learning_rate': (1e-4, 1e-1),                  
#     'feature_fraction': (0.6, 1.0),                 
#     'bagging_fraction': (0.6, 1.0),                 
#     'bagging_freq': (5, 12),                        
#     'min_data_in_leaf': (10, 100),                  
#     'max_depth': (-1, 12),                          
#     'lambda_l1': (1e-4, 10.0),                      
#     'lambda_l2': (1e-4, 10.0),                      
#     'min_gain_to_split': (0.001, 0.1),
#     'n_jobs': -1
# }

# lgb_study = lgb_base.RUN_OPTUNA(
#     MODEL_NAME="LGBM",
#     PARAMS=param_space,
#     DIRECTION='minimize',
#     TRIALS=5,
#     ENABLE_PRUNER=True,              # Early termination of ineffective attempts
#     PRUNER_PARAMS={'n_startup_trials': 3, 'n_warmup_steps': 3, 'interval_steps': 3},
#     y_log=True
# )

# lgb_study.best_params

[I 2024-12-27 12:57:02,743] A new study created in memory with name: no-name-1879bd31-0bea-4769-9f71-319effed11c3
Training Folds: 100%|██████████| 10/10 [03:55<00:00, 23.53s/it]
[I 2024-12-27 13:00:58,081] Trial 0 finished with value: 1.0464 and parameters: {'num_leaves': 118, 'learning_rate': 0.07114476009343425, 'feature_fraction': 0.8720536237417198, 'bagging_fraction': 0.8146346649119967, 'bagging_freq': 6, 'min_data_in_leaf': 24, 'max_depth': -1, 'lambda_l1': 2.1423021757741068, 'lambda_l2': 0.10129197956845731, 'min_gain_to_split': 0.02607024758370768}. Best is trial 0 with value: 1.0464.
Training Folds: 100%|██████████| 10/10 [01:52<00:00, 11.23s/it]
[I 2024-12-27 13:02:50,454] Trial 1 finished with value: 1.0584 and parameters: {'num_leaves': 15, 'learning_rate': 0.0812324508558869, 'feature_fraction': 0.9179681421265244, 'bagging_fraction': 0.6687417180293094, 'bagging_freq': 6, 'min_data_in_leaf': 26, 'max_depth': 3, 'lambda_l1': 0.042051564509138675, 'lambda_l2': 0.014445251

CPU times: user 31min 57s, sys: 5.29 s, total: 32min 2s
Wall time: 11min 3s


{'num_leaves': 118,
 'learning_rate': 0.07114476009343425,
 'feature_fraction': 0.8720536237417198,
 'bagging_fraction': 0.8146346649119967,
 'bagging_freq': 6,
 'min_data_in_leaf': 24,
 'max_depth': -1,
 'lambda_l1': 2.1423021757741068,
 'lambda_l2': 0.10129197956845731,
 'min_gain_to_split': 0.02607024758370768}

In [76]:
%%time

# Params = lgb_study.best_params
# Params['estimators'] = 500

Params = {
 'estimators': 500,
 'boosting_type': 'gbdt',
 'num_leaves': 118,
 'learning_rate': 0.07114476009343425,
 'feature_fraction': 0.8720536237417198,
 'bagging_fraction': 0.8146346649119967,
 'bagging_freq': 6,
 'min_data_in_leaf': 24,
 'max_depth': -1,
 'lambda_l1': 2.1423021757741068,
 'lambda_l2': 0.10129197956845731,
 'min_gain_to_split': 0.02607024758370768,
 'n_jobs': -1
}

results_lgb = lgb_base.Train_ML(Params,'LGBM', e_stop=100, y_log=True)

Training Folds: 100%|██████████| 10/10 [03:04<00:00, 18.48s/it]

Overall Train RMSLE: 1.0355
Overall OOF RMSLE: 1.0462 





CPU times: user 8min 56s, sys: 6.7 s, total: 9min 3s
Wall time: 3min 5s


In [77]:
results_lgb

# 1. OOF 에측결과, 2. test 예측결과, 3. 최종학습모델 4.fold별 모델 리스트, 5. OOF 스코어, 6. 학습데이터 스코어

(array([965.56338689, 745.27492854, 813.61677092, ..., 185.12529166,
        765.53797643, 312.59713686]),
 array([763.41781038, 800.1242218 , 805.32200649, ..., 811.32989461,
        808.72443438, 779.18220033]),
 LGBMRegressor(bagging_fraction=0.8146346649119967, bagging_freq=6, device='cpu',
               estimators=500, feature_fraction=0.8720536237417198,
               lambda_l1=2.1423021757741068, lambda_l2=0.10129197956845731,
               learning_rate=0.07114476009343425, min_data_in_leaf=24,
               min_gain_to_split=0.02607024758370768, n_jobs=-1, num_leaves=118,
               random_state=2024, verbose=-1),
 [LGBMRegressor(bagging_fraction=0.8146346649119967, bagging_freq=6, device='cpu',
                estimators=500, feature_fraction=0.8720536237417198,
                lambda_l1=2.1423021757741068, lambda_l2=0.10129197956845731,
                learning_rate=0.07114476009343425, min_data_in_leaf=24,
                min_gain_to_split=0.02607024758370768, n_job

# Submission

In [78]:
submission = pd.read_csv('./data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [79]:
submission['Premium Amount'] = results_lgb[1]
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,763.41781
1,1200001,800.124222
2,1200002,805.322006
3,1200003,802.896055
4,1200004,766.846497


In [80]:
submission.to_csv('./data/05_01_Blending.csv', index=False)
!kaggle competitions submit -c playground-series-s4e12 -f "./data/05_01_Blending.csv" -m "05_01_Blending"

100%|██████████████████████████████████████| 19.8M/19.8M [00:00<00:00, 42.4MB/s]
Successfully submitted to Regression with an Insurance Dataset

> **Public Score Comparison**

- **Baseline Model:**
  - **Public Score:** 1.04849  
  - **Rank:** 498 / 1653 (30.12%)  

- **Second Model (Feature Engineering + PowerTransformer):**
  - **Public Score:** 1.04506  
  - **Rank:** 334 / 1693 (19.72%)

- **NaN (NA col + No imputer):**
  - **Public Score:** 1.04496  
  - **Rank:** 378 / 1895 (19.94%)

- **Ensemble(lgbm + xgb + catboost):**
  - **Public Score:** 1.04475  
  - **Rank:** 346 / 1906 (18.15%)

- **AbdML(RKF) + Ensemble(lgbm + catboost):**
  - **Public Score:** 1.04473  
  - **Rank:** 357 / 1951 (18.29%)