# Module

In [1]:
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
%%time

import random
import pandas as pd
import numpy as np
import polars as pl
from tqdm import tqdm
import category_encoders as ce
from IPython.display import clear_output

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import HistGradientBoostingRegressor

import optuna
import lightgbm as lgb
from lightgbm import early_stopping  
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

import warnings

warnings.filterwarnings('ignore')

CPU times: user 754 ms, sys: 648 ms, total: 1.4 s
Wall time: 739 ms


In [3]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [4]:
SEED = 2024

np.random.seed(SEED)
random.seed(SEED)

# Func

In [11]:
%%time

def load_data():    
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')    
    all_df = pd.concat([train, test], sort=False).reset_index(drop=True)
    return train, test, all_df

def split_data(df):
    train = df[~df['Premium Amount'].isnull()]
    test = df[df['Premium Amount'].isnull()]
    return train, test

def date(df):
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'])
    df['Year'] = df['Policy Start Date'].dt.year
    df['Day'] = df['Policy Start Date'].dt.day
    df['Month'] = df['Policy Start Date'].dt.month
    df['Month_name'] = df['Policy Start Date'].dt.month_name()
    df['Day_of_week'] = df['Policy Start Date'].dt.day_name()
    df['Week'] = df['Policy Start Date'].dt.isocalendar().week
    df['Year_sin'] = np.sin(2 * np.pi * df['Year'])
    df['Year_cos'] = np.cos(2 * np.pi * df['Year'])
    min_year = df['Year'].min()
    max_year = df['Year'].max()
    df['Year_sin'] = np.sin(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Year_cos'] = np.cos(2 * np.pi * (df['Year'] - min_year) / (max_year - min_year))
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12) 
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)  
    df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)
    df['Group']=(df['Year']-2020)*48+df['Month']*4+df['Day']//7    
    df.drop('Policy Start Date', axis=1, inplace=True)
    return df

def get_nan_cols(df):
    nan_cols = ['Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
    for col in nan_cols:
        col_name = col + '_NA'
        df[col_name] = df[col].isnull().astype(int)
    return df

def fill_nan_values(df):
    num_cols = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Credit Score']
    cat_cols = ['Marital Status', 'Occupation', 'Customer Feedback']
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in cat_cols:
        df[col] = df[col].fillna('missing')
    return df

def skewed(df, all_df):
    pt = PowerTransformer(method='yeo-johnson')
    pt.fit(df[['Annual Income']])
    all_df['transformed_Annual_Income'] = pt.transform(all_df[['Annual Income']])
    return all_df

def get_encoding(df):
    object_columns = df.select_dtypes(include='object').columns

    def encode_ordinal(df):
        educ = {"High School":0, "Bachelor's":1, "Master's":2, "PhD":3}
        policy = {'Basic':0, 'Comprehensive':1, 'Premium':2}
        exerc = {'Rarely':0, 'Daily':1, 'Weekly':2, 'Monthly': 3}
        feedback = {'Poor':0, 'Average':1, 'Good':2, 'missing':-1}

        df['Education Level'] = df['Education Level'].map(educ)
        df['Policy Type'] = df['Policy Type'].map(policy)
        df['Exercise Frequency'] = df['Exercise Frequency'].map(exerc)
        df['Customer Feedback'] = df['Customer Feedback'].map(feedback)
        df['Gender'] = df['Gender'].map({'Male':0, 'Female':1})
        df['Smoking Status'] = df['Smoking Status'].map({'Yes':1, 'No':0})
        return df

    def one_hot_dummies(df, categorical):
        existing_cols = [col for col in categorical if col in df.columns]
        oh = pd.get_dummies(df[existing_cols])
        df = df.drop(existing_cols, axis=1)
        return pd.concat([df, oh], axis=1)
        return df

    df = encode_ordinal(df)
    categorical_features = df.select_dtypes(include='object').columns
    df = one_hot_dummies(df, categorical_features)    
    remaining_columns = [col for col in object_columns if col in df.columns]
    df[remaining_columns] = df[remaining_columns].astype("category")
    return df

def add_new_features(df):
    df['Income to Dependents Ratio'] = df['Annual Income'] / (df['Number of Dependents'].fillna(0) + 1)
    df['Income_per_Dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['CreditScore_InsuranceDuration'] = df['Credit Score'] * df['Insurance Duration']
    
    df['Health_Risk_Score'] = df['Smoking Status'].apply(lambda x: 1 if x == 'Smoker' else 0) + \
                                df['Exercise Frequency'].apply(lambda x: 1 if x == 'Low' else (0.5 if x == 'Medium' else 0)) + \
                                (100 - df['Health Score']) / 20
    df['Credit_Health_Score'] = df['Credit Score'] * df['Health Score']
    df['Health_Age_Interaction'] = df['Health Score'] * df['Age']
    
    df['contract length'] = pd.cut(
        df["Insurance Duration"].fillna(99),  
        bins=[-float('inf'), 1, 3, float('inf')],  
        labels=[0, 1, 2]  
    ).astype(int)

    df['Claims_Bins'] = pd.cut(
        df["Previous Claims"].fillna(-1),  
        bins=[-float('inf'), -0.1, 4, float('inf')],  
        labels=[0, 1, 2]  
    ).astype('category')

    df["Annual_Income_Health_Score_Ratio"] = df["Health Score"] / df["Annual Income"]
    df["Annual_Income_Age_Ratio"] = df["Annual Income"] / df["Age"]
    df["Credit_Age"] = df["Credit Score"] / df["Age"]
    df["Vehicle_Age_Insurance_Duration"] = df["Vehicle Age"] / df["Insurance Duration"]

    return df

def normalize(df):    
    train, test = split_data(df)    
    num_cols = [col for col in test.columns if test[col].dtype == 'float64' and col != 'Premium Amount']
    scaler = StandardScaler()
    scaler.fit(train[num_cols])
    df[num_cols] = scaler.transform(df[num_cols])
    return df, scaler

def prep():
    train, test, all_df = load_data()
    all_df = get_nan_cols(all_df)
    all_df = fill_nan_values(all_df)
    all_df = skewed(train, all_df)
    train, test = split_data(all_df)
    all_df = date(all_df)
    all_df = add_new_features(all_df)
    all_df, scaler = normalize(all_df)
    all_df = get_encoding(all_df)

    all_df.drop(['Annual Income', 'Year', 'Month', 'Day'], axis=1, inplace=True)
    
    train = all_df[~all_df['Premium Amount'].isnull()]
    test = all_df[all_df['Premium Amount'].isnull()]

    train.drop('id', axis=1, inplace=True)
    test.drop(['id', 'Premium Amount'], axis=1, inplace=True)
    
    return train, test, all_df

train, test, all_df= prep()

CPU times: user 10.2 s, sys: 1.39 s, total: 11.6 s
Wall time: 11.7 s


In [12]:
train.head()

Unnamed: 0,Age,Gender,Number of Dependents,Education Level,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Premium Amount,Health Score_NA,Previous Claims_NA,Vehicle Age_NA,Credit Score_NA,Insurance Duration_NA,transformed_Annual_Income,Week,Year_sin,Year_cos,Month_sin,Month_cos,Day_sin,Day_cos,Group,Income to Dependents Ratio,Income_per_Dependent,CreditScore_InsuranceDuration,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,contract length,Claims_Bins,Annual_Income_Health_Score_Ratio,Annual_Income_Age_Ratio,Credit_Age,Vehicle_Age_Insurance_Duration,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_missing,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_missing,Location_Rural,Location_Suburban,Location_Urban,Property Type_Apartment,Property Type_Condo,Property Type_House,Month_name_April,Month_name_August,Month_name_December,Month_name_February,Month_name_January,Month_name_July,Month_name_June,Month_name_March,Month_name_May,Month_name_November,Month_name_October,Month_name_September,Day_of_week_Friday,Day_of_week_Monday,Day_of_week_Saturday,Day_of_week_Sunday,Day_of_week_Thursday,Day_of_week_Tuesday,Day_of_week_Wednesday
0,-1.648301,1,-0.746862,1,-0.249615,2,1.014724,1.286335,-1.567375,-0.007023,0,0,2,2869.0,0,0,0,0,0,-0.611037,51,-1.342615,0.462475,-0.002224,1.428703,-1.402711,-0.048826,195,-0.496656,-0.496656,-0.644098,0.249615,-0.836236,-1.010111,2,1,-0.06041,-0.366634,0.427324,0.112166,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
1,-0.159542,1,0.7335,2,-0.844115,1,-0.002736,0.420712,0.71463,-1.163391,1,1,3,1483.0,0,0,0,0,0,0.339875,24,-1.342615,0.462475,-0.002224,-1.403181,0.910948,-1.059296,169,-0.343093,-0.343093,-0.916321,0.844115,-0.539742,-0.721283,1,1,-0.083838,-0.082004,0.190798,0.850376,True,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False
2,-1.350549,0,0.7335,0,1.829208,2,-0.002736,0.766961,0.01302,-0.777935,2,1,2,567.0,0,0,0,1,0,0.140346,39,-1.342615,0.462475,-1.414833,0.012761,-0.284618,1.421239,184,-0.423667,-0.423667,-0.687354,-1.829208,1.594389,0.054797,1,1,-0.065824,0.220217,1.261313,0.471807,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False
3,-1.499425,0,-0.006681,1,-1.235845,0,-0.002736,-1.656783,-1.60281,-1.548846,0,1,1,765.0,0,0,0,0,0,2.125309,24,-0.002852,1.443798,-0.002224,-1.403181,0.910948,-1.059296,217,1.745024,1.745024,-1.505176,1.235845,-1.379179,-1.334396,0,1,-0.089362,5.887344,0.148548,-0.853185,False,True,False,False,False,False,False,True,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
4,-1.499425,0,-0.746862,1,-0.437603,2,-1.020196,-0.271786,0.034281,-0.392479,0,1,2,2022.0,0,0,0,0,0,0.563128,48,0.825167,-1.125339,-0.002224,1.428703,0.27988,1.421239,96,0.288453,0.288453,-0.337271,0.437603,-0.369223,-1.012512,2,1,-0.083539,0.998699,1.60689,-0.285331,False,False,True,False,False,True,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [13]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 74 columns):
 #   Column                            Dtype   
---  ------                            -----   
 0   id                                int64   
 1   Age                               float64 
 2   Gender                            category
 3   Number of Dependents              float64 
 4   Education Level                   category
 5   Health Score                      float64 
 6   Policy Type                       category
 7   Previous Claims                   float64 
 8   Vehicle Age                       float64 
 9   Credit Score                      float64 
 10  Insurance Duration                float64 
 11  Customer Feedback                 category
 12  Smoking Status                    category
 13  Exercise Frequency                category
 14  Premium Amount                    float64 
 15  Health Score_NA                   int64   
 16  Previous Claims_NA

In [15]:
train.shape, test.shape

((1200000, 73), (800000, 72))

# AbdML

In [16]:
import os
import sys

sys.path.append(os.path.abspath("../AbdML"))

from main import AbdBase

In [17]:
cat_c = [col for col in train.columns if train[col].dtype=='category']
cat_c

['Gender',
 'Education Level',
 'Policy Type',
 'Customer Feedback',
 'Smoking Status',
 'Exercise Frequency',
 'Claims_Bins']

In [18]:
n_splits = 10

base = AbdBase(train_data=train, test_data=test, target_column='Premium Amount', gpu=False,
                 problem_type="regression", metric="rmsle", seed=SEED,
                 n_splits=n_splits, early_stop=True, num_classes=0, cat_features = cat_c,
                 fold_type='RKF')

[31m*** AbdBase ['V_1.3'] ***

[31m *** Available Settings *** 

[31mAvailable Models: [36mLGBM, [36mCAT, [36mXGB, [36mVoting, [36mTABNET
[31mAvailable Metrics: [36mroc_auc, [36maccuracy, [36mf1, [36mprecision, [36mrecall, [36mrmse, [36mwmae, [36mrmsle, [36mmae, [36mr2, [36mmse
[31mAvailable Problem Types: [36mclassification, [36mregression
[31mAvailable Fold Types: [36mSKF, [36mKF, [36mGKF, [36mGSKF, [36mRKF
[31m
 *** Configuration *** 

[31mProblem Type Selected: [36mREGRESSION
[31mMetric Selected: [36mRMSLE
[31mFold Type Selected: [36mRKF
[31mCalculate Train Probabilities: [36mFalse
[31mCalculate Test Probabilities: [36mFalse
[31mEarly Stopping: [36mTrue
[31mGPU: [36mFalse


## LGBM

In [20]:
%%time

# Params = lgb_study.best_params
# Params['estimators'] = 500

Params = {
 'estimators': 500,
 'boosting_type': 'gbdt',
 'num_leaves': 118,
 'learning_rate': 0.07114476009343425,
 'feature_fraction': 0.8720536237417198,
 'bagging_fraction': 0.8146346649119967,
 'bagging_freq': 6,
 'min_data_in_leaf': 24,
 'max_depth': -1,
 'lambda_l1': 2.1423021757741068,
 'lambda_l2': 0.10129197956845731,
 'min_gain_to_split': 0.02607024758370768,
 'n_jobs': -1
}

results_lgb = base.Train_ML(Params,'LGBM', e_stop=100, y_log=True)

Training Folds: 100%|██████████| 10/10 [02:31<00:00, 15.12s/it]

Overall Train RMSLE: 1.0386
Overall OOF RMSLE: 1.0465 
CPU times: user 7min 18s, sys: 4.08 s, total: 7min 22s
Wall time: 2min 31s





In [21]:
results_lgb

# 1. OOF 에측결과, 2. test 예측결과, 3. 최종학습모델 4.fold별 모델 리스트, 5. OOF 스코어, 6. 학습데이터 스코어

(array([976.85034927, 743.8342066 , 827.08949115, ..., 182.6150773 ,
        783.68950323, 256.05357109]),
 array([845.33939634, 785.40054164, 801.17836418, ..., 799.66936097,
        804.03019904, 789.07615358]),
 LGBMRegressor(bagging_fraction=0.8146346649119967, bagging_freq=6, device='cpu',
               estimators=500, feature_fraction=0.8720536237417198,
               lambda_l1=2.1423021757741068, lambda_l2=0.10129197956845731,
               learning_rate=0.07114476009343425, min_data_in_leaf=24,
               min_gain_to_split=0.02607024758370768, n_jobs=-1, num_leaves=118,
               random_state=2024, verbose=-1),
 [LGBMRegressor(bagging_fraction=0.8146346649119967, bagging_freq=6, device='cpu',
                estimators=500, feature_fraction=0.8720536237417198,
                lambda_l1=2.1423021757741068, lambda_l2=0.10129197956845731,
                learning_rate=0.07114476009343425, min_data_in_leaf=24,
                min_gain_to_split=0.02607024758370768, n_job

## CatBoost

In [19]:
%%time

Params = {
 'iterations': 500,
 'learning_rate': 0.0396760507705299,
 'depth': 9,
 'l2_leaf_reg': 0.3470266988650412,
 'bagging_temperature': 0.00115279871282324,
 'random_strength': 7.579479953348009,
 'border_count': 218,
 'colsample_bylevel': 0.6687417180293094,
 }

results_cat = base.Train_ML(Params,'CAT', e_stop=100, y_log=True)

Training Folds: 100%|██████████| 10/10 [30:10<00:00, 181.07s/it]

Overall Train RMSLE: 1.0457
Overall OOF RMSLE: 1.0480 
CPU times: user 1h 41min 44s, sys: 1min 59s, total: 1h 43min 44s
Wall time: 30min 10s





> .
- Overall Train MSE: 693424.5694
- Overall OOF MSE: 700727.2125
> ..
- Overall Train MSE: 697429.8053
- Overall OOF MSE: 700727.3238 

In [55]:
%%time

# Params = lgb_study.best_params
# Params['estimators'] = 500

Params = {
 'estimators': 500,
 'boosting_type': 'gbdt',
 'num_leaves': 118,
 'learning_rate': 0.07114476009343425,
 'feature_fraction': 0.8720536237417198,
 'bagging_fraction': 0.8146346649119967,
 'bagging_freq': 6,
 'min_data_in_leaf': 24,
 'max_depth': -1,
 'lambda_l1': 2.1423021757741068,
 'lambda_l2': 0.10129197956845731,
 'min_gain_to_split': 0.02607024758370768,
 'n_jobs': -1
}

results_lgb = base.Train_ML(Params,'LGBM', e_stop=100, y_log=True)

Training Folds: 100%|██████████| 10/10 [02:18<00:00, 13.90s/it]

Overall Train RMSLE: 1.0376
Overall OOF RMSLE: 1.0461 
CPU times: user 6min 48s, sys: 8.46 s, total: 6min 56s
Wall time: 2min 19s





# Submission

In [22]:
submission = pd.read_csv('./data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [23]:
submission['Premium Amount'] = results_lgb[1]
submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,722.606361
1,1200001,789.283846
2,1200002,790.554213
3,1200003,802.498971
4,1200004,764.0532


In [24]:
submission.to_csv('./data/05_02_Blending.csv', index=False)
!kaggle competitions submit -c playground-series-s4e12 -f "./data/05_02_Blending.csv" -m "05_02_Blending"

100%|██████████████████████████████████████| 19.8M/19.8M [00:00<00:00, 36.1MB/s]
Successfully submitted to Regression with an Insurance Dataset

> **Public Score Comparison**

- **Baseline Model:**
  - **Public Score:** 1.04849  
  - **Rank:** 498 / 1653 (30.12%)  

- **Second Model (Feature Engineering + PowerTransformer):**
  - **Public Score:** 1.04506  
  - **Rank:** 334 / 1693 (19.72%)

- **NaN (NA col + No imputer):**
  - **Public Score:** 1.04496  
  - **Rank:** 378 / 1895 (19.94%)

- **Ensemble(lgbm + xgb + catboost):**
  - **Public Score:** 1.04475  
  - **Rank:** 346 / 1906 (18.15%)

- **AbdML(RKF) + Ensemble(lgbm + catboost):**
  - **Public Score:** 1.04473  
  - **Rank:** 357 / 1951 (18.29%)