In [72]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LinearRegression

In [73]:
import lightgbm as lgb
from lightgbm import early_stopping

In [74]:
from catboost import CatBoostRegressor

In [75]:
df = pd.read_csv('data/train.csv')
df = df.drop(columns='id')

test_df = pd.read_csv('data/test.csv')
test_df = test_df.drop(columns='id')

display(df.head())
display(test_df.head())

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [93]:
def date_transform(curr_df):
    curr_df = curr_df.copy()
    curr_df['Policy Start Date'] = pd.to_datetime(curr_df['Policy Start Date'])
    curr_df['Day'] = curr_df['Policy Start Date'].dt.day_of_week
    curr_df['Year'] = curr_df['Policy Start Date'].dt.year
    curr_df['Month'] = curr_df['Policy Start Date'].dt.month
    
    curr_df = curr_df.drop(columns='Policy Start Date')
    return curr_df

def transform_na(curr_df):
    curr_df = curr_df.copy()
    
    categorical_vars = curr_df.select_dtypes('object').columns
    curr_df[categorical_vars] = curr_df[categorical_vars].fillna('None')
    
    numeric_vars = curr_df.select_dtypes('number').columns
    curr_df[numeric_vars] = curr_df[numeric_vars].fillna(-999)
    return curr_df

# Fix dtypes and reduce mem usage
def reduce_memory_usage(df):
    """Reduce memory usage of a pandas DataFrame by downcasting numeric types."""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and col_type.name != 'category':
            if np.issubdtype(col_type, np.float64):
                df[col] = pd.to_numeric(df[col], downcast='float')
            elif np.issubdtype(col_type, np.int64):
                df[col] = pd.to_numeric(df[col], downcast='integer')
        
        elif col_type == object:
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if num_unique_values / num_total_values < 0.5:
                df[col] = df[col].astype('category')
    
    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(start_mem - end_mem) / start_mem * 100:.1f}%")
    
    return df

def combine(curr_df):
    curr_df = curr_df.copy()
    curr_df = date_transform(curr_df)
    curr_df = transform_na(curr_df)
    curr_df = reduce_memory_usage(curr_df)
    return curr_df
    
transform_df = combine(df)
transform_df.head()

Memory usage of dataframe is 187.68 MB
Memory usage after optimization is: 66.38 MB
Decreased by 64.6%


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,...,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Day,Year,Month
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,...,372.0,5.0,Poor,No,Weekly,House,2869.0,5,2023,12
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,...,694.0,2.0,Average,Yes,Monthly,House,1483.0,0,2023,6
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177547,Suburban,Premium,...,-999.0,3.0,Good,Yes,Weekly,House,567.0,5,2023,9
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,...,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,2,2024,6
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376093,Rural,Premium,...,598.0,4.0,Poor,Yes,Weekly,House,2022.0,2,2021,12


In [94]:
""" 
curr_df passed in:
    contains no NA values
    categorical columns are all 'category', numeric columns all some 'number'
"""
def generate_oof(curr_df):
    X = curr_df.drop(columns='Premium Amount')
    y = curr_df['Premium Amount']
    cat_features = list(X.select_dtypes('category').columns)
    
    k = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(X))

    for index, (train_index, test_index) in enumerate(k.split(X)):
        print(f"Fold {index}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model = lgb.LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.1,
            verbose=-1,
            device='gpu',
            gpu_platform_id=0,
            gpu_device_id=0
        )
        
        model.fit(X_train,
                  y_train,
                  categorical_feature = cat_features,
                  eval_set=[(X_test, y_test)],
                  eval_metric='rmse',
                  callbacks=[early_stopping(100)]
                 )
        
        predictions = model.predict(X_test)
        oof[test_index] = predictions
        
        print("rmsle:", np.sqrt(mean_squared_log_error(y_test, predictions)))
        
    overall_rmsle = np.sqrt(mean_squared_log_error(y, oof))
    print("final rmsle:", overall_rmsle)
    return oof

oof = generate_oof(transform_df)
oof_df = transform_df.copy()
oof_df['nonlog'] = oof.astype('float32')

Fold 0
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[311]	valid_0's rmse: 835.609	valid_0's l2: 698242
rmsle: 1.1337592701853236
Fold 1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[347]	valid_0's rmse: 834.583	valid_0's l2: 696529
rmsle: 1.1319697963371163
Fold 2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[416]	valid_0's rmse: 837.445	valid_0's l2: 701314
rmsle: 1.1300155267659253
Fold 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[413]	valid_0's rmse: 834.921	valid_0's l2: 697093
rmsle: 1.1311743624962007
Fold 4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[500]	valid_0's rmse: 836.041	valid_0's l2: 698965
rmsle: 1.1330582153343038
final rmsle: 1.1319962147452582


In [95]:
features = oof_df.columns.difference(['Premium Amount', 'nonlog'])
for col in features:
    oof_df[col] = oof_df[col].astype('category')

In [96]:
def generate_final(curr_df):
    X = curr_df.drop(columns='Premium Amount')
    y = np.log1p(curr_df['Premium Amount'])
    cat_features = list(X.select_dtypes('category').columns)
    
    k = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(X))

    for index, (train_index, test_index) in enumerate(k.split(X)):
        print(f"Fold {index}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model = CatBoostRegressor(
            iterations=3000,
            learning_rate=0.05,
            depth=6,
            eval_metric="RMSE",
            random_seed=42,
            verbose=200,
            task_type='GPU',
            l2_leaf_reg =  0.7,
        )
        
        model.fit(X_train,
                  y_train,
                  eval_set=(X_test, y_test), 
                  early_stopping_rounds=300,
                  cat_features=cat_features,
                 )
        
        predictions = model.predict(X_test)
        oof[test_index] = predictions
        
        print("rmsle:", np.sqrt(mean_squared_error(y_test, predictions)))
        
    overall_rmsle = np.sqrt(mean_squared_error(y, oof))
    print("final rmsle:", overall_rmsle)
    return oof

final = generate_final(oof_df)

Fold 0


CatBoostError: Invalid type for cat_feature category for [feature_idx=0]=-999.0 : cat_features must be integer or string, real number values and NaN values should be converted to string.