In [97]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LinearRegression

In [98]:
import lightgbm as lgb
from lightgbm import early_stopping

In [99]:
from catboost import CatBoostRegressor

In [128]:
# Read the train/test CSVs and discard the 'id' column straight away.
df = pd.read_csv('data/train.csv').drop(columns='id')
test_df = pd.read_csv('data/test.csv').drop(columns='id')

# Preview the first rows of both frames.
display(df.head(), test_df.head())

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [135]:
def date_transform(curr_df):
    """Replace 'Policy Start Date' with three derived calendar columns.

    The column is parsed with ``pd.to_datetime`` and then dropped; in its
    place the returned frame carries:

    * 'Day'   - day of the week (``.dt.day_of_week``), not day of the month
    * 'Year'  - calendar year
    * 'Month' - calendar month

    The input frame is not modified; a new DataFrame is returned.
    """
    start_dates = pd.to_datetime(curr_df['Policy Start Date'])
    without_date = curr_df.drop(columns='Policy Start Date')
    return without_date.assign(
        Day=start_dates.dt.day_of_week,
        Year=start_dates.dt.year,
        Month=start_dates.dt.month,
    )

def transform_na(curr_df):
    """Fill missing values with sentinels, working on a copy of the input.

    Columns selected by ``select_dtypes('object')`` get the string 'None';
    columns selected by ``select_dtypes('number')`` get -999. Columns of any
    other dtype are left as they are.
    """
    filled = curr_df.copy()

    # (dtype selector, sentinel) pairs, applied in this order.
    fill_plan = (('object', 'None'), ('number', -999))
    for dtype_selector, sentinel in fill_plan:
        selected = filled.select_dtypes(dtype_selector).columns
        filled[selected] = filled[selected].fillna(sentinel)

    return filled

# Fix dtypes and reduce mem usage
def reduce_memory_usage(df):
    """Reduce memory usage of a pandas DataFrame by downcasting numeric types.

    * float64 columns are downcast with ``pd.to_numeric(..., downcast='float')``.
    * int64 columns are downcast with ``pd.to_numeric(..., downcast='integer')``.
    * object columns whose unique/total ratio is below 0.5 become 'category'.
    * every other dtype (category, pandas extension dtypes such as 'string',
      datetimes, already-small numerics) is left untouched.

    The frame is modified IN PLACE and also returned; pass a copy if the
    original must be preserved. Memory usage before/after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

    num_total_values = len(df)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            # Guard the ratio: a zero-row frame would otherwise divide by zero.
            if num_total_values:
                num_unique_values = len(df[col].unique())
                if num_unique_values / num_total_values < 0.5:
                    df[col] = df[col].astype('category')
        elif isinstance(col_type, np.dtype):
            # np.issubdtype raises TypeError for pandas extension dtypes
            # ('string', 'category', nullable ints, ...), so it is only
            # consulted for genuine NumPy dtypes.
            if np.issubdtype(col_type, np.float64):
                df[col] = pd.to_numeric(df[col], downcast='float')
            elif np.issubdtype(col_type, np.int64):
                df[col] = pd.to_numeric(df[col], downcast='integer')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    # Avoid dividing by zero when the frame reports no memory at all.
    decrease_pct = (start_mem - end_mem) / start_mem * 100 if start_mem else 0.0
    print(f"Decreased by {decrease_pct:.1f}%")

    return df

def catboost_na(curr_df):
    """Turn every feature column into a pandas 'string' column with no missing values.

    All columns except 'Premium Amount' are cast to the 'string' dtype and
    missing entries are replaced by the literal 'None'. The input frame is
    not modified; a new DataFrame is returned.

    The cast happens BEFORE the fill: calling ``fillna('None')`` first fails
    on 'category' columns that lack a 'None' category and forces numeric
    columns through a mixed-type intermediate.
    """
    curr_df = curr_df.copy()

    cols = list(curr_df.columns.difference(['Premium Amount']))
    for col in cols:
        curr_df[col] = curr_df[col].astype('string').fillna('None')
    return curr_df

def combine(curr_df):
    """Run the preprocessing steps on a copy of ``curr_df`` and return the result.

    Order of application: ``date_transform`` -> ``transform_na`` ->
    ``reduce_memory_usage``.
    """
    working_copy = curr_df.copy()
    return reduce_memory_usage(transform_na(date_transform(working_copy)))
    
# Preprocess the training frame (date features, NA sentinels, dtype downcasting)
# and preview the first rows of the result.
transform_df = combine(df)
transform_df.head()

Memory usage of dataframe is 187.68 MB
Memory usage after optimization is: 66.38 MB
Decreased by 64.6%


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,...,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Day,Year,Month
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,...,372.0,5.0,Poor,No,Weekly,House,2869.0,5,2023,12
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,...,694.0,2.0,Average,Yes,Monthly,House,1483.0,0,2023,6
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177547,Suburban,Premium,...,-999.0,3.0,Good,Yes,Weekly,House,567.0,5,2023,9
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,...,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,2,2024,6
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376093,Rural,Premium,...,598.0,4.0,Poor,Yes,Weekly,House,2022.0,2,2021,12


In [132]:
""" 
curr_df passed in:
    contains no NA values
    categorical columns are all 'category', numeric columns all some 'number'
"""
def generate_oof(curr_df, n_splits=5, random_state=42, task_type='GPU'):
    """Return out-of-fold CatBoost predictions for 'Premium Amount' on the raw scale.

    One ``CatBoostRegressor`` is trained per fold of a shuffled ``KFold``;
    its predictions on the held-out fold, clipped at 0, are written into the
    matching positions of the returned array. The per-fold and overall RMSLE
    are printed.

    Parameters
    ----------
    curr_df : DataFrame
        Must contain 'Premium Amount'; every other column is used as a
        feature. Columns of dtype 'string' or 'category' are passed to
        CatBoost as ``cat_features``.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed used for both the fold shuffle and the CatBoost model.
    task_type : str, default 'GPU'
        Forwarded to ``CatBoostRegressor``; pass 'CPU' on machines without a GPU.

    Returns
    -------
    numpy.ndarray of shape (len(curr_df),)
        Out-of-fold predictions, each >= 0.

    Notes
    -----
    The held-out fold doubles as the early-stopping ``eval_set``, so it
    influences when training stops; the printed scores are therefore likely
    to be somewhat optimistic.
    """
    X = curr_df.drop(columns='Premium Amount')
    y = curr_df['Premium Amount']
    cat_features = list(X.select_dtypes(include=['string', 'category']).columns)
    print(cat_features)

    k = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.zeros(len(X))

    for index, (train_index, test_index) in enumerate(k.split(X)):
        print(f"Fold {index}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = CatBoostRegressor(
            iterations=1000,
            learning_rate=0.1,
            depth=6,
            eval_metric="RMSE",
            random_seed=random_state,
            verbose=200,
            task_type=task_type,
            l2_leaf_reg=0.7,
        )

        model.fit(X_train,
                  y_train,
                  eval_set=(X_test, y_test),
                  early_stopping_rounds=300,
                  cat_features=cat_features,
                  )

        # mean_squared_log_error rejects negative values, so clip at zero.
        predictions = np.maximum(0, model.predict(X_test))
        oof[test_index] = predictions

        print("rmsle:", np.sqrt(mean_squared_log_error(y_test, predictions)))

    overall_rmsle = np.sqrt(mean_squared_log_error(y, oof))
    print("final rmsle:", overall_rmsle)
    return oof

# Stage one: out-of-fold predictions on the raw target scale, attached to a
# copy of the preprocessed frame as the float32 column 'nonlog'.
oof = generate_oof(transform_df)
oof_df = transform_df.assign(nonlog=oof.astype('float32'))

['Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type', 'Day', 'Year', 'Month']
Fold 0
0:	learn: 863.0905633	test: 862.0232597	best: 862.0232597 (0)	total: 17.3ms	remaining: 17.3s
200:	learn: 844.0991436	test: 839.7056094	best: 839.7050809 (196)	total: 3.79s	remaining: 15.1s
400:	learn: 842.7443272	test: 839.1906704	best: 839.1905484 (399)	total: 7.69s	remaining: 11.5s
600:	learn: 841.7374387	test: 838.9870718	best: 838.9860954 (599)	total: 11.6s	remaining: 7.7s
800:	learn: 840.7804577	test: 838.9233990	best: 838.9213646 (794)	total: 15.5s	remaining: 3.85s
999:	learn: 839.8493732	test: 838.8570357	best: 838.8552860 (989)	total: 19.3s	remaining: 0us
bestTest = 838.855286
bestIteration = 989
Shrink model to first 990 iterations.
rmsle: 1.1415534

In [95]:
# Cast every column except the target and the stage-one 'nonlog' column to
# 'category' in a single astype call.
features = oof_df.columns.difference(['Premium Amount', 'nonlog'])
oof_df = oof_df.astype({column: 'category' for column in features})

In [134]:
def generate_final(curr_df, n_splits=5, random_state=42, task_type='GPU'):
    """Return out-of-fold CatBoost predictions of log1p('Premium Amount').

    Same fold scheme as a shuffled ``KFold(n_splits, random_state)``; one
    ``CatBoostRegressor`` is trained per fold on the log1p-transformed target
    and its held-out predictions are stored in the returned array.

    Parameters
    ----------
    curr_df : DataFrame
        Must contain 'Premium Amount'; every other column is used as a
        feature. Columns of dtype 'string' or 'category' are passed to
        CatBoost as ``cat_features``.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed used for both the fold shuffle and the CatBoost model.
    task_type : str, default 'GPU'
        Forwarded to ``CatBoostRegressor``; pass 'CPU' on machines without a GPU.

    Returns
    -------
    numpy.ndarray of shape (len(curr_df),)
        Out-of-fold predictions on the log1p scale. They are NOT transformed
        back; apply ``np.expm1`` to obtain premium amounts.

    Notes
    -----
    The printed "rmsle" is the RMSE between log1p(target) and the predictions,
    i.e. the RMSLE of the back-transformed predictions. The held-out fold is
    also the early-stopping ``eval_set``, so the scores are likely optimistic.
    """
    X = curr_df.drop(columns='Premium Amount')
    y = np.log1p(curr_df['Premium Amount'])
    cat_features = list(X.select_dtypes(include=['string', 'category']).columns)

    k = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.zeros(len(X))

    for index, (train_index, test_index) in enumerate(k.split(X)):
        print(f"Fold {index}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = CatBoostRegressor(
            iterations=3000,
            learning_rate=0.05,
            depth=6,
            eval_metric="RMSE",
            random_seed=random_state,
            verbose=200,
            task_type=task_type,
            l2_leaf_reg=0.7,
        )

        model.fit(X_train,
                  y_train,
                  eval_set=(X_test, y_test),
                  early_stopping_rounds=300,
                  cat_features=cat_features,
                  )

        predictions = model.predict(X_test)
        oof[test_index] = predictions

        # Target is already log1p-transformed, so plain RMSE here is the RMSLE.
        print("rmsle:", np.sqrt(mean_squared_error(y_test, predictions)))

    overall_rmsle = np.sqrt(mean_squared_error(y, oof))
    print("final rmsle:", overall_rmsle)
    return oof

# Stage two: fit on the frame that includes the stage-one 'nonlog' column;
# `final` holds out-of-fold predictions of log1p('Premium Amount').
final = generate_final(oof_df)

Fold 0
0:	learn: 1.0927914	test: 1.0935124	best: 1.0935124 (0)	total: 22.4ms	remaining: 1m 7s
200:	learn: 1.0474578	test: 1.0458391	best: 1.0458391 (200)	total: 4.19s	remaining: 58.4s
400:	learn: 1.0426114	test: 1.0396755	best: 1.0396755 (400)	total: 8.41s	remaining: 54.5s
600:	learn: 1.0405398	test: 1.0374629	best: 1.0374629 (600)	total: 12.7s	remaining: 50.6s
800:	learn: 1.0387215	test: 1.0354503	best: 1.0354494 (799)	total: 16.8s	remaining: 46.2s
1000:	learn: 1.0376224	test: 1.0347981	best: 1.0347961 (998)	total: 21s	remaining: 41.9s
1200:	learn: 1.0364433	test: 1.0338973	best: 1.0338973 (1200)	total: 25.2s	remaining: 37.8s
1400:	learn: 1.0353241	test: 1.0330486	best: 1.0330470 (1391)	total: 29.2s	remaining: 33.4s
1600:	learn: 1.0343897	test: 1.0325807	best: 1.0325807 (1600)	total: 33.4s	remaining: 29.2s
1800:	learn: 1.0334702	test: 1.0321247	best: 1.0321235 (1791)	total: 37.5s	remaining: 25s
2000:	learn: 1.0326986	test: 1.0317907	best: 1.0317900 (1996)	total: 41.5s	remaining: 20.7s

KeyboardInterrupt: 

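# LGBM Sub: LightGBM variant of the model/fit step. It relies on the fold-loop
# names (X_train, y_train, X_test, y_test, cat_features), so it presumably is
# meant to be pasted inside the fold loop in place of the CatBoost model.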
        model = lgb.LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.1,
            verbose=-1,
            device='gpu',
            gpu_platform_id=0,
            gpu_device_id=0
        )
        
        model.fit(X_train,
                  y_train,
                  categorical_feature = cat_features,
                  eval_set=[(X_test, y_test)],
                  eval_metric='rmse',
                  callbacks=[early_stopping(100)]
                 )