In [None]:
# Load standard python libraries
import pandas as pd
import numpy as np
import scipy.sparse as sp
import joblib

# Set the path to find my dataprocessing functions
import sys
sys.path.append('./preprocessing')

# Load custom libraries
from vectorization import load_vectorizer, save_sparse_matrix
from inflation import load_cpi_data, adjust_for_inflation, undo_inflation_adjustment
from feature_engineering import standardize_dates, engineer_features
from text_cleaning import preprocess_claim_description

In [None]:
# Mirror the training pipeline on the test set: the test data has to go through
# the same preprocessing and feature engineering steps as the train data.

# Read the raw test claims (indexed by ClaimNumber), then standardize the dates
# and engineer the new features in a single chain.
df_test = (
    pd.read_csv('Data/actuarial/test.csv', index_col='ClaimNumber')
    .pipe(standardize_dates)
    .pipe(engineer_features)
)

# Inflation-adjust the initial incurred cost, keyed on the reporting date
cpi_data = load_cpi_data('Data/CPI.csv')
df_test['InitialIncurredCalimsCost'] = adjust_for_inflation(
    df_test, cpi_data, target_col='InitialIncurredCalimsCost', date_col='DateReported'
)

# Clean the free-text claim descriptions
df_test['ClaimDescription'] = preprocess_claim_description(df_test)

# Vectorize the cleaned text with the loaded TF-IDF vectorizer.
# Only transform() is called here; the vectorizer is not re-fitted on test data.
tfidf = load_vectorizer()
X_tfidf_test = tfidf.transform(df_test['ClaimDescription'])

  df[date_col] = pd.to_datetime(df[date_col]).dt.to_period('M')


In [None]:
# MaritalStatus is missing in the test data in a pattern similar to the train data.
# Every missing value is simply imputed with the "unknown" code 'U'.
df_test = df_test.assign(MaritalStatus=df_test['MaritalStatus'].fillna('U'))

In [53]:
# Persist the TF-IDF matrix and the processed test frame for later reuse.
# NOTE(review): index=False leaves the ClaimNumber index out of the CSV — confirm
# that downstream readers do not need the claim identifier.
save_sparse_matrix(X_tfidf_test, filename='X_tfidf_test.npz')
df_test.to_csv(path_or_buf='Data/test_processed.csv', index=False)

In [63]:
# Display the processed test frame as a visual sanity check of the pipeline output
df_test

Unnamed: 0_level_0,DateTimeOfAccident,DateReported,Age,Gender,MaritalStatus,DependentChildren,DependentsOther,WeeklyWages,PartTimeFullTime,HoursWorkedPerWeek,DaysWorkedPerWeek,ClaimDescription,InitialIncurredCalimsCost,ReportingDelta,HoursWorkedPerDay
ClaimNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
WC8145235,2002-04-02 10:00:00+00:00,2002-05,26,M,S,1,0,600.18,F,40.0,5,catch right hand with hammer burn to right hand,9377.693036,34,8.0
WC2005111,1988-04-06 16:00:00+00:00,1988-04,31,M,M,0,0,311.54,F,35.0,5,sprain right ankle fracture right elbow,5419.846416,8,7.0
WC6899143,1999-03-08 09:00:00+00:00,1999-04,57,M,M,0,0,1000.00,F,38.0,5,strike hammer crush injury finger hand,38288.487040,26,7.6
WC5502023,1996-07-26 09:00:00+00:00,1996-09,33,M,M,0,0,200.00,F,38.0,5,strike against air those strike glass lacerati...,704.889347,39,7.6
WC4785156,1994-04-13 14:00:00+00:00,1994-07,32,F,M,0,0,359.60,F,40.0,5,foreign body in right foot bruise right big toe,6420.545822,84,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WC9666858,2005-03-14 18:00:00+00:00,2005-07,54,F,M,0,0,738.72,F,38.0,5,fall to floor from chair neck and back strain,15480.905593,127,7.6
WC4800526,1994-05-17 12:00:00+00:00,1994-05,22,M,S,0,0,467.30,F,38.0,5,box slip and cut hand laceration left forearm,861.296271,13,7.6
WC3360567,1991-07-18 09:00:00+00:00,1991-07,35,F,S,0,0,164.05,P,16.0,5,twisted knee and fall bruised right knee,17722.340675,11,3.2
WC7491778,2000-08-01 11:00:00+00:00,2000-09,45,M,M,0,0,200.00,F,38.0,5,lift part strain back low back strain,18295.103687,34,7.6


In [None]:
# Prepare test data for modeling: scale the numeric features, one-hot encode the
# categorical features, and append the TF-IDF text features.

# Load the fitted scaler and one hot encoder from the modeling notebook
scaler = joblib.load('models/scaler.pkl')
ohe = joblib.load('models/ohe.pkl')

# Drop the date columns and the raw text; the text is represented by the
# TF-IDF matrix instead. (`columns=` already implies the axis.)
X = df_test.drop(columns=['DateTimeOfAccident', 'DateReported', 'ClaimDescription'])

# 1. Automatically select numeric and categorical columns by dtype
# NOTE(review): the selected columns and their order must match what the scaler
# and encoder were fitted on — confirm against the modeling notebook.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 2. Extract numeric data and scale (transform only; the scaler is not re-fitted)
X_numeric = X[numeric_cols].values
X_test_numeric_scaled = scaler.transform(X_numeric)
X_test_numeric_sparse = sp.csr_matrix(X_test_numeric_scaled)

# 3. Transform categorical columns
X_test_categorical = ohe.transform(X[categorical_cols])
X_test_categorical_sparse = sp.csr_matrix(X_test_categorical)

# 4. Stack numeric and categorical features.
# CSR is requested explicitly so the result format does not depend on SciPy's default.
X_test_other = sp.hstack([X_test_numeric_sparse, X_test_categorical_sparse], format='csr')

# 5. Stack with TF-IDF
X_test_final = sp.hstack([X_test_other, X_tfidf_test], format='csr')

This code applies the trained, regularized LightGBM model to the test data to generate predictions.

In [None]:
# Score the test data with the trained LightGBM model (no training happens here)

# Restore the fitted LightGBM model from disk
lgbm_model = joblib.load('models/lgbm_model.pkl')

# Generate predictions for the assembled test feature matrix
y_test_pred = lgbm_model.predict(X_test_final)

# Attach y hat to a copy of the test frame under the target column name
test_predictions = df_test.assign(UltimateIncurredClaimCost=y_test_pred)



In [None]:
# Reverse the inflation adjustment on the predicted claims.
# The raw predictions are wrapped in a Series indexed like df_test so they line
# up with each claim's reporting date.
# NOTE(review): '2024-12' is presumably the reference month used when the inputs
# were adjusted — confirm against the inflation module.
y_test_pred_unadjusted = undo_inflation_adjustment(
    df=df_test,
    CPI=cpi_data,
    adjusted_series=pd.Series(y_test_pred, index=df_test.index),
    reference_month='2024-12',
    date_col='DateReported',
)

# Attach the un-adjusted y hat to a fresh copy of the test frame
test_predictions = df_test.assign(UltimateIncurredClaimCost=y_test_pred_unadjusted)

In [None]:
# Keep only the predicted cost column for the Kaggle submission; the frame's
# index is written out alongside it (index=True).
target = test_predictions.loc[:, ['UltimateIncurredClaimCost']]

# Write the submission file
target.to_csv(path_or_buf='Data/submissions/lightgbm_predicted.csv', index=True)

This code applies the trained, regularized XGBoost model to the test data to generate predictions.

In [None]:
# Predict on the test data with the trained XGBoost model (no training happens here)

# Load the trained XGBoost model.
# It gets its own name so it no longer overwrites the LightGBM model variable.
xgb_model = joblib.load('models/xgbR_trained_model.pkl')

# Predict on the test set
y_test_pred = xgb_model.predict(X_test_final)

# Attach predictions to identifiers (e.g., claim numbers)
test_predictions = df_test.copy()
test_predictions['UltimateIncurredClaimCost'] = y_test_pred

In [None]:
# Reverse the inflation adjustment on the predicted claims.
# The raw predictions are wrapped in a Series indexed like df_test so they line
# up with each claim's reporting date.
# NOTE(review): '2024-12' is presumably the reference month used when the inputs
# were adjusted — confirm against the inflation module.
y_test_pred_unadjusted = undo_inflation_adjustment(
    df=df_test,
    CPI=cpi_data,
    adjusted_series=pd.Series(y_test_pred, index=df_test.index),
    reference_month='2024-12',
    date_col='DateReported',
)

# Attach the un-adjusted y hat to a fresh copy of the test frame
test_predictions = df_test.assign(UltimateIncurredClaimCost=y_test_pred_unadjusted)

In [None]:
# Keep only the predicted cost column for the Kaggle submission; the frame's
# index is written out alongside it (index=True).
target = test_predictions.loc[:, ['UltimateIncurredClaimCost']]

# Write the submission file
target.to_csv(path_or_buf='Data/submissions/xgboost_predicted.csv', index=True)