In [140]:
import numpy as np
import pandas as pd


import warnings
warnings.filterwarnings(action='ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder , OrdinalEncoder, PowerTransformer



from lightgbm import LGBMRegressor
from lightgbm import early_stopping

from sklearn.feature_selection import RFE,f_regression,SelectKBest,mutual_info_regression

from sklearn.metrics import r2_score, root_mean_squared_error, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, \
explained_variance_score

import pickle

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
import json


In [141]:
def r2_metric(y_true, y_pred):
    ss_res = K.sum(K.square(y_true - y_pred)) 
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - ss_res / (ss_tot + K.epsilon())

In [142]:
def adjusted_r2(r2, n, p):
    """
    Calculate Adjusted R-squared.
    
    Parameters:
    r2: float - Regular R-squared value.
    n: int - Number of observations (samples).
    p: int - Number of predictors (independent variables).

    Returns:
    float - Adjusted R-squared.
    """
    return 1 - ((1 - r2) * (n - 1) / (n - p - 1))

In [143]:
def model_performance(model, X, y, return_value=False):
    
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    rmse = root_mean_squared_error(y, y_pred)
    mae =  mean_absolute_error(y, y_pred)
    mape = mean_absolute_percentage_error(y, y_pred)
    r2 = r2_score(y,y_pred)
    r2_adj = adjusted_r2(r2, X.shape[0], X.shape[1])
    
    print("Mean Squared Error ",mse)
    print("Root Mean Squared Error ", rmse)
    print("Mean Absolute Error ", mae)
    print("Mean Absolute Percentage Error ",mape)
    print("R2 score ", r2)
    print("Adjusted R2 score ", r2_adj)
    
    if return_value:
           return mse, rmse, mae, mape, r2, r2_adj
    else:
        pass

# Data Loading

In [144]:
# Reading csv file
auto_insurance_df = pd.read_csv('Dataset/AutoInsurance.csv')
auto_insurance_df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,3/2/2011,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


# Data Preprocessing

In [145]:
auto_insurance_df_copy = auto_insurance_df.copy()

In [146]:
auto_insurance_df_copy['Policy'] = auto_insurance_df_copy['Policy'].apply(lambda x:x[-2:])

In [147]:
#auto_insurance_df_copy['Effective To Date'] = pd.to_datetime(auto_insurance_df_copy['Effective To Date'])

# Assuming the current date for calculations
#date = pd.to_datetime("2011-12-31")

# 1. Time-Based Features
#auto_insurance_df_copy['Days_Since_Policy_Inception'] = (reference_date - auto_insurance_df_copy['Effective To Date']).dt.days
#auto_insurance_df_copy['Days_Since_Last_Claim'] = auto_insurance_df_copy['Months Since Last Claim'] * 30  # Approximation

auto_insurance_df_copy['Region'] = auto_insurance_df_copy['State'].apply(lambda x: 'West' if x in ['California', 'Oregon', 'Nevada'] else 'East')  # Example grouping

#auto_insurance_df_copy['Claim_Efficiency'] = auto_insurance_df_copy['Total Claim Amount'] / (auto_insurance_df_copy['Monthly Premium Auto'] + 1e-6)
#auto_insurance_df_copy['Policy_Stability'] = auto_insurance_df_copy['Number of Policies'] / (auto_insurance_df_copy['Number of Open Complaints'] + 1e-6)




auto_insurance_df_copy.drop(['Effective To Date','Customer','State'], axis=1, inplace=True)
#droping the 'Effective To Date as it is not important for our model
# dropping Customer column as it is not important for our model


auto_insurance_df_copy.rename(columns={'Customer Lifetime Value':'CLV'},inplace=True)



auto_insurance_df_copy['EmploymentStatus'].replace(to_replace=['Medical Leave','Disabled','Retired'],
                                                   value='Other', inplace=True)
auto_insurance_df_copy.head()

Unnamed: 0,CLV,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Region
0,2763.519279,No,Basic,Bachelor,Employed,F,56274,Suburban,Married,69,...,0,1,Corporate Auto,L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,East
1,6979.535903,No,Extended,Bachelor,Unemployed,F,0,Suburban,Single,94,...,0,8,Personal Auto,L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize,East
2,12887.43165,No,Premium,Bachelor,Employed,F,48767,Suburban,Married,108,...,0,2,Personal Auto,L3,Offer1,Agent,566.472247,Two-Door Car,Medsize,West
3,7645.861827,No,Basic,Bachelor,Unemployed,M,0,Suburban,Married,106,...,0,7,Corporate Auto,L2,Offer1,Call Center,529.881344,SUV,Medsize,West
4,2813.692575,No,Basic,Bachelor,Employed,M,43836,Rural,Single,73,...,0,1,Personal Auto,L1,Offer1,Agent,138.130879,Four-Door Car,Medsize,East


In [148]:
int_cols = auto_insurance_df_copy.select_dtypes(include=['int64']).columns.tolist()
int_cols.remove('Income')
int_cols.remove('Monthly Premium Auto')

auto_insurance_df_copy[int_cols] = auto_insurance_df_copy[int_cols].astype('int8')
auto_insurance_df_copy.head()

Unnamed: 0,CLV,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Region
0,2763.519279,No,Basic,Bachelor,Employed,F,56274,Suburban,Married,69,...,0,1,Corporate Auto,L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,East
1,6979.535903,No,Extended,Bachelor,Unemployed,F,0,Suburban,Single,94,...,0,8,Personal Auto,L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize,East
2,12887.43165,No,Premium,Bachelor,Employed,F,48767,Suburban,Married,108,...,0,2,Personal Auto,L3,Offer1,Agent,566.472247,Two-Door Car,Medsize,West
3,7645.861827,No,Basic,Bachelor,Unemployed,M,0,Suburban,Married,106,...,0,7,Corporate Auto,L2,Offer1,Call Center,529.881344,SUV,Medsize,West
4,2813.692575,No,Basic,Bachelor,Employed,M,43836,Rural,Single,73,...,0,1,Personal Auto,L1,Offer1,Agent,138.130879,Four-Door Car,Medsize,East


In [149]:
auto_insurance_df_copy[['CLV','Total Claim Amount']] = auto_insurance_df_copy[['CLV','Total Claim Amount']].astype('float32')

auto_insurance_df_copy[['Income','Monthly Premium Auto']] = auto_insurance_df_copy[['Income','Monthly Premium Auto']].astype('int32')

In [150]:
X = auto_insurance_df_copy.drop(['CLV'], axis=1)

y = auto_insurance_df_copy['CLV']

In [151]:
# Create bins for the target variable
num_bins = 10  # Adjust the number of bins as needed
bins = pd.qcut(y, q=num_bins, duplicates='drop')

# Perform stratified splitting based on the bins
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, stratify=bins, random_state=42
)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(y_train.skew())
print(y_test.skew())

(8677, 21)
(457, 21)
(8677,)
(457,)
3.050567
2.6664152


In [152]:

or_coverage = OrdinalEncoder(categories=[['Basic', 'Extended', 'Premium']])
# change the encoding by order basic -> 0, Extended -> 1 and Premium -> 2
X_train['Coverage'] = or_coverage.fit_transform(X_train['Coverage'].values.reshape(-1,1)).astype('int8') 


#or_edu = OrdinalEncoder(categories=[['High School or Below', 'College', 'Bachelor', 'Master', 'Doctor']])
# change the encoding by order High School or Below -> 0, College -> 1, Bachelor -> 2, Master -> 3 and Doctor -> 4
#X_train['Education'] = or_edu.fit_transform(X_train['Education'].values.reshape(-1,1)).astype('int8')  
#X_test['Education'] = or_edu.transform(X_test['Education'].values.reshape(-1,1)).astype('int8')  


# changing All the records having Education as Master and Doctor into Master and Above
X_train['Education'].replace(to_replace=['Master','Doctor'],value='Master and Above', inplace=True)

or_edu = OrdinalEncoder(categories=[['High School or Below', 'College', 'Bachelor', 'Master and Above']])

X_train['Education'] = or_edu.fit_transform(X_train['Education'].values.reshape(-1,1)).astype('int8') 



or_Marital_Stat = OrdinalEncoder(categories=[['Married', 'Single', 'Divorced']])
# change the encoding by order Single -> 0, Married -> 1 and Divorced -> 2
X_train['Marital Status'] = or_Marital_Stat.fit_transform(X_train['Marital Status'].values.reshape(-1,1)).astype('int8')  

or_vec_class = OrdinalEncoder(categories=[['Two-Door Car', 'Four-Door Car', 'SUV', 'Sports Car', 'Luxury Car', 'Luxury SUV']])
# 'Two-Door Car' -> 0 , 'Four-Door Car'-> 1 , 'SUV' -> 2 , 'Sports Car' -> 3, 'Luxury Car' -> 4, 'Luxury SUV' -> 5
X_train['Vehicle Class'] = or_vec_class.fit_transform(X_train['Vehicle Class'].values.reshape(-1,1)).astype('int8')  

or_policy = OrdinalEncoder(categories=[['L1', 'L2', 'L3']])
# L1 -> 0 , L2-> 1 ,and L3 -> 2 
X_train['Policy'] = or_policy.fit_transform(X_train['Policy'].values.reshape(-1,1)).astype('int8')

or_vec_size = OrdinalEncoder(categories=[['Small', 'Medsize', 'Large']])
# Small -> 0 , 'Medsize'-> 1 ,and 'Large' -> 2 
X_train['Vehicle Size'] = or_vec_size.fit_transform(X_train['Vehicle Size'].values.reshape(-1,1)).astype('int8')   

In [153]:
# ===== Coverage =====
X_test['Coverage'] = or_coverage.transform(
    X_test['Coverage'].values.reshape(-1, 1)
).astype('int8')

# ===== Education =====
# First convert Master + Doctor into Master and Above
X_test['Education'].replace(
    to_replace=['Master', 'Doctor'],
    value='Master and Above',
    inplace=True
)

X_test['Education'] = or_edu.transform(
    X_test['Education'].values.reshape(-1, 1)
).astype('int8')

# ===== Marital Status =====
X_test['Marital Status'] = or_Marital_Stat.transform(
    X_test['Marital Status'].values.reshape(-1, 1)
).astype('int8')

# ===== Vehicle Class =====
X_test['Vehicle Class'] = or_vec_class.transform(
    X_test['Vehicle Class'].values.reshape(-1, 1)
).astype('int8')

# ===== Policy =====
X_test['Policy'] = or_policy.transform(
    X_test['Policy'].values.reshape(-1, 1)
).astype('int8')

# ===== Vehicle Size =====
X_test['Vehicle Size'] = or_vec_size.transform(
    X_test['Vehicle Size'].values.reshape(-1, 1)
).astype('int8')

In [154]:
X_train_cat = X_train.select_dtypes('O')
# selecting all the categorical column

cat_columns = X_train_cat.columns.tolist()

X_train_cat = pd.get_dummies(X_train_cat, drop_first=True).astype('int8')
#converting all the categorical into numerical column using one-hot ending

X_train = pd.concat([X_train,X_train_cat],axis=1).drop(cat_columns,axis=1)

X_train.columns =  X_train.columns.str.strip().str.replace(' ','_')

#auto_insurance_df_copy_new.drop('CLV', axis=1, inplace=True)
X_train.head()

Unnamed: 0,Coverage,Education,Income,Marital_Status,Monthly_Premium_Auto,Months_Since_Last_Claim,Months_Since_Policy_Inception,Number_of_Open_Complaints,Number_of_Policies,Policy,...,Location_Code_Urban,Policy_Type_Personal_Auto,Policy_Type_Special_Auto,Renew_Offer_Type_Offer2,Renew_Offer_Type_Offer3,Renew_Offer_Type_Offer4,Sales_Channel_Branch,Sales_Channel_Call_Center,Sales_Channel_Web,Region_West
6887,1,3,77235,0,87,2,68,0,5,2,...,1,1,0,1,0,0,1,0,0,1
6300,2,3,67787,0,194,9,73,0,1,2,...,0,1,0,0,0,1,0,0,1,0
91,1,2,72540,0,88,7,3,0,2,2,...,0,0,0,1,0,0,0,1,0,0
3016,0,0,75329,0,67,15,17,0,1,1,...,0,0,0,0,1,0,1,0,0,0
1025,0,2,70200,2,65,25,10,0,2,2,...,0,0,0,1,0,0,0,0,0,1


In [155]:
# Select categorical columns
X_test_cat = X_test.select_dtypes('O')
cat_columns_test = X_test_cat.columns.tolist()

# One-hot encode
X_test_cat = pd.get_dummies(X_test_cat, drop_first=True).astype('int8')

# Align X_test to X_train
X_test_cat = X_test_cat.reindex(columns=X_train_cat.columns, fill_value=0)

# Combine numerical + encoded categorical
X_test = pd.concat([X_test.drop(cat_columns_test, axis=1), X_test_cat], axis=1)

# Clean column names
X_test.columns = X_test.columns.str.strip().str.replace(' ', '_')
X_test.head()

Unnamed: 0,Coverage,Education,Income,Marital_Status,Monthly_Premium_Auto,Months_Since_Last_Claim,Months_Since_Policy_Inception,Number_of_Open_Complaints,Number_of_Policies,Policy,...,Location_Code_Urban,Policy_Type_Personal_Auto,Policy_Type_Special_Auto,Renew_Offer_Type_Offer2,Renew_Offer_Type_Offer3,Renew_Offer_Type_Offer4,Sales_Channel_Branch,Sales_Channel_Call_Center,Sales_Channel_Web,Region_West
6624,0,1,92381,0,72,24,27,0,1,1,...,1,0,1,1,0,0,0,1,0,1
8253,0,3,55479,1,68,5,7,1,1,1,...,0,1,0,1,0,0,1,0,0,1
1794,1,2,99359,0,95,16,19,0,9,2,...,0,1,0,1,0,0,0,1,0,1
7349,0,1,0,2,65,24,45,1,4,2,...,0,0,0,0,1,0,1,0,0,1
8826,1,1,44980,2,81,2,27,0,2,1,...,1,0,0,0,1,0,0,0,0,1


In [156]:
X_train['Monthly_Premium_Auto'].min()

61

In [157]:
X_test['Monthly_Premium_Auto'].min()

61

In [158]:
pow_trans = PowerTransformer(method='box-cox')

X_train['Monthly_Premium_Auto'] = pow_trans.fit_transform(X_train['Monthly_Premium_Auto'].values.reshape(-1,1))
X_test['Monthly_Premium_Auto'] = pow_trans.transform(X_test['Monthly_Premium_Auto'].values.reshape(-1,1))

In [159]:
pow_transform = PowerTransformer(method='yeo-johnson', standardize=True)
X_train['Total_Claim_Amount'] = pd.Series(pow_transform.fit_transform(X_train['Total_Claim_Amount'].values.reshape(-1, 1)).reshape(X_train['Total_Claim_Amount'].shape[0],))

X_test['Total_Claim_Amount'] = pd.Series(pow_transform.transform(X_test['Total_Claim_Amount'].values.reshape(-1, 1)).reshape(X_test['Total_Claim_Amount'].shape[0],))


In [160]:
# {'yeo-johnson', 'box-cox'}
#pow_transform = PowerTransformer(method='box-cox', standardize=True)
#y_train_transformed = pd.Series(pow_transform.fit_transform(y_train.values.reshape(-1, 1)).reshape(y_train.shape[0],))
#scaler = RobustScaler()

y_train_transformed = pd.Series(np.where(y_train==0,0,1/y_train))

#scaler.fit_transform(y_train.values.reshape(1,-1))
#y_test_transformed = pd.Series(pow_transform.fit_transform(y_test.values.reshape(-1,1)).reshape(y_test.shape[0],))

y_test_transformed = pd.Series(np.where(y_test==0,0,1/y_test))
#scaler.transform(y_test.values.reshape(1,-1))

print(y_train_transformed.skew())
print(y_test_transformed.skew())

0.6960564
0.6870645


In [161]:
X_train['Yearly_premium'] = X_train['Monthly_Premium_Auto'] * 12
X_test['Yearly_premium'] = X_test['Monthly_Premium_Auto'] * 12

#X_train['Net_revenue'] = X_train['Yearly_premium'] - X_train['Total Claim Amount']
#X_test['Net_revenue'] = X_test['Yearly_premium'] - X_test['Total Claim Amount']


X_train.drop('Monthly_Premium_Auto', axis=1, inplace=True)
X_test.drop('Monthly_Premium_Auto', axis=1, inplace=True)

In [162]:
signi_columns = ['EmploymentStatus_Other', 'Yearly_premium', 'Total_Claim_Amount',
       'Income', 'EmploymentStatus_Unemployed', 'Coverage',
       'Number_of_Open_Complaints', 'Gender_M', 'Months_Since_Last_Claim',
       'Number_of_Policies', 'Education', 'Marital_Status',
       'Months_Since_Policy_Inception']

X_train_signi = X_train[signi_columns]
X_test_signi = X_test[signi_columns]

# Final Model Training

In [163]:
model=LGBMRegressor(bagging_fraction=0.9693019353405686, bagging_freq=6,
               extra_trees=False, feature_fraction=0.8940125922273224,
               lambda_l1=1.352479444345171e-08, lambda_l2=3.867730994212937e-08,
               learning_rate=0.2669265474390058, max_bin=195, max_depth=2,
               min_child_weight=0.0064857098186572775, min_data_in_leaf=95,
               min_sum_hessian_in_leaf=0.013257550912313416, n_estimators=170,
               num_leaves=60, path_smooth=0.9961452467814682,
               subsample=0.9184787324572553, verbose=-1)

model.fit(X_train_signi,y_train_transformed)
model_performance(model,X=X_train_signi,y=y_train_transformed)

Mean Squared Error  2.4370723874840174e-10
Root Mean Squared Error  1.561112547987498e-05
Mean Absolute Error  1.0152246410425906e-05
Mean Absolute Percentage Error  0.11986482737809578
R2 score  0.9799379256076672
Adjusted R2 score  0.9799078197589889


In [164]:
model_performance(model,X=X_test_signi,y=y_test_transformed)

Mean Squared Error  2.420898991493242e-10
Root Mean Squared Error  1.5559238385901935e-05
Mean Absolute Error  1.0122395854635239e-05
Mean Absolute Percentage Error  0.11942097316750433
R2 score  0.9799897148748203
Adjusted R2 score  0.9794025056047813


# Saving Final and component of preprocessing data

In [165]:
ordinal_encoders = {
    "coverage": or_coverage,
    "education": or_edu,
    "marital_status": or_Marital_Stat,
    "vehicle_class": or_vec_class,
    "policy": or_policy,
    "vehicle_size": or_vec_size
}


In [167]:
json.dump(list(X_train_cat.columns), open("Model Files/model/ohe_columns.json", "w"))

In [None]:
with open("Model Files/model/ohe_columns.json", "w") as f:
    json.dump(list(X_train_ohe.columns), f)

In [None]:
power_transforms = {
    "total_claim_amount": pow_transform,
    "monthly_premium_auto": pow_trans
}

with open("Model Files/model/power_transformers.pkl", "wb") as f:
    pickle.dump(power_transforms, f)

In [None]:
with open("Model Files/model/selected_features.json", "w") as f:
    json.dump(list(X_train_signi.columns), f)

In [None]:
with open("Model Files/model/final_model.pkl", "wb") as f:
    pickle.dump(model, f)