# [kaggle_insurance_cross_selling]

---

## 1. Importing Libraries

---

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, callback
from sklearn.metrics import roc_auc_score
import gc

In [60]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [81]:
sample

Unnamed: 0,id,Response
0,11504798,0.5
1,11504799,0.5
2,11504800,0.5
3,11504801,0.5
4,11504802,0.5
...,...,...
7669861,19174659,0.5
7669862,19174660,0.5
7669863,19174661,0.5
7669864,19174662,0.5


In [62]:
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [63]:
test.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [64]:
train.shape, test.shape

((11504798, 12), (7669866, 11))

In [65]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [70]:
# Convert object types to category
for col in ['Gender', 'Vehicle_Age', 'Vehicle_Damage']:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [71]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype   
---  ------                -----   
 0   id                    int64   
 1   Gender                category
 2   Age                   int64   
 3   Driving_License       int64   
 4   Region_Code           float64 
 5   Previously_Insured    int64   
 6   Vehicle_Age           category
 7   Vehicle_Damage        category
 8   Annual_Premium        float64 
 9   Policy_Sales_Channel  float64 
 10  Vintage               int64   
 11  Response              int64   
dtypes: category(3), float64(3), int64(6)
memory usage: 822.9 MB


In [35]:
y = train['Response']
X = train.drop(['Response'],axis=1)

train_X, test_X, train_y, test_y = train_test_split(X, y,test_size = 0.2, random_state =41,stratify=y)

In [72]:
# Define parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.03,
    'alpha': 0.1,
    'subsample': 0.851,
    'colsample_bytree': 0.45,
    'max_depth': 12,
    'min_child_weight': 10,
    'gamma': 1e-6,
    'random_state': 42,
    'max_bin': 55,
    'enable_categorical': True,
    'tree_method': 'hist'
}
gc.collect()

1736

In [73]:
# Initialize model
model = XGBClassifier(**params, n_estimators=100)

# Train model with early stopping
early_stop = callback.EarlyStopping(rounds=50, save_best=True, maximize=True)
model.fit(
    train_X,
    train_y,
    eval_set=[(test_X, test_y)],
    verbose=50
)


[0]	validation_0-auc:0.72950
[50]	validation_0-auc:0.86228
[99]	validation_0-auc:0.86639


In [74]:
# Evaluate the model
from sklearn.metrics import accuracy_score, roc_auc_score

# Predict on test data
preds = model.predict(test_X)
proba_preds = model.predict_proba(test_X)[:, 1]

In [75]:
# Calculate accuracy
accuracy = accuracy_score(test_y, preds)
auc = roc_auc_score(test_y, proba_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")


Accuracy: 0.8770
AUC: 0.8664


In [78]:
# Predict on new test data
new_preds = model.predict(test)
new_proba_preds = model.predict_proba(test)[:, 1]

new_proba_preds = np.round(new_proba_preds, 1)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': sample['id'],
    'Response': new_proba_preds
})

In [79]:
# Save to CSV
submission.to_csv('submission.csv', index=False)

# Display first few rows of submission file
print(submission.head())

         id  Response
0  11504798       0.0
1  11504799       0.3
2  11504800       0.2
3  11504801       0.0
4  11504802       0.1
