# Step 1: Import necessary libraries

In [1]:
import gzip
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Step 2: Load the dataset

In [3]:
# Read the insurance records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='insurance_data.csv')
df.head(n=5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1
1,2,Male,76,1,3,0,1-2 Year,No,33536,26,183,0
2,3,Male,47,1,28,0,> 2 Years,Yes,38294,26,27,1
3,4,Male,21,1,11,1,< 1 Year,No,28619,152,203,0
4,5,Female,29,1,41,1,< 1 Year,No,27496,152,39,0


# Step 3: Data Preprocessing

In [6]:
# Encode the three text-valued columns as integer codes (explicit label
# encoding; no one-hot columns are created).
# Vehicle_Age codes run from newest to oldest vehicle:
# '< 1 Year' -> 2, '1-2 Year' -> 1, '> 2 Years' -> 0.
CATEGORY_CODES = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Age': {'< 1 Year': 2, '> 2 Years': 0, '1-2 Year': 1},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}

for column, codes in CATEGORY_CODES.items():
    observed = df[column].dropna()

    # Re-running this cell must be a no-op. Series.map turns every value that
    # is not a key of the mapping into NaN, so mapping an already-encoded
    # column a second time would silently wipe it out.
    if observed.isin(list(codes.values())).all():
        continue

    # Fail loudly on labels the mapping does not know instead of letting
    # them become NaN and flow into model training unnoticed.
    unexpected = set(observed.unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f"Unexpected values in {column!r}: {sorted(unexpected, key=str)}"
        )

    df[column] = df[column].map(codes)

df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,1,44,1,28,0,0,1,40454,26,217,1
1,2,1,76,1,3,0,1,0,33536,26,183,0
2,3,1,47,1,28,0,0,1,38294,26,27,1
3,4,1,21,1,11,1,2,0,28619,152,203,0
4,5,0,29,1,41,1,2,0,27496,152,39,0


In [8]:
# Separate the label from the model inputs.
y = df['Response']  # what the model should predict
X = df.drop(['id', 'Response'], axis=1)  # every remaining column; 'id' is only a row identifier

# Step 4: Train-test split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y - consider stratify=y, since the
# evaluation output shows the positive class is a small minority.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Step 5: Model building (Random Forest)

In [10]:
# Fit a 100-tree random forest on the training split with a fixed seed.
# NOTE(review): n_jobs is left at its default; n_jobs=-1 would presumably speed
# up the fit on a multi-core machine - confirm before changing.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_model

RandomForestClassifier(random_state=42)

# Step 6: Evaluate the model

In [18]:
# Score the held-out rows and report overall and per-class quality.
# In the run recorded below, always predicting class 0 would score
# 66699 / 76222 (about 0.875), which is higher than the model's 0.865 accuracy,
# so judge the model by the per-class recall rather than by accuracy.
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.8649733672692923
Confusion Matrix:
 [[64798  1901]
 [ 8391  1132]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93     66699
           1       0.37      0.12      0.18      9523

    accuracy                           0.86     76222
   macro avg       0.63      0.55      0.55     76222
weighted avg       0.82      0.86      0.83     76222



# Step 7: Save the trained model using pickle

In [25]:
# Persist the fitted forest as a gzip-compressed pickle in the working directory.
# (gzip is imported with the other libraries in the Step 1 import cell.)
with gzip.open('random_forest_model.pkl.gz', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Step 8: Check that the pickled model loads and predicts

In [26]:
# Read the compressed pickle back to confirm the saved model round-trips.
# pickle.load can execute arbitrary code, so only open files this notebook wrote itself.
with gzip.open('random_forest_model.pkl.gz', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [27]:
# Score one hand-written customer with the reloaded model.
# The values are keyed by column name, in the same order as the training
# features, so the row lines up with what the model was fitted on. The model
# was fitted on a DataFrame; passing a bare NumPy array instead makes
# scikit-learn 1.0+ warn that the input has no valid feature names.
# Gender / Vehicle_Age / Vehicle_Damage use the integer codes from Step 3.
sample_customer = {
    'Gender': 0,                  # Female
    'Age': 47,
    'Driving_License': 1,
    'Region_Code': 35,
    'Previously_Insured': 0,
    'Vehicle_Age': 1,             # '1-2 Year'
    'Vehicle_Damage': 1,          # 'Yes'
    'Annual_Premium': 47576,
    'Policy_Sales_Channel': 124,
    'Vintage': 46,
}
sample_data = pd.DataFrame([sample_customer])
prediction = loaded_model.predict(sample_data)

# Output the prediction result
if prediction[0] == 1:
    print("The customer is likely to buy the insurance.")
else:
    print("The customer is not likely to buy the insurance.")

The customer is likely to buy the insurance.


