# EDA - Life Insurance Subscription Prediction

INSAT GL4 Mini-Project

In [None]:
# pandas loads and summarises the data. numpy is imported here but is not
# referenced in the cells visible in this notebook export.
import pandas as pd
import numpy as np
# matplotlib draws every figure; seaborn is used for the correlation heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
print('Libraries loaded!')

## 1. Data Loading

In [None]:
# Read the training set from its relative path and report (rows, columns).
train_path = '../data/train.csv'
df = pd.read_csv(train_path)
print(f'Shape: {df.shape}')
# Last expression of the cell: preview the first rows of the frame.
df.head()

In [None]:
# Print the frame's structure: column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# with pandas' defaults only numeric columns are included for a mixed frame.
df.describe()

## 2. Missing Values

In [None]:
# Report, under a header line, how many null entries each column contains.
print('Missing Values:')
missing_per_column = df.isna().sum()
print(missing_per_column)

## 3. Target Distribution

In [None]:
import os

# Figures are written to ../figs. Create the folder up front so savefig does
# not raise FileNotFoundError on a checkout where it does not exist yet.
os.makedirs('../figs', exist_ok=True)

# Count each Response value once and reuse the result for both panels.
response_counts = df['Response'].value_counts()

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Left panel: absolute number of rows per Response value.
response_counts.plot(kind='bar', ax=ax[0], color=['#3498db', '#e74c3c'])
ax[0].set_title('Response Count')
# Right panel: the same counts shown as percentage shares.
response_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax[1])
ax[1].set_title('Response %')
plt.tight_layout()
plt.savefig('../figs/target_distribution.png', dpi=150)
plt.show()

## 4. Numerical Features

In [None]:
# Numeric features explored in this section (also reused by later cells).
num_cols = ['Age', 'Annual_Premium', 'Vintage']

# One column of panels per feature: histogram on top, boxplot underneath.
# The grid width follows len(num_cols) instead of a fixed 3, and squeeze=False
# keeps `axes` two-dimensional so axes[row, col] works for any feature count.
fig, axes = plt.subplots(
    2, len(num_cols), figsize=(5 * len(num_cols), 10), squeeze=False
)
for i, col in enumerate(num_cols):
    # Drop missing values before plotting: NaN entries propagate into the
    # boxplot statistics and would leave the box empty.
    values = df[col].dropna().to_numpy()
    axes[0, i].hist(values, bins=50, color='steelblue', edgecolor='white')
    axes[0, i].set_title(f'{col} Distribution')
    axes[1, i].boxplot(values)
    axes[1, i].set_title(f'{col} Boxplot')
plt.tight_layout()
plt.savefig('../figs/numerical_univariate.png', dpi=150)
plt.show()

## 5. Categorical Features

In [None]:
# Categorical features explored in this section (also reused by later cells).
cat_cols = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage']

# Lay the bar charts out three per row. The row count is derived from the
# number of features (ceiling division) rather than being fixed at two, so the
# grid still fits if cat_cols is edited.
grid_cols = 3
grid_rows = max(1, -(-len(cat_cols) // grid_cols))
fig, axes = plt.subplots(
    grid_rows, grid_cols, figsize=(15, 5 * grid_rows), squeeze=False
)
axes = axes.flatten()
for i, col in enumerate(cat_cols):
    # Frequency of each category value, most common first.
    df[col].value_counts().plot(kind='bar', ax=axes[i])
    axes[i].set_title(f'{col}')
# Hide every panel that did not receive a chart (with five features this is
# just the sixth panel), instead of switching off a hard-coded index.
for unused_ax in axes[len(cat_cols):]:
    unused_ax.axis('off')
plt.tight_layout()
plt.savefig('../figs/categorical_univariate.png', dpi=150)
plt.show()

## 6. Features vs Target

In [None]:
# Draw one boxplot panel per numeric feature, grouped by Response value.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for position, feature in enumerate(num_cols):
    panel = axes[position]
    df.boxplot(column=feature, by='Response', ax=panel)
# Clear the figure-level super-title so only the per-panel titles remain.
fig.suptitle('')
fig.tight_layout()
plt.savefig('../figs/numerical_vs_target.png', dpi=150)
plt.show()

In [None]:
# Lay the charts out three per row. The row count is derived from
# len(cat_cols) (ceiling division) instead of assuming exactly five features
# in a fixed 2x3 grid.
grid_cols = 3
grid_rows = max(1, -(-len(cat_cols) // grid_cols))
fig, axes = plt.subplots(
    grid_rows, grid_cols, figsize=(15, 5 * grid_rows), squeeze=False
)
axes = axes.flatten()
for i, col in enumerate(cat_cols):
    # normalize='index' turns each category's row into proportions, so the
    # bars show the share of each Response value within that category.
    pd.crosstab(df[col], df['Response'], normalize='index').plot(kind='bar', ax=axes[i])
    axes[i].set_title(f'Response by {col}')
# Hide every panel that did not receive a chart rather than a hard-coded
# index, which would blank a real chart if a sixth feature were added.
for unused_ax in axes[len(cat_cols):]:
    unused_ax.axis('off')
plt.tight_layout()
plt.savefig('../figs/categorical_vs_target.png', dpi=150)
plt.show()

## 7. Correlation Matrix

In [None]:
# Integer codes for the text-valued columns so they can enter the
# correlation computation. Labels missing from a mapping become NaN.
label_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}

# Encode on a copy so the original frame keeps its text labels.
df_enc = df.copy()
for column, codes in label_codes.items():
    df_enc[column] = df_enc[column].map(codes)

# Pairwise correlations of every column except the row identifier.
corr = df_enc.drop(columns='id').corr()

plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0)
heatmap_ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.savefig('../figs/correlation_matrix.png', dpi=150)
plt.show()

## 8. Key Insights

In [None]:
# Summary of the findings, emitted one line at a time.
# NOTE(review): these figures are hard-coded text, not computed from df -
# re-check them if the dataset changes.
insight_lines = [
    'EDA KEY INSIGHTS:',
    '1. Class imbalance: ~88% No, ~12% Yes',
    '2. Vehicle_Damage highly correlated with Response',
    '3. Previously_Insured negatively correlated',
    '4. Annual_Premium has outliers',
    '5. Middle-aged customers more likely to subscribe',
]
print(*insight_lines, sep='\n')