# Exploratory Data Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Step 1: Read the dataset and basic dataframe exploration

In [None]:
# Load the raw insurance dataset (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv('insurance_data.csv')
df.head()

Observations:
- There is a mix of numeric and categorical columns.
- There are missing values.
- Label column is `claim`.

In [None]:
# Dimensions of the dataframe as (number of rows, number of columns).
df.shape

ML models require examples, i.e., rows. A good rule of thumb is at least 100 rows per column. Our dataset satisfies that.

In [None]:
# List the dtype of each column to tell numeric columns from categorical ones.
df.dtypes

Some columns require encoding as they are categorical.

In [None]:
# Column labels of the dataframe.
df.columns

## Step 2: Summary statistics of numeric columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# `count` excludes missing values, so a lower count reveals columns with NAs.
df.describe()

Observations:
- 1340 rows in the dataset. Age has 5 missing values.
- `claim` column has a wide range and may have outliers.

## Step 3: Value counts of category columns

In [None]:
# Frequency of each distinct value in `gender` (missing values are excluded by default).
df['gender'].value_counts()

In [None]:
# Frequency of each distinct value in `diabetic` (missing values are excluded by default).
df['diabetic'].value_counts()

In [None]:
# Frequency of each distinct value in `smoker` (missing values are excluded by default).
df['smoker'].value_counts()

## Step 4: Data Visualization

### Univariate Histogram

In [None]:
# Distribution of `age`: 20-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='age', bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of the label `claim`: 20-bin histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='claim', bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Claim')
ax.set_xlabel('Claim')
ax.set_ylabel('Frequency')
plt.show()

### Univariate Pie Chart

In [None]:
# Share of rows per region, drawn as a pie chart with percentage labels.
fig, ax = plt.subplots(figsize=(8, 5))
region_counts = df['region'].value_counts()
region_counts.plot.pie(ax=ax, autopct='%1.1f%%', colors=sns.color_palette('pastel'))
ax.set_title('Pie Chart of Regions')
# pandas labels the y-axis with the series name; blank it out for a cleaner pie.
ax.set_ylabel('')
plt.show()

### Univariate Box Plot

In [None]:
# Horizontal box plot of `bmi`; points beyond the whiskers are potential outliers.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='bmi', ax=ax)
ax.set_title('Box Plot of BMI')
ax.set_xlabel('BMI')
plt.show()

Shows some outliers in BMI column.

In [None]:
# Horizontal box plot of the label `claim`; points beyond the whiskers are potential outliers.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='claim', ax=ax)
ax.set_title('Box Plot of Claim')
ax.set_xlabel('Claim')
plt.show()

### Bivariate Line Plot

In [None]:
# Claim vs. age as a line. seaborn aggregates repeated ages with the mean by
# default; errorbar=None hides the confidence band around that mean.
# Figure size, title, axis labels and plt.show() match the other plot cells
# and stop the cell from echoing the Axes repr.
plt.figure(figsize=(8, 5))
sns.lineplot(x='age', y='claim', data=df, errorbar=None)
plt.title('Line Plot of Claim vs Age')
plt.xlabel('Age')
plt.ylabel('Claim')
plt.show()

In [None]:
# Claim vs. BMI, one point per row.
# Figure size, title, axis labels and plt.show() match the other plot cells
# and stop the cell from echoing the Axes repr.
plt.figure(figsize=(8, 5))
sns.scatterplot(x='bmi', y='claim', data=df)
plt.title('Scatter Plot of Claim vs BMI')
plt.xlabel('BMI')
plt.ylabel('Claim')
plt.show()

### Bivariate Scatter Plot

In [None]:
# Claim vs. BMI with points colored by smoker status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='bmi', y='claim', hue='smoker', ax=ax)
ax.set_title('Scatter Plot of Claim vs BMI (colored by Smoker)')
ax.set_xlabel('BMI')
ax.set_ylabel('Claim')
ax.legend(title='Smoker')
plt.show()

## Step 5: Observations from data visualization

- There are ID columns which need to be removed.
- There are missing values that require rows to be dropped.
- Outliers require rows to be trimmed.
- Age seems to have a normal distribution with most values clustered around the mean.
- Region is categorical with four distinct values, and the proportions are relatively balanced.
- BMI shows some outliers towards the higher end of the distribution.
- Claim amount is positively correlated with age but seems to have a wider spread for smokers.

# Data Preprocessing

## Step 1: Remove ID columns - `index` and `PatientID`

In [None]:
# Drop the two identifier columns and display the new shape to confirm the
# column count went down by two.
df = df.drop(columns=['index', 'PatientID'])
df.shape

## Step 2: Remove rows with missing values

In [None]:
# Keep only rows with no missing value in any column, then display the shape
# to see how many rows remain.
df = df.dropna()
df.shape

## Step 3: Separate the features into `X` and select the numeric columns into `X_num`

In [None]:
# Features are every column except the label `claim`.
X = df.drop(columns=['claim'])
# Numeric features only; these drive the IQR outlier filter.
numeric_dtypes = ['int64', 'float64']
X_num = X.select_dtypes(include=numeric_dtypes)

### Outlier Filter

In [None]:
# Per-column quartiles and interquartile range of the numeric features.
Q1, Q3 = X_num.quantile(0.25), X_num.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Element-wise outlier flags; the bounds are aligned to X_num by column label.
too_low = X_num.lt(lower_bound)
too_high = X_num.gt(upper_bound)

# A row is kept only if none of its numeric features is flagged.
outlier_filter = ~(too_low | too_high).any(axis=1)
df = df.loc[outlier_filter]

From the outlier trimmed `df`, fetch label, numeric features, and category features for further processing.

In [None]:
# Label: the `claim` column of the outlier-trimmed dataframe.
y = df['claim']
# Features: everything except the label.
X = df.drop('claim', axis=1)
X_num = X.select_dtypes(include=['int64', 'float64'])
# Select categorical columns from X (not df), consistent with X_num, so the
# label can never end up among the features regardless of its dtype.
X_cat = X.select_dtypes(include=['object'])
X_num.shape, X_cat.shape

## Step 4: Rescale numeric columns (optional, based on the algorithm used)

In [None]:
# Rescale each numeric feature to [0, 1] based on its own min and max.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# later in the notebook, so test-set min/max influence the scaling. Consider
# fitting on the training split only.
scaler = MinMaxScaler()
scaler.fit(X_num)
scaled_values = scaler.transform(X_num)
# transform() returns a bare array; restore column names and the original
# index so the frame stays aligned with the categorical features and `y`.
X_num_scaled = pd.DataFrame(
    scaled_values,
    columns=X_num.columns,
    index=X_num.index,
)

## Step 5: One-hot encode category columns

In [None]:
# One-hot encode every categorical column as 0/1 integers. drop_first=False
# keeps an indicator column for every level of each category.
X_cat_encoded = pd.get_dummies(X_cat, drop_first=False, dtype=int)

## Step 6: Merge `X_num_scaled` and `X_cat_encoded` into `X`

In [None]:
# Column-wise concatenation; rows are aligned on the index, which both frames
# inherit from the outlier-trimmed dataframe.
X = pd.concat([X_num_scaled, X_cat_encoded], axis=1)
X.shape

## Step 7: Check for NA in `X` and `y`; Check for shape compatibility

In [None]:
# Missing-value counts first (per column for X, a single total for y) ...
for data in (X, y):
    print(data.isnull().sum())
# ... then shapes, to confirm X and y have the same number of rows.
for data in (X, y):
    print(data.shape)
# Summary statistics of the final feature matrix.
X.describe()

## Step 8: Train-test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Print the shape of each split, in the order train/test features, then train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

## Step 9: Observations after preprocessing

- ID columns have been removed.
- Rows with missing values have been removed.
- Rows with outliers in the numeric features have been trimmed using the IQR rule.
- Features and label have been separated into `X` and `y`, respectively.
- Numeric columns have been rescaled (if required, this step is optional based on the algorithm used).
- Category columns have been one-hot encoded to be used in the model.
- The dataset has been split into train and test sets for model evaluation.

We are ready to fit ML models to train and evaluate using test data.

In [None]:
# Persist the processed features and label as separate CSV files. The index is
# not written, so the two files correspond to each other by row order only.
outputs = {
    'insurance_claim_features.csv': X,
    'insurance_claim_label.csv': y,
}
for path, data in outputs.items():
    data.to_csv(path, index=False)