In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the raw insurance dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [None]:
# Display the loaded frame (a bare last expression is rendered as the cell output).
df

# Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Print column dtypes, non-null counts and memory usage.
# `info` has to be called: a bare `df.info` only shows the bound-method repr.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
numeric_columns = ['age', 'bmi', 'children', 'charges']

# One histogram with a KDE overlay per numeric column, each on its own figure.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x=column, kde=True, bins=20, ax=ax)

In [None]:
# Frequency of each `children` value.
sns.countplot(data=df, x='children')

In [None]:
# Frequency of each `sex` value.
sns.countplot(data=df, x='sex')

In [None]:
# Frequency of each `smoker` value.
sns.countplot(data=df, x='smoker')

In [None]:
# One box plot per numeric column, each on its own figure, to eyeball outliers.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x=column, ax=ax)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True)

# Data Cleaning & Preprocessing

In [None]:
# Work on an independent copy so the raw `df` stays untouched by the cleaning steps.
df_clean = df.copy(deep=True)

In [None]:
# First rows of the working copy.
df_clean.head()

In [None]:
# Shape before duplicate rows are removed.
df_clean.shape

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df_clean = df_clean.drop_duplicates()

In [None]:
# Shape after duplicate rows are removed.
df_clean.shape

In [None]:
# Number of missing values in each column of the working copy.
df_clean.isna().sum()

In [None]:
# Distinct labels in `sex` and how often each occurs, checked before encoding.
df_clean['sex'].value_counts()

In [None]:
# Encode `sex` as an integer flag; labels missing from the mapping become NaN.
sex_codes = {"male": 0, "female": 1}
df_clean['sex'] = df_clean['sex'].map(sex_codes)

In [None]:
# Check the encoded `sex` column.
df_clean.head()

In [None]:
# Encode `smoker` as an integer flag; labels missing from the mapping become NaN.
smoker_codes = {"no": 0, "yes": 1}
df_clean['smoker'] = df_clean['smoker'].map(smoker_codes)

In [None]:
# Display the frame after encoding `smoker`.
df_clean

In [None]:
# Rename the encoded columns so the name states what the value 1 means.
flag_names = {'sex': 'is_female', 'smoker': 'is_smoker'}
df_clean = df_clean.rename(columns=flag_names)

In [None]:
# Display the frame after the rename.
df_clean

In [None]:
# Region frequencies in the de-duplicated working frame, i.e. the data that is
# one-hot encoded in the next step (the raw `df` still contains duplicate rows).
df_clean['region'].value_counts()

In [None]:
# One-hot encode `region`; drop_first removes the first category so the
# remaining indicator columns are not perfectly collinear.
df_clean = pd.get_dummies(
    data=df_clean,
    columns=['region'],
    drop_first=True,
)

In [None]:
# Check the new region indicator columns.
df_clean.head()

In [None]:
# Convert only the one-hot indicator columns to 0/1 integers.
# A blanket `astype(int)` on the whole frame would also truncate the fractional
# part of every float column (e.g. `bmi` and `charges`, if they hold decimals).
indicator_cols = df_clean.select_dtypes(include=['bool', 'uint8']).columns
df_clean = df_clean.astype({name: int for name in indicator_cols})

In [None]:
# Display the frame after the integer cast.
df_clean

# Feature Engineering and Extraction

In [None]:
# Distribution of the raw BMI values, as a reference for the binning below.
sns.histplot(data=df, x='bmi')

In [None]:
# Bucket BMI into four weight classes using left-closed intervals:
#   [0, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, inf) Obese.
# Edges of 24.9 / 29.9 with the default right-closed intervals would place
# values such as 24.95 or 29.95 in the next class up and exactly 18.5 in
# Underweight. The label order is unchanged, so a later
# get_dummies(drop_first=True) still drops `Underweight`.
df_clean['bmi_category'] = pd.cut(
    df_clean['bmi'],
    bins=[0, 18.5, 25, 30, float('inf')],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

In [None]:
# Display the frame with the new `bmi_category` column.
df_clean

In [None]:
# One-hot encode `bmi_category`; drop_first removes the first category
# (`Underweight`), leaving the Normal / Overweight / Obese indicators.
df_clean = pd.get_dummies(
    data=df_clean,
    columns=['bmi_category'],
    drop_first=True,
)

In [None]:
# Convert only the one-hot indicator columns to 0/1 integers.
# A blanket `astype(int)` on the whole frame would also truncate the fractional
# part of every float column (e.g. `bmi` and `charges`, if they hold decimals).
indicator_cols = df_clean.select_dtypes(include=['bool', 'uint8']).columns
df_clean = df_clean.astype({name: int for name in indicator_cols})

In [None]:
# Check the encoded BMI category columns.
df_clean.head()

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the listed predictors to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# follows, fit it on the training rows only to avoid leakage.
cols = ['age', 'bmi', 'children']
scaler = StandardScaler()
scaler.fit(df_clean[cols])
df_clean[cols] = scaler.transform(df_clean[cols])

In [None]:
# Check the scaled columns.
df_clean.head()

# Feature Selection

In [None]:
# Feature relevance checks using SciPy

In [None]:
# Rank each candidate feature by its Pearson correlation with the target
# column `charges`, to see which features are most strongly related to it.

from scipy.stats import pearsonr

# Features to check against the target.
selected_features = [
    'age' , 'bmi' , 'children' , 'is_female' , 'is_smoker' ,
    'region_northwest' , 'region_southeast' , 'region_southwest' ,
    'bmi_category_Normal' , 'bmi_category_Overweight' ,'bmi_category_Obese'
]

# pearsonr returns (coefficient, p-value); only the coefficient is kept.
correlations = {
    feature: pearsonr(df_clean[feature], df_clean['charges'])[0]
    for feature in selected_features
}

# Column label corrected from the misspelled 'Person Corelation'.
corelation_df = pd.DataFrame(
    list(correlations.items()),
    columns=['Feature', 'Pearson Correlation'],
)
corelation_df.sort_values(by='Pearson Correlation', ascending=False)

In [None]:
# Binary / one-hot columns that are screened with the chi-square test.
cat_features = [
    # encoded flags
    'is_female',
    'is_smoker',
    # region indicators
    'region_northwest',
    'region_southeast',
    'region_southwest',
    # BMI class indicators
    'bmi_category_Normal',
    'bmi_category_Overweight',
    'bmi_category_Obese',
]

In [None]:
from scipy.stats import chi2_contingency

# Significance level for the independence test.
alpha = 0.05

# The chi-square test needs a categorical target, so `charges` is split into
# quartile bins (integer codes 0-3) stored in a helper column.
df_clean['charges_bin'] = pd.qcut(df_clean['charges'], q=4, labels=False)

# feature name -> test statistic, p-value and keep/drop decision
chi2_results = {}
for feature in cat_features:
    table = pd.crosstab(df_clean[feature], df_clean['charges_bin'])
    statistic, p_value, _dof, _expected = chi2_contingency(table)
    if p_value < alpha:
        verdict = 'Reject Null (Keep Feature)'
    else:
        verdict = 'Accept Null (Drop Feature)'
    chi2_results[feature] = {
        'chi2_statistic': statistic,
        'p_value': p_value,
        'Decision': verdict,
    }

# One row per feature, smallest p-value first.
chi2_df = pd.DataFrame(chi2_results).transpose()
chi2_df = chi2_df.sort_values(by='p_value')
chi2_df


In [None]:
# Subset of columns carried forward, including the target `charges`.
# `.copy()` makes final_df an independent frame, so later column assignments on
# it do not trigger SettingWithCopyWarning or depend on df_clean.
final_columns = [
    'age', 'is_female', 'bmi', 'children', 'is_smoker',
    'charges', 'region_southeast', 'bmi_category_Obese',
]
final_df = df_clean[final_columns].copy()

In [None]:
# Inspect the selected feature set.
final_df

!git remote add origin git@github.com:arka-codes264/Insurance-Predictor-.git
!git branch -M main
!git push -u origin main
