<a href="https://colab.research.google.com/github/anjali-sharma-27/Health-Insurance-Price-Prediction/blob/main/HealthInsurancePricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import joblib  # for saving the model

# Step 2: Load dataset
# NOTE: '/content/' is the Colab working directory; adjust the path when
# running this script outside Colab.
insurance_dataset = pd.read_csv('/content/insurance.csv')

# Step 3: Basic info
print(insurance_dataset.head())
print(insurance_dataset.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
insurance_dataset.info()
print(insurance_dataset.isnull().sum())
print(insurance_dataset.describe())

# Step 4: Data Visualization (optional)
# [Skipping plots here for brevity; they don't affect model saving]

# Step 5: Encode categorical columns
# Each text category is mapped to an integer code. Series.map is used instead
# of DataFrame.replace(..., inplace=True) because replacing values in object
# columns relies on implicit downcasting, which recent pandas versions warn
# about as deprecated.
categorical_encodings = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}
for column, mapping in categorical_encodings.items():
    encoded = insurance_dataset[column].map(mapping)
    # map() turns any value missing from the mapping into NaN; fail loudly
    # here rather than letting the model fit on silently corrupted data.
    unmapped = encoded.isnull()
    if unmapped.any():
        unexpected = insurance_dataset.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column '{column}' contains values with no encoding: {unexpected}"
        )
    insurance_dataset[column] = encoded

# Step 6: Split into X and Y
# X holds every feature column; Y is the regression target ('charges').
X = insurance_dataset.drop(columns='charges')
Y = insurance_dataset['charges']

# Step 7: Train/Test split
# 80/20 split; random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# Step 8: Train model
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Step 9: Evaluate
train_prediction = regressor.predict(X_train)
r2_train = metrics.r2_score(Y_train, train_prediction)
print("R² Score (Train):", r2_train)

test_prediction = regressor.predict(X_test)
r2_test = metrics.r2_score(Y_test, test_prediction)
print("R² Score (Test):", r2_test)

# Step 10: Test with custom input
# Values use the same integer encodings as Step 5 and the same column order
# as X.
input_data = (25, 0, 22, 1, 0, 3)  # age, sex, bmi, children, smoker, region
input_data_np = np.asarray(input_data).reshape(1, -1)
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# with the training column names; predicting on a bare ndarray makes
# scikit-learn warn that X does not have valid feature names.
input_frame = pd.DataFrame(input_data_np, columns=X.columns)
prediction = regressor.predict(input_frame)
print("Predicted Insurance Cost: USD", prediction[0])

# ✅ Step 11: Save the model (MAKE SURE it's in the same folder as your Flask backend)
# The file is written relative to the current working directory.
joblib.dump(regressor, 'insurance_model.pkl')
print("✅ Model saved successfully as insurance_model.pkl")


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
(1338, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None
age         0
sex         0
bmi         0
children    0
smoker      0
region     

  insurance_dataset.replace({'sex': {'male': 0, 'female': 1}}, inplace=True)
  insurance_dataset.replace({'smoker': {'yes': 0, 'no': 1}}, inplace=True)
  insurance_dataset.replace({'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3}}, inplace=True)
