In [1]:
import pandas as pd

df = pd.read_csv("D:\INNOVERA\medical_in/insurance.csv")
print(df.head())  # Display first 5 rows


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [3]:
print(df.info())  # Overview of dataset (data types, null values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None


In [4]:
print(df.describe())  # Statistical summary of numerical columns

               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [5]:
print(df.head())  # First 5 rows

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [6]:
print(df['sex'].unique())
print(df['smoker'].unique())
print(df['region'].unique())

['female' 'male']
['yes' 'no']
['southwest' 'southeast' 'northwest' 'northeast']


In [8]:
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
print(df.head())

   age     bmi  children      charges  sex_male  smoker_yes  region_northwest  \
0   19  27.900         0  16884.92400     False        True             False   
1   18  33.770         1   1725.55230      True       False             False   
2   28  33.000         3   4449.46200      True       False             False   
3   33  22.705         0  21984.47061      True       False              True   
4   32  28.880         0   3866.85520      True       False              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[['age', 'bmi', 'charges']] = scaler.fit_transform(df[['age', 'bmi', 'charges']])
print(df.head())

        age       bmi  children   charges  sex_male  smoker_yes  \
0  0.021739  0.321227         0  0.251611     False        True   
1  0.000000  0.479150         1  0.009636      True       False   
2  0.217391  0.458434         3  0.053115      True       False   
3  0.326087  0.181464         0  0.333010      True       False   
4  0.304348  0.347592         0  0.043816      True       False   

   region_northwest  region_southeast  region_southwest  
0             False             False              True  
1             False              True             False  
2             False              True             False  
3              True             False             False  
4              True             False             False  


In [10]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['charges'])  # Features
y = df['charges']  # Target variable (insurance cost)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared Score:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.04085016919100503
R-squared Score: 0.8655184426538141


In [12]:
def recommend_policy(predicted_cost):
    if predicted_cost < 5000:
        return "Basic Health Plan"
    elif predicted_cost < 15000:
        return "Comprehensive Plan"
    else:
        return "Premium Plan with Critical Illness Cover"

In [13]:
sample_user = [[40, 28, 1, 0, 1, 0, 0, 1]]  # Example: 40-year-old, BMI=28, Male, Smoker
predicted_cost = model.predict(sample_user)[0]
policy = recommend_policy(predicted_cost)

print(f"Predicted Insurance Cost: {predicted_cost}")
print(f"Recommended Policy: {policy}")

Predicted Insurance Cost: 0.7565360294633622
Recommended Policy: Basic Health Plan




In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load dataset (make sure you have 'insurance.csv' in your working directory)
df = pd.read_csv("insurance.csv")

# Convert categorical variables
df["sex_male"] = df["sex"].apply(lambda x: 1 if x == "male" else 0)
df["smoker_yes"] = df["smoker"].apply(lambda x: 1 if x == "yes" else 0)
df["region_northwest"] = df["region"].apply(lambda x: 1 if x == "northwest" else 0)
df["region_southeast"] = df["region"].apply(lambda x: 1 if x == "southeast" else 0)
df["region_southwest"] = df["region"].apply(lambda x: 1 if x == "southwest" else 0)

# Select features & target variable
X = df[["age", "bmi", "sex_male", "smoker_yes", "region_northwest", "region_southeast", "region_southwest"]]
y = df["charges"]

# Train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Save model as .pkl file
joblib.dump(model, "insurance_model.pkl")

print("✅ Model saved successfully as 'insurance_model.pkl'")


✅ Model saved successfully as 'insurance_model.pkl'
