In [101]:
import pandas as pd
import numpy as np

In [102]:
df = pd.read_csv("insurance.csv")

In [3]:
print(df)

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]


In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


Checking NaN Value

In [103]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

Encoding the Categorical Data

1.) Using Map

In [104]:
 df["sex"] = df["sex"].map({"male":0, "female":1})

In [55]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,yes,southwest,16884.924
1,18,0,33.77,1,no,southeast,1725.5523
2,28,0,33.0,3,no,southeast,4449.462
3,33,0,22.705,0,no,northwest,21984.47061
4,32,0,28.88,0,no,northwest,3866.8552


In [105]:
df["smoker"] = df["smoker"].map({"no":0, "yes":1})

In [57]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,0,33.77,1,0,southeast,1725.5523
2,28,0,33.0,3,0,southeast,4449.462
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.88,0,0,northwest,3866.8552


2.) Using Label Encoder

In [106]:
from sklearn.preprocessing import LabelEncoder

In [107]:
df["region"] = LabelEncoder().fit_transform(df["region"])

In [60]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,3,16884.924
1,18,0,33.77,1,0,2,1725.5523
2,28,0,33.0,3,0,2,4449.462
3,33,0,22.705,0,0,1,21984.47061
4,32,0,28.88,0,0,1,3866.8552
5,31,1,25.74,0,0,2,3756.6216
6,46,1,33.44,1,0,2,8240.5896
7,37,1,27.74,3,0,1,7281.5056
8,37,0,29.83,2,0,0,6406.4107
9,60,1,25.84,0,0,1,28923.13692


Normalization

In [108]:
from sklearn.preprocessing import MinMaxScaler

In [109]:
df2 = df.copy()

In [110]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,3,16884.924
1,18,0,33.77,1,0,2,1725.5523
2,28,0,33.0,3,0,2,4449.462
3,33,0,22.705,0,0,1,21984.47061
4,32,0,28.88,0,0,1,3866.8552


In [111]:
df2["age"] = MinMaxScaler().fit_transform(df2[["age"]])

In [65]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0.021739,1,27.9,0,1,3,16884.924
1,0.0,0,33.77,1,0,2,1725.5523
2,0.217391,0,33.0,3,0,2,4449.462
3,0.326087,0,22.705,0,0,1,21984.47061
4,0.304348,0,28.88,0,0,1,3866.8552


In [112]:
df2["bmi"] = MinMaxScaler().fit_transform(df2[["bmi"]])

In [67]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0.021739,1,0.321227,0,1,3,16884.924
1,0.0,0,0.47915,1,0,2,1725.5523
2,0.217391,0,0.458434,3,0,2,4449.462
3,0.326087,0,0.181464,0,0,1,21984.47061
4,0.304348,0,0.347592,0,0,1,3866.8552


In [113]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   float64
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int32  
 6   charges   1338 non-null   float64
dtypes: float64(3), int32(1), int64(3)
memory usage: 68.1 KB


In [114]:
# Separate features and target 
features = ["age", "sex", "bmi", "smoker", "region"]
X = df2[features].values
Y = df2["charges"].values

In [115]:
df2[features]

Unnamed: 0,age,sex,bmi,smoker,region
0,0.021739,1,0.321227,1,3
1,0.000000,0,0.479150,0,2
2,0.217391,0,0.458434,0,2
3,0.326087,0,0.181464,0,1
4,0.304348,0,0.347592,0,1
...,...,...,...,...,...
1333,0.695652,0,0.403820,0,1
1334,0.000000,1,0.429379,0,0
1335,0.000000,1,0.562012,0,2
1336,0.065217,1,0.264730,0,3


In [116]:
# Get the values in the numpy array
df2[features].values

array([[0.02173913, 1.        , 0.3212268 , 1.        , 3.        ],
       [0.        , 0.        , 0.47914985, 0.        , 2.        ],
       [0.2173913 , 0.        , 0.45843422, 0.        , 2.        ],
       ...,
       [0.        , 1.        , 0.56201238, 0.        , 2.        ],
       [0.06521739, 1.        , 0.26472962, 0.        , 3.        ],
       [0.93478261, 1.        , 0.35270379, 1.        , 1.        ]])

In [117]:
# Printing the features and labels
print("Features:", X[:10], '\nLabels:', Y[:10], sep='\n')

Features:
[[0.02173913 1.         0.3212268  1.         3.        ]
 [0.         0.         0.47914985 0.         2.        ]
 [0.2173913  0.         0.45843422 0.         2.        ]
 [0.32608696 0.         0.18146355 0.         1.        ]
 [0.30434783 0.         0.34759214 0.         1.        ]
 [0.2826087  1.         0.26311542 0.         2.        ]
 [0.60869565 1.         0.47027172 0.         2.        ]
 [0.41304348 1.         0.31692225 0.         1.        ]
 [0.41304348 0.         0.37315039 0.         0.        ]
 [0.91304348 1.         0.26580576 0.         1.        ]]

Labels:
[16884.924    1725.5523   4449.462   21984.47061  3866.8552   3756.6216
  8240.5896   7281.5056   6406.4107  28923.13692]


In [118]:
from sklearn.model_selection import train_test_split

In [79]:
# Spilt data 70%-30% into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30)

# '''
# X_train: training data feature
# X_test: testing data feature
# Y_train: training data target
# Y_test: testing data target
# '''

In [119]:
Y_test

array([9.71709242e-02, 1.90268511e-01, 8.35517575e-02, 8.13736897e-03,
       9.70465645e-02, 7.98605183e-03, 1.15127695e-01, 7.59022863e-01,
       5.90666028e-01, 2.34580223e-01, 1.64922363e-01, 9.74697954e-02,
       4.97700034e-02, 1.62248993e-01, 8.10006579e-02, 1.11644954e-01,
       2.14570156e-03, 1.19465542e-01, 6.19473004e-02, 1.80837821e-02,
       9.67158953e-03, 1.73319810e-02, 3.44460408e-02, 3.94629531e-02,
       8.84995540e-02, 4.37254425e-02, 3.28011082e-02, 1.28845186e-01,
       1.61720366e-01, 1.39728787e-01, 4.55086449e-02, 3.86464768e-01,
       6.80485987e-01, 1.10877437e-01, 8.73385584e-02, 1.62016245e-02,
       5.83848814e-02, 1.59041386e-01, 1.15029806e-02, 8.69796682e-02,
       8.62167195e-01, 9.37055704e-03, 1.60286127e-01, 6.18527130e-01,
       8.16686701e-02, 7.25910361e-01, 1.65854284e-01, 1.81543167e-01,
       1.65379091e-01, 9.93352550e-02, 1.11651517e-01, 1.71125256e-01,
       1.46904142e-01, 1.82800310e-01, 4.51014272e-02, 1.96800202e-01,
      

In [120]:
# Train the model
from sklearn.linear_model import LinearRegression 

In [121]:
# Fit a linear regression model on the training set

model = LinearRegression().fit(X_train, Y_train)
print(model)

LinearRegression()


Evaluate the train model

In [122]:
import numpy as np
predictions = model.predict(X_test)

In [123]:
# Predicted
print("Predicted labels: ", np.round(predictions, 0)[:10])
print("Actual labels: ", Y_test[:10])

Predicted labels:  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Actual labels:  [0.09717092 0.19026851 0.08355176 0.00813737 0.09704656 0.00798605
 0.1151277  0.75902286 0.59066603 0.23458022]


In [124]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(Y_test, predictions)
print("MSE:", mse)

rmse = np.sqrt(mse)
print("RMSE:", rmse)

r2 = r2_score(Y_test, predictions)
print("R2:", r2*100)

MSE: 0.008904997226378154
RMSE: 0.09436629285066864
R2: 75.16282039732984


Used the Trained Model

In [125]:
model

In [126]:
pwd

'C:\\Users\\Lenovo\\OneDrive\\Desktop\\Juypter_projects'

In [127]:
import joblib

In [128]:
# Save the model as pickel file
joblib.dump(model, "medical_insurance_model.pkl")

['medical_insurance_model.pkl']

In [129]:
# load the model from the file
loaded_model = joblib.load("medical_insurance_model.pkl")

In [130]:
X_train[0]

array([0.80434783, 1.        , 0.39117568, 0.        , 3.        ])

In [131]:
df2.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0.021739,1,0.321227,0,1,3,16884.924


In [132]:
age = float(input("Enter the age: "))
sex = float(input("Enter the sex: " ))
bmi = float(input("Enter the bmi: "))
smoker = float(input("Enter whether you are smoker or not: "))
region = float(input("Enter the region you belong: "))

Enter the age:  20
Enter the sex:  1
Enter the bmi:  19
Enter whether you are smoker or not:  1
Enter the region you belong:  0


In [133]:
X_new = np.array([[age, sex, bmi, smoker, region]])

In [134]:
X_new

array([[20.,  1., 19.,  1.,  0.]])

In [135]:
result = loaded_model.predict(X_new)
result

array([7.94920848])