###### 1.Data Gathering
###### 2.Data Preprocessing
###### 3.Feature Engineering
###### 4.Model Training
###### 5.Testing

###### Dataset used: https://www.kaggle.com/datasets/mirichoi0218/insurance

In [87]:
import pandas as pd
# !conda install -c conda-forge pandas -y



In [88]:
df = pd.read_csv("insurance.csv")

In [89]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [91]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [92]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [93]:
# !conda install -c conda-forge scikit-learn -y

In [94]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

###### Split dataset before encoding

In [95]:
train_df,test_df = train_test_split(df,test_size=0.2,random_state=42)

###### Encode categorical variable

In [96]:
label_encoder = {}
for col in ['sex','smoker','region']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoder[col] = le

###### Features and Targets

In [97]:
X_train = train_df.drop(columns=['charges'])
y_train = train_df['charges']

X_test = test_df.drop(columns=['charges'])
y_test = test_df['charges']

In [98]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,0,19.95,2,0,1
1285,47,0,24.32,0,0,0
1142,52,0,24.86,0,0,2
969,39,0,34.32,5,0,2
486,54,0,21.47,3,0,1


In [99]:
y_train.head()

560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64

###### Normalize Features

In [100]:
scaler = StandardScaler()


X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [101]:
X_train

array([[ 0.47222651, -1.0246016 , -1.75652513,  0.73433626, -0.50874702,
        -0.45611589],
       [ 0.54331294, -1.0246016 , -1.03308239, -0.91119211, -0.50874702,
        -1.35325561],
       [ 0.8987451 , -1.0246016 , -0.94368672, -0.91119211, -0.50874702,
         0.44102382],
       ...,
       [ 1.3252637 ,  0.97598911, -0.89153925, -0.91119211, -0.50874702,
        -1.35325561],
       [-0.16755139, -1.0246016 ,  2.82086429,  0.73433626,  1.96561348,
         1.33816354],
       [ 1.1120044 ,  0.97598911, -0.10932713, -0.91119211, -0.50874702,
         1.33816354]])

###### Convert to tensors

In [102]:
X_train_tensor = torch.tensor(X_train,dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values,dtype=torch.float32).view(-1,1)
X_test_tensor = torch.tensor(X_test,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values,dtype=torch.float32).view(-1,1)

In [103]:
y_train_tensor.shape

torch.Size([1070, 1])

In [104]:
X_test.shape

(268, 6)

###### Define Neural Network Model

In [105]:
class SimpleNNRegressionModel(nn.Module):
    def __init__(self,input_dim):
        super(SimpleNNRegressionModel,self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim,64),
            nn.ReLU(),
            nn.Linear(64,128),
            nn.ReLU(),
            nn.Linear(128,1)
        )    

    def forward(self,x):
        return self.network(x)
    

In [106]:
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim=input_dim)

In [107]:
model

SimpleNNRegressionModel(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

##### Loss and Optimizers

In [108]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr=0.01)



##### Training Loop

In [109]:
epochs = 30000

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    prediction = model(X_train_tensor)
    loss = criterion(prediction,y_train_tensor)
    loss.backward()


    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}],Loss:{loss.item():.4f}")

Epoch [10/30000],Loss:322032544.0000
Epoch [20/30000],Loss:319790592.0000
Epoch [30/30000],Loss:312943040.0000
Epoch [40/30000],Loss:297053824.0000
Epoch [50/30000],Loss:266801616.0000
Epoch [60/30000],Loss:218520576.0000
Epoch [70/30000],Loss:155329600.0000
Epoch [80/30000],Loss:93189792.0000
Epoch [90/30000],Loss:56894616.0000
Epoch [100/30000],Loss:47253616.0000
Epoch [110/30000],Loss:41076024.0000
Epoch [120/30000],Loss:36539456.0000
Epoch [130/30000],Loss:34696500.0000
Epoch [140/30000],Loss:33766104.0000
Epoch [150/30000],Loss:33257840.0000
Epoch [160/30000],Loss:32902838.0000
Epoch [170/30000],Loss:32576728.0000
Epoch [180/30000],Loss:32263748.0000
Epoch [190/30000],Loss:31960654.0000
Epoch [200/30000],Loss:31661976.0000
Epoch [210/30000],Loss:31359284.0000
Epoch [220/30000],Loss:31054754.0000
Epoch [230/30000],Loss:30750922.0000
Epoch [240/30000],Loss:30447078.0000
Epoch [250/30000],Loss:30149718.0000
Epoch [260/30000],Loss:29855974.0000
Epoch [270/30000],Loss:29571212.0000
Epo

##### Model Evaluation

In [110]:
model.eval()
y_pred = model(X_test_tensor).detach().numpy()

In [111]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

y_test_numpy = y_test_tensor.numpy()

## Calculate metrics

mse = mean_squared_error(y_test_numpy,y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test_numpy,y_pred)
r2 = r2_score(y_test_numpy,y_pred)



print("MSE",mse)
print("RMSE",rmse)
print("MAE",mae)
print("r2_score",r2)

MSE 62347912.0
RMSE 7896.069401923973
MAE 5175.62890625
r2_score 0.5983998775482178


In [112]:
def predict_charges(age,sex,bmi,children,smoker,region):
    input_data = pd.DataFrame([[age,sex,bmi,children,smoker,region]],
                 columns=['age','sex','bmi','children','smoker','region'])
    
    
    for col in ['sex','smoker','region']:
        input_data[col] = label_encoder[col].transform(input_data[col])
        

    input_data = scaler.transform(input_data)
    input_tensor = torch.tensor(input_data,dtype=torch.float32)
    predicted_charge = model(input_tensor).item()
    return predicted_charge

In [113]:
predicted = predict_charges(60,'female',36.005,0,'no','northeast')

In [114]:
print(f"Predicted insurance Charge is ${predicted:.2f}")

Predicted insurance Charge is $13739.09
