###### 1.Data Gathering
###### 2.Data Preprocessing
###### 3.Feature Engineering
###### 4.Model Training
###### 5.Testing

###### Dataset used: https://www.kaggle.com/datasets/mirichoi0218/insurance

In [44]:
import pandas as pd
# !conda install -c conda-forge pandas -y



In [45]:
df = pd.read_csv("../Linear Regression/insurance.csv")

In [46]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [48]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [49]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [50]:
# !conda install -c conda-forge scikit-learn -y

In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

###### Split dataset before encoding

In [52]:
train_df,test_df = train_test_split(df,test_size=0.2,random_state=42)

###### Encode categorical variable

In [53]:
label_encoder = {}
for col in ['sex','smoker','region']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    label_encoder[col] = le

###### Features and Targets

In [54]:
X_train = train_df.drop(columns=['charges'])
y_train = train_df['charges']

X_test = test_df.drop(columns=['charges'])
y_test = test_df['charges']

In [55]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,0,19.95,2,0,1
1285,47,0,24.32,0,0,0
1142,52,0,24.86,0,0,2
969,39,0,34.32,5,0,2
486,54,0,21.47,3,0,1


In [56]:
y_train.head()

560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64

###### Normalize Features

In [57]:
scaler = StandardScaler()


X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [58]:
X_train

array([[ 0.47222651, -1.0246016 , -1.75652513,  0.73433626, -0.50874702,
        -0.45611589],
       [ 0.54331294, -1.0246016 , -1.03308239, -0.91119211, -0.50874702,
        -1.35325561],
       [ 0.8987451 , -1.0246016 , -0.94368672, -0.91119211, -0.50874702,
         0.44102382],
       ...,
       [ 1.3252637 ,  0.97598911, -0.89153925, -0.91119211, -0.50874702,
        -1.35325561],
       [-0.16755139, -1.0246016 ,  2.82086429,  0.73433626,  1.96561348,
         1.33816354],
       [ 1.1120044 ,  0.97598911, -0.10932713, -0.91119211, -0.50874702,
         1.33816354]])

###### Convert to tensors

In [59]:
X_train_tensor = torch.tensor(X_train,dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values,dtype=torch.float32).view(-1,1)
X_test_tensor = torch.tensor(X_test,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values,dtype=torch.float32).view(-1,1)

In [60]:
y_train_tensor.shape

torch.Size([1070, 1])

In [61]:
X_test.shape

(268, 6)

###### Define Neural Network Model

In [62]:
class SimpleNNRegressionModel(nn.Module):
    def __init__(self,input_dim):
        super(SimpleNNRegressionModel,self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim,64),
            nn.ReLU(),
            nn.Linear(64,128),
            nn.ReLU(),
            nn.Linear(128,1)
        )    

    def forward(self,x):
        return self.network(x)
    

In [63]:
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim=input_dim)

In [64]:
model

SimpleNNRegressionModel(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

##### Loss and Optimizers

In [65]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr=0.01)



######

###### x_train_tensor = 1000000 ---> 10gb --> OOM(out of memory)
###### 1000000 ---> weights and bias calculation

###### we are teaching human: A book of 1000 pages --> student: 10 : student, i'm not able to understand.
###### 1000
###### 10 --> 100
###### 100 feedback = 1000 pages --> epoch

###### 1000 pages total
###### 100 epoch
###### 10 pages feedback
###### 100 iteration * 100

######

##### Training Loop

In [66]:
# epochs = 30000

# for epoch in range(epochs):
#     model.train()
#     optimizer.zero_grad()
#     prediction = model(X_train_tensor)
#     loss = criterion(prediction,y_train_tensor)
#     loss.backward()


#     optimizer.step()

#     if (epoch+1) % 10 == 0:
#         print(f"Epoch [{epoch+1}/{epochs}],Loss:{loss.item():.4f}")

##### Understanding Components of a Custom DataLoader in PyTorch

###### 1. Dataset (torch.utils.data.Dataset)
###### 2. DataLoader (torch.utils.data.DataLoader)


#### Creating our own Custom Dataset in pytorch
###### init() --> initialised the dataset,loads data,applied preprocessing 
###### len() -->  return the total number of samples in the dataset
###### getitem() --> Defines how to retrieve a single data sample when an index is provide

In [67]:
import torch
from torch.utils.data import Dataset,DataLoader

In [68]:
class InsuranceDataSet(Dataset):
    def __init__(self,X,y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        features = torch.tensor(self.X[idx],dtype=torch.float32)
        target = torch.tensor(self.y.values[idx],dtype=torch.float32)
        return features,target

In [69]:
dataset = InsuranceDataSet(X_train,y_train)

In [None]:
dataloader = DataLoader(dataset,batch_size=32,shuffle=True,num_workers=4)

In [71]:
for batch_idx,(features,targets) in enumerate(dataloader):
    print(f"Batch {batch_idx+1}:")
    print("Features:",features.shape)
    print("Targets",targets.shape)
    # if batch_idx == 5:
    #     break

Batch 1:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 2:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 3:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 4:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 5:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 6:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 7:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 8:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 9:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 10:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 11:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 12:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 13:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 14:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 15:
Features: torch.Size([32, 6])
Targets torch.Size([32])
Batch 16:
Features: torch.Size([32

In [73]:
epochs = 1000

for epoch in range(epochs):
    model.train()

    for batch_idx,(batch_X, batch_y) in enumerate(dataloader):
        print(f"Current batch: {batch_idx}")
        optimizer.zero_grad()
        prediction = model(batch_X)
        loss = criterion(prediction,batch_y)
        loss.backward()


        optimizer.step()

        print(f"Batch [{batch_idx+1}/{epochs}],Loss:{loss.item():.4f}")

        if (epoch+1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}],Loss:{loss.item():.4f}")

Current batch: 0
Batch [1/1000],Loss:225601424.0000
Current batch: 1
Batch [2/1000],Loss:150816080.0000
Current batch: 2
Batch [3/1000],Loss:124603984.0000
Current batch: 3
Batch [4/1000],Loss:124175696.0000
Current batch: 4
Batch [5/1000],Loss:172308400.0000
Current batch: 5
Batch [6/1000],Loss:214159696.0000
Current batch: 6
Batch [7/1000],Loss:192323776.0000
Current batch: 7
Batch [8/1000],Loss:90564720.0000
Current batch: 8
Batch [9/1000],Loss:143669408.0000
Current batch: 9
Batch [10/1000],Loss:171751520.0000
Current batch: 10
Batch [11/1000],Loss:123288056.0000
Current batch: 11
Batch [12/1000],Loss:97344864.0000
Current batch: 12
Batch [13/1000],Loss:144550544.0000
Current batch: 13
Batch [14/1000],Loss:182598864.0000
Current batch: 14
Batch [15/1000],Loss:182096048.0000
Current batch: 15
Batch [16/1000],Loss:155770496.0000
Current batch: 16
Batch [17/1000],Loss:177026560.0000
Current batch: 17
Batch [18/1000],Loss:127618968.0000
Current batch: 18
Batch [19/1000],Loss:61929484.0