#### Creating **custom dataset** for the model

In [None]:
# example implementation framework

import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Defining dataset processing class
class WaterDataset(Dataset):
    """Map-style dataset backed by a CSV file that is loaded fully into memory.

    Every column except the last is returned as the feature vector and the
    last column is returned as the label.

    Args:
        csv_path: Path of the CSV file, read with ``pandas.read_csv``.
    """

    def __init__(self, csv_path):
        super().__init__()
        df = pd.read_csv(csv_path)
        # pandas parses numeric columns as float64/int64, whereas torch layers
        # hold float32 parameters by default. Cast once here so that batches
        # can be passed to a model and to a loss without a dtype mismatch.
        self.data = df.to_numpy(dtype="float32")

    def __len__(self):
        """Return the number of rows (samples) read from the CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        ``features`` holds every column but the last; ``label`` is the
        value in the last column.
        """
        features = self.data[idx, :-1]
        label = self.data[idx, -1]
        return features, label


# Build the training dataset from the CSV file on disk.
dataset_train = WaterDataset('water_train.csv')

# Wrap the dataset in a DataLoader that serves shuffled mini-batches of two samples.
dataloader_train = DataLoader(dataset_train, batch_size=2, shuffle=True)

# Pull a single batch to inspect what the loader yields.
batch_iterator = iter(dataloader_train)
features, labels = next(batch_iterator)
print(f'Features {features}\nLabels: {labels}')

#### **Sequential** VS **Class-based** model definition

In [None]:
# Sequential Model Definition
import torch.nn as nn

# Fully connected stack 9 -> 16 -> 8 -> 1; ReLU after the first two linear
# layers and a sigmoid after the last one.
layers = [
    nn.Linear(9, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
    nn.Sigmoid(),
]
net = nn.Sequential(*layers)

In [1]:
# Class-based model definition
import torch.nn as nn

class Net(nn.Module):
    """Three-layer fully connected network: 9 -> 16 -> 8 -> 1.

    ReLU follows the first two linear layers and a sigmoid follows the
    last one, so every output value lies between 0 and 1.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=9, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=8)
        self.fc3 = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> sigmoid."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return nn.functional.sigmoid(self.fc3(hidden))


# Instantiate the class-based model.
net = Net()

#### Training Loop

In [None]:
# Example

import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy (BCELoss) is commonly used for binary classification;
# plain SGD with a learning rate of 0.01 updates the network's parameters.
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

for epoch in range(10000):
    for features, labels in dataloader_train:
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        outputs = net(features)
        # Reshape the labels into a column before comparing them with the outputs.
        targets = labels.view(-1, 1)
        loss = criterion(outputs, targets)

        # Backpropagate the loss, then apply one optimizer step.
        loss.backward()
        optimizer.step()



#### Model Evaluation 

In [None]:
import torch
from torchmetrics import Accuracy

# Binary-accuracy metric object; it is fed one batch at a time below.
acc = Accuracy(task='binary')

# Switch the model to evaluation mode and disable gradient tracking.
# NOTE: the loop below scores the model on dataloader_train, i.e. the same
# loader that was used for training.
net.eval()
with torch.no_grad():
    for features, labels in dataloader_train:
        outputs = net(features)
        # Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
        preds = (outputs >= 0.5).float()
        targets = labels.view(-1, 1)
        acc(preds, targets)

# Aggregate the metric over all batches passed to `acc` above.
accuracy = acc.compute()
print(f'Accuracy: {accuracy}')

#### Vanishing and Exploding Gradients

Gradients that get smaller and smaller during the backward pass are called vanishing gradients. Here, the earlier layers receive very small parameter updates, so the model doesn't learn.

Gradients that get bigger and bigger are called exploding gradients. Here, the parameter updates are too large and the training diverges.

To address this problem, we need :-
- Proper weight Initialization
- Good Activations
- Batch Normalization

1. **Proper weight Initialization**
    - ensures that the variance of layer inputs = variance of layer outputs
    - variance of gradients is the same before and after a layer.
    - We can use He/Kaiming weight initialization for ReLU and similar activation functions.
    - e.g. :-
        ```py
        import torch.nn.init as init

        init.kaiming_uniform_(layer.weight)
        print(layer.weight)
        ```
    - for a class-based model (inside `__init__`) :-
        ```py
        init.kaiming_uniform_(self.fc1.weight)
        init.kaiming_uniform_(self.fc2.weight)
        init.kaiming_uniform_(self.fc3.weight, nonlinearity='sigmoid')
        ```

In [None]:
# He/Kaiming Weight initialization implementation-
import torch.nn as nn
import torch.nn.init as init

class Net(nn.Module):
    """9 -> 16 -> 8 -> 1 fully connected network with He/Kaiming-initialised weights.

    ReLU follows the first two linear layers and a sigmoid follows the last.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=9, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=8)
        self.fc3 = nn.Linear(in_features=8, out_features=1)

        # The two layers that are followed by ReLU use kaiming_uniform_ with
        # its default nonlinearity setting.
        for relu_layer in (self.fc1, self.fc2):
            init.kaiming_uniform_(relu_layer.weight)
        # The last layer feeds a sigmoid, so that nonlinearity is named explicitly.
        init.kaiming_uniform_(self.fc3.weight, nonlinearity='sigmoid')

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> sigmoid."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return nn.functional.sigmoid(self.fc3(hidden))


2. **Activation Functions**
   - ReLU is often used as the default activation 
   - It can be called using `nn.functional.relu()`
   - Drawback - it suffers from the dying neurons problem (its output and gradient are zero for negative inputs)
   - ELU (Exponential Linear Unit) is an alternative that avoids the dying neurons problem. 
   - Its non-zero gradients for negative values help against dying neurons.
   - Its average output is near zero, so it helps against vanishing gradients.
   - It can be called using `nn.functional.elu()`

3. **Batch Normalization** - After adding a layer, follow the steps -

   1.  Normalize the layer's outputs by 
       - subtracting the mean
       - dividing by the standard deviation
   
   2. Scale and shift normalized outputs using learned parameters
      -   Model learns the optimal input distribution for each layer, which helps with
          -   faster loss decrease,
          -   protection against unstable gradients

In [None]:
# Batch Normalization example -

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(9, 16)
        self.bn1 = nn.BatchNorm1d(16)
        # ...

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.functional.elu(x)        