<a href="https://colab.research.google.com/github/and-is/learning-pytorch/blob/main/ann.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Building a simple ANN using pytorch.

Approach: \
ANN with an input layer of 784 features (one per pixel), then two hidden layers with 128 and 64 neurons, then an output layer with 10 neurons.
\
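We're using ReLU activation in the hidden layers and softmax at the output (multiclass classification). The softmax is not a separate layer in the model: the network returns raw scores (logits) and `nn.CrossEntropyLoss` applies the softmax internally.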

Method: \
Dataloader Objects Creation \
Training Loop \
Evaluation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
# Seed PyTorch's global random number generator so repeated runs draw the same random numbers.
torch.manual_seed(42)

<torch._C.Generator at 0x7ff5b73a1a90>

In [3]:
# Pick the compute device: CUDA when the runtime exposes a GPU, CPU otherwise.
# (On Colab, change the runtime type to GPU before running this cell.)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the training CSV (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('fashion-mnist_train.csv')
# Preview the first rows as the cell's output.
df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Column 0 is used as the target; every remaining column is used as an input feature.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Rescale the features by 255 (presumably raw 8-bit pixel intensities, giving values in [0, 1]).
# NOTE: this cell overwrites its own inputs, so run it exactly once per kernel session;
# a second run would divide by 255 again.
X_train, X_test = X_train / 255.0, X_test / 255.0

In [9]:
class CustomDataset(Dataset):
    """In-memory dataset pairing each feature row with its class label.

    Args:
        features: Array-like of numeric rows; converted to a ``float32`` tensor.
        labels: Array-like of integer class ids, one per row of ``features``;
            converted to a ``torch.long`` tensor.

    Raises:
        ValueError: If ``features`` and ``labels`` do not have the same length.
    """

    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

        # Fail fast on misaligned inputs: __len__ is driven by the features, so a
        # shorter label tensor would otherwise only fail later inside a DataLoader,
        # and a longer one would be silently truncated.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

In [10]:
# Wrap the training split so a DataLoader can index it sample by sample.
train_dataset = CustomDataset(features=X_train, labels=y_train)

In [11]:
# Sanity check: number of samples in the training dataset.
len(train_dataset)

48000

In [12]:
# Wrap the held-out split the same way as the training split.
test_dataset = CustomDataset(features=X_test, labels=y_test)

In [13]:
# Mini-batch loaders: training batches are reshuffled, evaluation order stays fixed.
# Tip: a larger batch size (up to 128) and pin_memory=True can also be used for better GPU training time.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [14]:
class MyNN(nn.Module):
    """Fully connected classifier: num_features -> 128 -> 64 -> 10, with ReLU between layers.

    The forward pass returns the raw 10-way scores; no softmax is applied here.
    """

    def __init__(self, num_features):
        super().__init__()
        layers = [
            nn.Linear(num_features, 128),   # first hidden layer
            nn.ReLU(),
            nn.Linear(128, 64),             # second hidden layer
            nn.ReLU(),
            nn.Linear(64, 10),              # output layer: one score per class
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.model(x)
        return scores


In [15]:
# Training hyperparameters: number of passes over the training loader, and the optimizer's learning rate.
epochs, learning_rate = 100, 0.1

In [16]:
# Build the network (input width = number of feature columns) and place it on the selected device.
model = MyNN(X_train.shape[1]).to(device)

# Cross-entropy loss computed from the model's raw class scores.
criterion = nn.CrossEntropyLoss()

# Plain stochastic gradient descent over all of the model's parameters.
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

In [23]:
# training loop
for epoch in range(epochs):
    # Put the model (back) into training mode at the start of every epoch.
    # The notebook later calls model.eval(); without this line, re-running the
    # loop afterwards would train with Dropout disabled and BatchNorm frozen
    # on its running statistics.
    model.train()

    total_epoch_loss = 0.0

    for batch_features, batch_labels in train_loader:

        # moving data to GPU (or whichever device was selected)
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        # forward pass
        outputs = model(batch_features)

        # calculate loss
        loss = criterion(outputs, batch_labels)

        # backward pass: clear stale gradients, then backpropagate this batch's loss
        optimizer.zero_grad()
        loss.backward()

        # update parameters using the freshly computed gradients
        optimizer.step()

        # .item() extracts a plain Python float, so the running sum does not keep the graph alive
        total_epoch_loss += loss.item()

    # report the mean per-batch loss for this epoch
    print(f'Epoch: {epoch+1}, Loss: {total_epoch_loss/len(train_loader)}')

Epoch: 1, Loss: 0.6280928785204888
Epoch: 2, Loss: 0.4941576734731595
Epoch: 3, Loss: 0.45573020658393704
Epoch: 4, Loss: 0.4350636695971092
Epoch: 5, Loss: 0.4201909519781669
Epoch: 6, Loss: 0.4028795042385658
Epoch: 7, Loss: 0.3912401213447253
Epoch: 8, Loss: 0.3810711074322462
Epoch: 9, Loss: 0.37584457791348297
Epoch: 10, Loss: 0.37036596979449193
Epoch: 11, Loss: 0.3614758733610312
Epoch: 12, Loss: 0.35659096115330857
Epoch: 13, Loss: 0.3522998010913531
Epoch: 14, Loss: 0.3461671963557601
Epoch: 15, Loss: 0.34683001887301607
Epoch: 16, Loss: 0.3393433167487383
Epoch: 17, Loss: 0.33526700356105965
Epoch: 18, Loss: 0.3289367815256119
Epoch: 19, Loss: 0.33349811102449894
Epoch: 20, Loss: 0.3246629723310471
Epoch: 21, Loss: 0.3270870805929105
Epoch: 22, Loss: 0.31952908780177436
Epoch: 23, Loss: 0.32093986740956704
Epoch: 24, Loss: 0.3154519301230709
Epoch: 25, Loss: 0.31319850735614696
Epoch: 26, Loss: 0.3146964717457692
Epoch: 27, Loss: 0.3122544954617818
Epoch: 28, Loss: 0.31243075

In [24]:
# Set model to evaluation mode: Dropout layers stop dropping units and BatchNorm
# layers switch to their running statistics.
model.eval()

MyNN(
  (model): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [28]:
# evaluation code: accuracy on the held-out test loader

# Make this cell self-sufficient: Dropout/BatchNorm must be in evaluation mode for
# the accuracy to be meaningful, regardless of which cell was run before this one.
model.eval()

total = 0
correct = 0
with torch.no_grad():   # no gradients are needed for inference
    for batch_features, batch_labels in test_loader:

        # moving data to GPU (or whichever device was selected)
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        outputs = model(batch_features)
        # index of the highest score along dim 1 is the predicted class
        predicted = outputs.argmax(dim=1)
        total += batch_labels.shape[0]
        correct += (predicted == batch_labels).sum().item()
print(correct/total)


# Compares the predicted labels (predicted) with the true labels (batch_labels) using the equality operator (==).
# (predicted == batch_labels) produces a tensor of Boolean values where True indicates a correct prediction.
# .sum() counts the number of True values in the tensor (i.e., the number of correct predictions in the batch).
# .item() converts the scalar tensor value to a Python integer.

0.889


In [27]:
# accuracy on the training loader, for comparison with the test accuracy

# Make this cell self-sufficient: Dropout/BatchNorm must be in evaluation mode for
# the accuracy to be meaningful, regardless of which cell was run before this one.
model.eval()

total = 0
correct = 0
with torch.no_grad():   # no gradients are needed for inference
    for batch_features, batch_labels in train_loader:

        # moving data to GPU (or whichever device was selected)
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        outputs = model(batch_features)
        # index of the highest score along dim 1 is the predicted class
        predicted = outputs.argmax(dim=1)
        total += batch_labels.shape[0]
        correct += (predicted == batch_labels).sum().item()
print(correct/total)


0.9399583333333333


## Reducing Overfitting
Our model seems to overfit: its accuracy is lower on the test data (≈0.889) than on the training data (≈0.940). \
So we need to reduce overfitting by improving how the model is built and trained.

We can reduce overfitting by:
1. Adding more data
2. Reducing complexity of NN arch
3. Regularization (adding penalty term in loss function, so our model aims to balance that penalty as well, reducing overfitting) (we can do this here)
4. Dropout (turning off random neurons during forward propagation) (can do this as well)
5. Data Augmentation (modifying our inputs by little, applying transformations to increase variations, for CNN arch usually)
6. Batch Normalization (can be used)
7. Early Stopping (stopping training in earlier epochs)


So, here we'll try
- Regularization
- Dropouts
- Batch Normalization

### Dropouts
Dropout is typically applied to hidden layers, after the ReLU activation function.
\
We specify via `p` the fraction of neurons to turn off (each neuron is dropped with probability `p`).\
Dropout is not used during evaluation.


### Batch Normalization
Improves training stability by normalizing each layer's activations, whose distribution keeps shifting as the parameters change. \
- applied to hidden layers
- applied after linear layers and before activation functions
- normalizes activations
- includes learnable parameters (gamma and beta for scaling and shifting)
- reduces internal covariate shift, stabilizing training process
- regularization effect as statistics computation adds noise to the training process
- during evaluation, batchNorm uses the running mean and variance accumulated during training

### L2 Regularization
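- Configured on the optimizer (via its `weight_decay` argument). \
A penalty term is added to the original loss function. L2 uses a squared penalty term ($w^2$); L1 uses the absolute value ($|w|$) only. \
This reduces overfitting by penalizing large weights, which pushes the model toward smaller weights and a simpler, smoother function.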
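- Conventionally applied only to weights, not biases (note that PyTorch's `weight_decay` decays every parameter handed to the optimizer, biases included). \
$\text{Loss}_{\text{reg}} = \text{Loss} + \lambda \sum w^2$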
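- Directly modifies the gradient update rule by subtracting an extra decay term that shrinks the weights (the factor 2 from differentiating $w^2$ is absorbed into $\lambda$). \
$w \leftarrow w - \eta\,(\nabla \text{Loss} + \lambda w)$
\
This keeps the model from relying too heavily on any single neuron's weights.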
\
- The value of λ controls how strong this effect is.
- It only takes effect during training.

### Implementation (Dropouts and BatchNorm)

In [21]:
class MyNN(nn.Module):
    """Fully connected classifier regularised with BatchNorm and Dropout.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout
    (num_features -> 128 -> 64); a final Linear layer returns 10 raw class scores.

    Args:
        num_features: Number of input features per sample.
        dropout_p: Probability of zeroing each hidden activation during training.
            Defaults to 0.3, the value previously hard-coded in both Dropout layers.
    """

    def __init__(self, num_features, dropout_p=0.3):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(num_features, 128),

            nn.BatchNorm1d(128),     # argument is the number of features coming out of the previous Linear layer

            nn.ReLU(),

            nn.Dropout(p=dropout_p),     # fraction of neurons to turn off during training

            nn.Linear(128, 64),

            nn.BatchNorm1d(64),       # same here: matches the 64 outputs of the Linear layer above

            nn.ReLU(),

            nn.Dropout(p=dropout_p),     # same dropout rate for the second hidden layer

            nn.Linear(64, 10)
        )

    def forward(self, x):
        """Return the raw class scores for a batch ``x``."""
        return self.model(x)

# Hyperparameters for this model: learning rate for the optimizer, and number of training epochs.
learning_rate = 0.1
epochs = 100

### Implementation (L2 Regularization)

In [22]:
# Fresh network (input width = number of feature columns), moved onto the selected device.
model = MyNN(X_train.shape[1]).to(device)

# Cross-entropy loss computed from the model's raw class scores.
criterion = nn.CrossEntropyLoss()

# SGD with L2 regularisation: passing weight_decay is all that is needed to enable it.
# Note: the decay applies to every tensor in model.parameters(), biases included.
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-4,
)
