# Introduction to PyTorch

## The Basics

In [64]:
# Install dependencies into the kernel's environment (uncomment to run).
# Note the -r flag: without it pip looks for a package literally named "requirements.txt".
# %pip install -r requirements.txt

### Tensor

In [65]:
from torch import tensor

In [66]:
# A 2-D tensor built from a nested list of floats: one inner list per row.
input_tensor = tensor(
    [
        [1.0, 2.0, 3.0],
        [4.0, 5.0, 6.0],
    ]
)
input_tensor

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [67]:
# shape reads as (rows, columns): 2 rows of 3 values each
input_tensor.shape

torch.Size([2, 3])

In [68]:
# Python float literals produce a 32-bit float tensor here
input_tensor.dtype

torch.float32

### Linear Layer

In [69]:
from torch import nn

In [70]:
# Linear layer mapping 3 input features to 2 output features (bias included by default).
linear_layer = nn.Linear(3, 2)
linear_layer

Linear(in_features=3, out_features=2, bias=True)

In [71]:
# Learnable weight matrix with one row per output feature: shape (out_features, in_features) = (2, 3).
# No seed is set, so the initial values differ from run to run.
linear_layer.weight

Parameter containing:
tensor([[ 0.1174, -0.2019,  0.2889],
        [ 0.1178,  0.1725,  0.0413]], requires_grad=True)

In [72]:
# Learnable bias vector with one entry per output feature (2 here)
linear_layer.bias

Parameter containing:
tensor([-0.5338, -0.4550], requires_grad=True)

In [73]:
# Calling the layer applies it to every input row: (2, 3) in -> (2, 2) out,
# computed as input @ weight.T + bias
output = linear_layer(input_tensor)
output

tensor([[0.0468, 0.1316],
        [0.6603, 1.1264]], grad_fn=<AddmmBackward0>)

### Stacking Layers

In [74]:
from torch import nn

In [75]:
# Layer widths: 3 input features -> 8 -> 4 -> 2 outputs
n_features = 3
n_output = 2

# Each layer's input width must equal the previous layer's output width.
model = nn.Sequential(
    nn.Linear(n_features, 8),
    nn.Linear(8, 4),
    nn.Linear(4, n_output),
)
model

Sequential(
  (0): Linear(in_features=3, out_features=8, bias=True)
  (1): Linear(in_features=8, out_features=4, bias=True)
  (2): Linear(in_features=4, out_features=2, bias=True)
)

### Model Parameters

In [76]:
from torch import nn

# A 3 -> 8 -> 4 -> 2 stack of linear layers whose parameters are counted below.
model = nn.Sequential(
    nn.Linear(in_features=3, out_features=8),
    nn.Linear(in_features=8, out_features=4),
    nn.Linear(in_features=4, out_features=2),
)

In [77]:
# Tally the learnable values in the model. model.parameters() yields the
# weight and bias tensors of each layer in turn.
total_params = 0
for parameter in model.parameters():
    count = parameter.numel()  # number of scalar entries in this tensor
    total_params += count
    print(count)
    print("=" * 50)

print(f"Total parameters: {total_params}")

24
8
32
4
8
2
Total parameters: 78


## Neural Networks Architecture

### Activation Layer: Sigmoid Function

In [78]:
from torch import tensor, nn

# One strongly positive and one strongly negative value, to show both ends of the sigmoid's range
input_tensor = tensor([[6.0, -6.0]])
input_tensor

tensor([[ 6., -6.]])

In [79]:
# Sigmoid maps each element into (0, 1): 6 lands near 1 and -6 lands near 0
sigmoid_layer = nn.Sigmoid()
sigmoid_layer(input_tensor)

tensor([[0.9975, 0.0025]])

### Activation Layer: Softmax Function

In [80]:
from torch import tensor, nn

# A single row holding three raw scores
input_tensor = tensor([[4.3, 6.1, 2.3]])
input_tensor

tensor([[4.3000, 6.1000, 2.3000]])

In [81]:
# Softmax over the last dimension (dim=-1) turns each row of scores into values that sum to 1;
# the largest score (6.1) receives the largest share
softmax_layer = nn.Softmax(dim=-1)
softmax_layer(input_tensor)

tensor([[0.1392, 0.8420, 0.0188]])

### Activation Layer: ReLU (Rectified Linear Unit)

In [82]:
from torch import tensor, nn

# Two positive values and one negative value, to show how ReLU treats each sign
input_tensor = tensor([[4.3, 6.1, -2.3]])
input_tensor

tensor([[ 4.3000,  6.1000, -2.3000]])

In [83]:
# ReLU leaves positive values unchanged and replaces negative values with 0
relu_layer = nn.ReLU()
relu_layer(input_tensor)

tensor([[4.3000, 6.1000, 0.0000]])

### Activation Layer: Leaky ReLU 

In [84]:
from torch import tensor, nn

# Two positive values and one negative value, to show how Leaky ReLU treats each sign
input_tensor = tensor([[4.3, 6.1, -2.3]])
input_tensor

tensor([[ 4.3000,  6.1000, -2.3000]])

In [85]:
# Leaky ReLU leaves positive values unchanged and multiplies negative ones by
# negative_slope instead of zeroing them: -2.3 * 0.05 = -0.115
leaky_relu = nn.LeakyReLU(negative_slope=0.05)
leaky_relu(input_tensor)

tensor([[ 4.3000,  6.1000, -0.1150]])

### Forward Pass: Binary Classification

In [86]:
from torch import tensor, nn, randn

# Creates a tensor of shape (5, 6) with random values from normal distribution
input_tensor = randn(5, 6)
input_tensor

tensor([[ 0.3187,  0.6542, -0.0515,  0.6265, -0.3443, -0.1625],
        [ 0.9483, -0.3478,  0.1829, -0.8966, -0.3599,  1.6161],
        [-0.4228,  0.2898, -0.7049, -0.1499, -1.1045, -0.3708],
        [ 0.0203, -0.1241,  0.5387, -1.5377, -0.1299, -1.4830],
        [-1.3510, -0.1652,  0.1479, -1.1891, -0.3713, -0.6106]])

In [87]:
# Binary classifier: 6 features -> 4 hidden units -> 1 score squashed into (0, 1)
model = nn.Sequential(
    nn.Linear(6, 4),  # hidden layer
    nn.Linear(4, 1),  # single score per sample
    nn.Sigmoid(),  # maps the score into (0, 1)
)

# Produces one value per input row
model(input_tensor)

tensor([[0.6425],
        [0.5920],
        [0.7075],
        [0.6084],
        [0.6185]], grad_fn=<SigmoidBackward0>)

### Forward Pass: Multi-class Classification

In [88]:
from torch import tensor, nn, randn

# Creates a tensor of shape (5, 6) with random values from a normal distribution:
# 5 rows, each with the 6 features the model below expects
input_tensor = randn(5, 6)
input_tensor

tensor([[ 0.7387,  0.6655, -1.1441, -0.9659,  0.7755,  0.1815],
        [ 0.0683,  0.0246, -1.2833, -0.5384,  0.4655,  0.6290],
        [ 0.3619, -0.3852, -1.3110, -1.4664, -0.1708,  0.3833],
        [-0.4381,  1.6959,  0.3629,  0.3464, -0.2951, -0.0422],
        [-0.9933,  0.7187, -0.1279, -0.3941,  2.1049, -0.2579]])

In [89]:
# Multi-class classifier: 6 features -> 4 hidden units -> one score per class
n_classes = 3

model = nn.Sequential(
    nn.Linear(6, 4),  # hidden layer
    nn.Linear(4, n_classes),  # one raw score per class
    nn.Softmax(dim=-1),  # normalizes each row of scores so it sums to 1
)

# Produces one row of class probabilities per input row
model(input_tensor)

tensor([[0.3391, 0.4129, 0.2480],
        [0.2888, 0.4268, 0.2845],
        [0.3270, 0.4240, 0.2490],
        [0.2667, 0.4556, 0.2778],
        [0.2401, 0.4674, 0.2926]], grad_fn=<SoftmaxBackward0>)

### Forward Pass: Regression

In [90]:
from torch import tensor, nn, randn

# Creates a tensor of shape (5, 6) with random values from a normal distribution:
# 5 rows, each with the 6 features the model below expects
input_tensor = randn(5, 6)
input_tensor

tensor([[ 0.9306, -1.1583, -0.1087,  0.9812,  0.9504,  0.2963],
        [ 0.7394, -1.4574,  0.3019,  0.3631,  0.1547, -0.2733],
        [-0.7157,  0.4555, -0.9758, -0.3009,  0.8302, -0.9867],
        [ 0.6476, -0.1697, -1.2056,  1.1932,  0.5764, -0.5991],
        [ 0.6867, -0.0507,  0.0548, -0.8879, -1.8169, -0.1939]])

In [91]:
# Regression model: 6 features -> 4 hidden units -> 1 value.
# There is no final activation, so the output is not restricted to a fixed range.
model = nn.Sequential(
    nn.Linear(6, 4),  # hidden layer
    nn.Linear(4, 1),  # single predicted value per sample
)

# Produces one prediction per input row
model(input_tensor)

tensor([[0.1639],
        [0.2963],
        [0.4162],
        [0.1337],
        [0.6436]], grad_fn=<AddmmBackward0>)

## Loss Functions

### Transforming Labels with One-hot Encoding

In [92]:
from torch import tensor
import torch.nn.functional as F

# One-hot encoding: each class index becomes a row of length num_classes
# with a single 1 at that index
F.one_hot(tensor([0, 1, 2]), num_classes=3)

tensor([[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]])

### Cross Entropy Loss

In [93]:
from torch import tensor, nn

# Raw (unnormalized) scores for one sample over three classes;
# class 1 has by far the highest score
y_hat = tensor([[-5.2, 4.6, 0.8]])
y_hat

tensor([[-5.2000,  4.6000,  0.8000]])

In [94]:
# True class index for the sample (class 0), followed by its one-hot form.
# F is torch.nn.functional, imported in the one-hot encoding cell above.
y = tensor([0])
one_hot_y = F.one_hot(y, num_classes=3)
one_hot_y

tensor([[1, 0, 0]])

In [95]:
# The raw scores are passed in directly, and the one-hot row serves as the target.
# one_hot returns integers, so both arguments are cast to float64 to give the loss
# floating-point inputs of the same dtype.
# The loss is large because the scores favour class 1 while the true class is 0.
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(y_hat.double(), one_hot_y.double())
loss

tensor(9.8222, dtype=torch.float64)

## Sample Dataset

#### Load sample dataset

In [96]:
import pandas as pd

# Load the animal dataset; the relative path is resolved against the current working directory
animals = pd.read_csv("animal_dataset.csv")
animals

Unnamed: 0,animal_name,hair,feathers,eggs,milk,predator,legs,tail,type
0,sparrow,0,1,1,0,0,2,1,0
1,eagle,0,1,1,0,1,2,1,0
2,cat,1,0,0,1,1,4,1,1
3,dog,1,0,0,1,0,4,1,1
4,lizard,0,0,1,0,1,4,1,2


#### Features

In [97]:
# Features: every column except the first (animal_name) and the last (type)
features = animals.iloc[:, 1:-1]
features

Unnamed: 0,hair,feathers,eggs,milk,predator,legs,tail
0,0,1,1,0,0,2,1
1,0,1,1,0,1,2,1
2,1,0,0,1,1,4,1
3,1,0,0,1,0,4,1
4,0,0,1,0,1,4,1


In [98]:
# Feature table as a NumPy array: one row per animal, one column per feature
X = features.to_numpy()
X

array([[0, 1, 1, 0, 0, 2, 1],
       [0, 1, 1, 0, 1, 2, 1],
       [1, 0, 0, 1, 1, 4, 1],
       [1, 0, 0, 1, 0, 4, 1],
       [0, 0, 1, 0, 1, 4, 1]])

#### Target Values

In [99]:
# Target: the last column (type), holding one integer class label per animal
target = animals.iloc[:, -1]
target

0    0
1    0
2    1
3    1
4    2
Name:  type, dtype: int64

In [100]:
# The same labels as a 1-D NumPy array
y = target.to_numpy()
y

array([0, 0, 1, 1, 2])

#### TensorDataset

In [101]:
from torch.utils.data import TensorDataset
from torch import tensor

# Pair each feature row with its label so the dataset can be indexed sample by sample
dataset = TensorDataset(tensor(X), tensor(y))
dataset

<torch.utils.data.dataset.TensorDataset at 0x1795c9460>

In [102]:
# Indexing the dataset returns the (features, label) pair for one sample
input_sample, label_sample = dataset[0]
print(f"input_sample: {input_sample}")
print(f"label_sample: {label_sample}")

input_sample: tensor([0, 1, 1, 0, 0, 2, 1])
label_sample: 0


#### DataLoader

In [103]:
from torch.utils.data import DataLoader

# Serve the dataset in batches of 2; shuffle=True randomizes the sample order.
# With 5 samples, the last batch holds a single sample.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [104]:
# Each iteration yields one batch: a 2-D tensor of feature rows and a 1-D tensor of matching labels
for batch_inputs, batch_labels in dataloader:
    print(f"batch_inputs: {batch_inputs}")
    print(f"batch_labels: {batch_labels}")
    print("=" * 50)

batch_inputs: tensor([[0, 1, 1, 0, 0, 2, 1],
        [0, 0, 1, 0, 1, 4, 1]])
batch_labels: tensor([0, 2])
batch_inputs: tensor([[1, 0, 0, 1, 1, 4, 1],
        [0, 1, 1, 0, 1, 2, 1]])
batch_labels: tensor([1, 0])
batch_inputs: tensor([[1, 0, 0, 1, 0, 4, 1]])
batch_labels: tensor([1])


## Training Loop

### Load Salary Dataset

In [105]:
import pandas as pd

# Load the salary dataset; the relative path is resolved against the current working directory
salary = pd.read_csv("salary_dataset.csv")
salary

Unnamed: 0,experience_level,employment_type,remote_ratio,company_size,salary_in_usd
0,0,0,0.5,1,0.036
1,1,0,1.0,2,0.133
2,2,0,0.0,1,0.234
3,1,0,1.0,0,0.076
4,2,0,1.0,1,0.17


### Features & Target

In [106]:
# Features: every column except the last one (salary_in_usd)
features = salary.iloc[:, :-1]
features

Unnamed: 0,experience_level,employment_type,remote_ratio,company_size
0,0,0,0.5,1
1,1,0,1.0,2
2,2,0,0.0,1
3,1,0,1.0,0
4,2,0,1.0,1


In [107]:
# Feature table as a NumPy array: one row per record, four feature columns
X = features.to_numpy()
X

array([[0. , 0. , 0.5, 1. ],
       [1. , 0. , 1. , 2. ],
       [2. , 0. , 0. , 1. ],
       [1. , 0. , 1. , 0. ],
       [2. , 0. , 1. , 1. ]])

In [108]:
# Target: the last column (salary_in_usd), a continuous value per record
target = salary.iloc[:, -1]
target

0    0.036
1    0.133
2    0.234
3    0.076
4    0.170
Name:  salary_in_usd, dtype: float64

In [109]:
# The same target values as a 1-D NumPy array
y = target.to_numpy()
y

array([0.036, 0.133, 0.234, 0.076, 0.17 ])

### Tensor Dataset & DataLoader

In [110]:
from torch.utils.data import TensorDataset, DataLoader
from torch import tensor

# Pair features with targets. The target column is float64 (see its dtype above);
# .float() converts both tensors to 32-bit floats.
dataset = TensorDataset(tensor(X).float(), tensor(y).float())
dataset

<torch.utils.data.dataset.TensorDataset at 0x134de7560>

In [111]:
# Shuffled batches of 4: with 5 samples, each pass yields one batch of 4 and one batch of 1
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

### Model, Loss Function & SGD (Stochastic Gradient Descent) Optimizer

In [112]:
from torch import nn, optim

# Two stacked linear layers: 4 input features -> 2 hidden units -> 1 predicted value
model = nn.Sequential(nn.Linear(4, 2), nn.Linear(2, 1))

# Loss Function a.k.a Criterion
# Mean squared error between predictions and targets
criterion = nn.MSELoss()

# learning rate controls the step size
# typical range between 0.01 and 0.0001
learning_rate = 0.001

# momentum helps escape local optima
# typical range between 0.85 and 0.99
# NOTE(review): 0.8 sits just below the range quoted above — confirm this is intentional
momentum = 0.8

# SGD updates the model's parameters using the settings above
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

### The Training Loop

In [113]:
# Number of full passes over the dataset
num_epochs = 10

for epoch in range(num_epochs):
    # Each item from the dataloader is a (features, targets) pair for one batch
    for data in dataloader:
        feature, target = data

        # Run the forward pass
        pred = model(feature)

        # Reshape target to match prediction shape:
        # the model's last layer has 1 output so pred is (batch, 1), while target is 1-D
        target = target.view(-1, 1)
        # Compute the loss
        loss = criterion(pred, target)

        # Backpropagation
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights
        optimizer.zero_grad()  # Reset gradients so they do not carry over into the next batch

## Layer Initialization & Transfer Learning

In [114]:
from torch import nn

# A freshly constructed layer: inspect the range of its default-initialized weights.
linear_layer = nn.Linear(in_features=64, out_features=128)

print(f"min weight: {linear_layer.weight.min()}")
print(f"max weight: {linear_layer.weight.max()}")

min weight: -0.12498833239078522
max weight: 0.12498781085014343


### Uniform Weight Initialization

In [115]:
from torch import nn

# Overwrite the default weights in place with samples from a uniform distribution
# on [0, 1] (the default bounds of uniform_, written out explicitly here).
linear_layer = nn.Linear(in_features=64, out_features=128)
nn.init.uniform_(linear_layer.weight, a=0.0, b=1.0)

print(f"min weight: {linear_layer.weight.min()}")
print(f"max weight: {linear_layer.weight.max()}")

min weight: 2.4318695068359375e-05
max weight: 0.9999579191207886


### Transfer Learning: Fine-tuning

In [116]:
import torch

# Stand-in for a pretrained model: build a small 256 -> 32 -> 64 -> 128 network and save it to disk.
# `nn` is not imported in this cell; it comes from an earlier `from torch import nn` cell.
some_pretrained_layer = nn.Sequential(
    nn.Linear(256, 32), nn.Linear(32, 64), nn.Linear(64, 128)
)
# The module object itself is passed to torch.save here, rather than its state_dict
torch.save(some_pretrained_layer, "some_pretrained_layer.pth")

#### Freeze layers of a pretrained model

In [117]:
import torch
from torch import nn

# The checkpoint holds a whole pickled nn.Sequential rather than a state_dict, so it
# must be loaded with weights_only=False: from PyTorch 2.6 the default is True, which
# rejects full-module pickles. Only do this for files you trust, because unpickling
# can execute arbitrary code. (The weights_only argument requires PyTorch >= 1.13.)
pretrained_layer = torch.load("some_pretrained_layer.pth", weights_only=False)

# Parameter names in a Sequential are "<layer index>.<weight|bias>".
# Freeze the weights of the first and second layers; their biases and the whole
# third layer stay trainable.
frozen_params = {"0.weight", "1.weight"}
for name, param in pretrained_layer.named_parameters():
    if name in frozen_params:
        # Exclude this parameter from gradient computation so it is not updated
        param.requires_grad = False

#### Add more layers of a pretrained model

In [118]:
# Use the loaded network as the first stage and append two new layers after it;
# the first new layer takes 128 inputs and the second ends at 64 outputs.
modified_layer = nn.Sequential(
    pretrained_layer,
    nn.Linear(128, 256),
    nn.Linear(256, 64),
)

## Evaluating Model Performance

In [119]:
import pandas as pd

# Reload the salary data and split it: every column but the last is an input
# feature, and the last column is the target.
salary = pd.read_csv("salary_dataset.csv")

features, target = salary.iloc[:, :-1], salary.iloc[:, -1]
X, y = features.to_numpy(), target.to_numpy()

In [120]:
from torch.utils.data import TensorDataset, DataLoader
from torch import tensor

# 32-bit float tensors wrapped as a dataset and served in shuffled batches of 4
dataset = TensorDataset(tensor(X).float(), tensor(y).float())
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [121]:
from torch import nn, optim

# Fresh regression model (4 features -> 2 hidden units -> 1 value), with a mean
# squared error loss and an SGD optimizer using momentum.
model = nn.Sequential(
    nn.Linear(in_features=4, out_features=2),
    nn.Linear(in_features=2, out_features=1),
)
criterion = nn.MSELoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.001,
    momentum=0.8,
)

### Training Loss & Epoch Loss

In [122]:
# Running sum of the per-batch losses over a single pass (one epoch) through the data
training_loss = 0.0

for feature, target in dataloader:
    # Run the forward pass
    pred = model(feature)

    # Reshape target to match prediction shape:
    # the model's last layer has 1 output so pred is (batch, 1), while target is 1-D
    target = target.view(-1, 1)
    # Compute the loss
    loss = criterion(pred, target)

    # Backpropagation
    loss.backward()  # Compute gradients
    optimizer.step()  # Update weights
    optimizer.zero_grad()  # Reset gradients

    # Calculate and sum the loss
    # (.item() extracts the loss as a plain Python number)
    training_loss += loss.item()

# len(dataloader) is the number of batches, so this is the mean of the per-batch losses
epoch_loss = training_loss / len(dataloader)
print(f"training loss: {training_loss}")
print(f"epoch loss: {epoch_loss}")

training loss: 1.037819266319275
epoch loss: 0.5189096331596375


### Validation Loss

In [123]:
# Set model into evaluation mode
model.eval()

# Running sum of the per-batch losses; no weights are updated in this cell
validation_loss = 0.0
# Gradients are not needed when only measuring the loss
with torch.no_grad():
    # NOTE: this demo iterates over the same dataloader used for training;
    # a real validation loss must be computed on held-out data.
    for feature, target in dataloader:
        # Run the forward pass
        pred = model(feature)

        # Reshape target to match prediction shape
        target = target.view(-1, 1)
        # Compute the loss
        loss = criterion(pred, target)

        # Calculate and sum the loss
        validation_loss += loss.item()

# len(dataloader) is the number of batches, so this is the mean of the per-batch losses
epoch_loss = validation_loss / len(dataloader)
# Label fixed: this value is the validation loss, not the training loss
print(f"validation loss: {validation_loss}")
print(f"epoch loss: {epoch_loss}")

# Set model back into training mode
model.train()

training loss: 0.9873130917549133
epoch loss: 0.49365654587745667


Sequential(
  (0): Linear(in_features=4, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=1, bias=True)
)

Indication of model overfitting:
- training loss decreasing
- validation loss increasing

### Using `torchmetrics`

In [124]:
import torchmetrics

# Initialize the metric
# (a metric object that is fed batch by batch and read out once at the end)
metric = torchmetrics.MeanSquaredError()

# Set model to evaluation mode
model.eval()

# Evaluate without computing gradients
with torch.no_grad():
    for feature, target in dataloader:
        # Get predictions
        pred = model(feature)

        # Reshape target to match prediction shape
        target = target.view(-1, 1)

        # Update metric with this batch's predictions and targets
        metric.update(pred, target)

# Compute final metric from everything passed to update() above
mse = metric.compute()
print(f"Mean Squared Error: {mse:.4f}")

# Reset metric for future use
metric.reset()

# Set model back to training mode
model.train()

Mean Squared Error: 0.4904


Sequential(
  (0): Linear(in_features=4, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=1, bias=True)
)

## Fighting Overfitting: Regularization

### Regularization with Dropout Layer

A dropout layer randomly zeroes out elements of the input tensor during training, preventing the model from becoming too dependent on specific features.

In [125]:
import torch
from torch import nn

# Linear -> ReLU -> Dropout. The model is in training mode by default, so dropout
# is active and zeroes each activation with probability p.
model = nn.Sequential(
    nn.Linear(in_features=8, out_features=4),
    nn.ReLU(),
    nn.Dropout(p=0.5),
)
features = torch.randn(1, 8)  # one sample with 8 features
predictions = model(features)
predictions

tensor([[0.0000, 0.0000, 0.1579, 0.2963]], grad_fn=<MulBackward0>)

### Regularization with Weight Decay

Weight Decay encourages smaller weights by adding a penalty during optimization. <br/>
It helps reduce overfitting by keeping weights smaller and improving generalization.

In [126]:
import torch
from torch import nn, optim

# Weight decay is configured on the optimizer, not on the model.
model = nn.Sequential(
    nn.Linear(in_features=8, out_features=4),
    nn.ReLU(),
)
optimizer = optim.SGD(
    model.parameters(),
    lr=0.001,
    weight_decay=0.0001,
)

features = torch.randn(1, 8)  # one sample with 8 features
predictions = model(features)
predictions

tensor([[0.2751, 0.3743, 0.0000, 0.0000]], grad_fn=<ReluBackward0>)