# Barebones Implementation

In [1]:
import pandas as pd
import torch
from torch import Tensor

In [2]:
# Load the Iris CSV and discard its "Id" column before any further processing.
iris_csv_path = "./data/IRIS.csv"
data = pd.read_csv(iris_csv_path).drop(columns="Id")

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Prepare Data

In [4]:
# Give every species name an integer class id, numbered in order of first appearance.
species_names = data["Species"].unique()
target_map = {name: class_id for class_id, name in enumerate(species_names)}

# Split the frame into the feature matrix and the integer-encoded labels.
X_numpy = data.drop(columns="Species").values
y_numpy = data["Species"].map(target_map).values

# Features are stored as float32; labels keep the integer dtype produced by the mapping.
X = torch.tensor(X_numpy, dtype=torch.float32)
y = torch.tensor(y_numpy)

### One-Hot-Encode Class Labels

In [5]:
def one_hot_encode(vector, n_classes):
    """Convert a 1-D tensor of class indices into a one-hot matrix.

    Args:
        vector: One-dimensional tensor of class indices. Values are cast to
            int64 and are used as column positions, so they must lie in
            ``[0, n_classes)``.
        n_classes: Number of columns in the result.

    Returns:
        An int64 tensor of shape ``(vector.shape[0], n_classes)`` on the same
        device as ``vector``, holding a single 1 per row.

    Raises:
        ValueError: If ``vector`` is not one-dimensional.
    """
    if vector.dim() != 1:
        raise ValueError(
            "one_hot_encode expects a one-dimensional tensor, got shape {}".format(tuple(vector.shape))
        )
    # Build the index and the zero matrix on the input's own device instead of
    # forcing everything through a CPU LongTensor.
    indices = vector.to(torch.long).unsqueeze(1)
    one_hot = torch.zeros((vector.shape[0], n_classes), dtype=torch.long, device=vector.device)
    # Write a 1 into column `indices[row]` of every row.
    return one_hot.scatter(1, indices, 1)

# Derive the class count from the label mapping instead of hard-coding it,
# so the encoding stays correct if the set of species in the data changes.
y_one_hot = one_hot_encode(y, len(target_map))

### Initialize Model Parameters

In [6]:
# Seed PyTorch's global random generator so the torch.rand parameter
# initialisations that follow are repeatable on a fresh run.
torch.manual_seed(-999)

<torch._C.Generator at 0x124edcb10>

In [7]:
# Uniform-random starting parameters: a 4x3 weight matrix and a length-3 bias.
w = torch.rand(4, 3)
b = torch.rand((3,))

### Define Softmax Activation and Cross Entropy Loss Functions

In [8]:
def softmax_activation(z: Tensor) -> Tensor:
    """Row-wise softmax of a 2-D tensor of logits.

    Args:
        z: Tensor of shape ``(n_rows, n_classes)``.

    Returns:
        Tensor of the same shape whose rows are non-negative and sum to 1.
    """
    # Softmax is unchanged by subtracting a per-row constant. Shifting by the
    # row maximum keeps every exponent <= 0, so torch.exp cannot overflow to
    # inf (which would otherwise produce inf / inf = NaN for large logits).
    # The shift is detached because, being a no-op mathematically, it
    # contributes nothing to the gradient.
    row_max: Tensor = z.max(dim=1, keepdim=True).values.detach()
    exponentials: Tensor = torch.exp(z - row_max)
    exponentials_row_sums: Tensor = exponentials.sum(dim=1, keepdim=True)
    return exponentials / exponentials_row_sums

def cross_entropy_loss(targets: Tensor, activations: Tensor, eps: float = 1e-12) -> Tensor:
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    Args:
        targets: One-hot matrix with one row per sample.
        activations: Predicted probabilities, same shape as ``targets``.
        eps: Lower bound applied to the true-class probability before the
            logarithm. Probabilities above ``eps`` are unaffected.

    Returns:
        Scalar tensor: the mean over rows of ``-log(p_true_class)``.
    """
    # Multiplying by the one-hot targets and summing picks out the probability
    # assigned to the true class of each row.
    true_class_probabilities = torch.sum(targets * activations, dim=1)
    # A probability that underflows to exactly 0 would give log(0) = -inf and
    # NaN gradients; clamping keeps both the loss and its gradient finite.
    safe_probabilities = torch.clamp(true_class_probabilities, min=eps)
    return torch.mean(-torch.log(safe_probabilities))

### Simple Training Loop

In [9]:
n_iterations = 100
learning_rate = 0.1
n_samples = X.shape[0]

for i in range(1, n_iterations + 1):
    # Forward pass: logits -> class probabilities -> scalar loss.
    Z = X @ w + b
    predictions = softmax_activation(Z)
    loss = cross_entropy_loss(y_one_hot, predictions)

    # Hand-written gradients, both built from the same (target - prediction) residuals.
    residuals = y_one_hot - predictions
    w_gradients = -(X.t() @ residuals) / n_samples
    b_gradients = -residuals.mean(dim=0)

    # Plain gradient-descent step, updating the parameters in place.
    w -= learning_rate * w_gradients
    b -= learning_rate * b_gradients

    # Report progress every tenth iteration.
    if not i % 10:
        print(f"Loss at iteration {i}: {loss}")
    

Loss at iteration 10: 0.6981450319290161
Loss at iteration 20: 0.6961764693260193
Loss at iteration 30: 0.6425224542617798
Loss at iteration 40: 0.602511465549469
Loss at iteration 50: 0.5691211223602295
Loss at iteration 60: 0.5393685698509216
Loss at iteration 70: 0.5117704272270203
Loss at iteration 80: 0.48551255464553833
Loss at iteration 90: 0.4601267874240875
Loss at iteration 100: 0.4353489577770233


# Free Differentiation with PyTorch's Autograd

In [10]:
# Fresh random parameters, flagged so autograd records operations on them.
w_autograd = torch.rand(4, 3, requires_grad=True)
b_autograd = torch.rand((3,), requires_grad=True)

In [11]:
# One forward pass followed by backward(), which fills in .grad on the
# parameters that were created with requires_grad=True.
Z = X @ w_autograd + b_autograd
predictions = softmax_activation(Z)
loss = cross_entropy_loss(y_one_hot, predictions)
loss.backward()

In [12]:
# Gradient of the loss w.r.t. the weights, as populated by loss.backward().
w_autograd.grad

tensor([[-0.5716,  1.4152, -0.8436],
        [-0.5902,  0.8152, -0.2250],
        [ 0.2740,  0.8346, -1.1086],
        [ 0.1701,  0.2812, -0.4513]])

In [13]:
# Same hand-written weight-gradient formula as in the barebones loop, evaluated
# on detached predictions for comparison with w_autograd.grad.
-(X.t() @ (y_one_hot - predictions.detach())) / X.shape[0]

tensor([[-0.5716,  1.4152, -0.8436],
        [-0.5902,  0.8152, -0.2250],
        [ 0.2740,  0.8346, -1.1086],
        [ 0.1701,  0.2812, -0.4513]])

In [14]:
n_iterations = 100
learning_rate = 0.1
for i in range(1, n_iterations + 1):
    # backward() accumulates into .grad, so clear any gradients left over from
    # the previous pass before computing new ones.
    for parameter in (w_autograd, b_autograd):
        if parameter.grad is not None:
            parameter.grad.zero_()

    # Forward pass and backpropagation.
    Z = X @ w_autograd + b_autograd
    predictions = softmax_activation(Z)
    loss = cross_entropy_loss(y_one_hot, predictions)
    loss.backward()

    # Apply the update outside of autograd tracking so the in-place change to
    # the parameters is not recorded in the graph.
    with torch.no_grad():
        w_autograd -= learning_rate * w_autograd.grad
        b_autograd -= learning_rate * b_autograd.grad

    # Report progress every tenth iteration.
    if not i % 10:
        print(f"Loss at iteration {i}: {loss}")

Loss at iteration 10: 0.954537034034729
Loss at iteration 20: 0.7833609580993652
Loss at iteration 30: 0.7053467035293579
Loss at iteration 40: 0.6537990570068359
Loss at iteration 50: 0.6124182343482971
Loss at iteration 60: 0.5758307576179504
Loss at iteration 70: 0.5418170690536499
Loss at iteration 80: 0.5093513131141663
Loss at iteration 90: 0.47795018553733826
Loss at iteration 100: 0.4474251866340637


## Fight Overfitting with Regularization

In [15]:
# Another independent set of random, autograd-tracked parameters for the regularised model.
w_regularized = torch.rand(4, 3, requires_grad=True)
b_regularized = torch.rand((3,), requires_grad=True)

In [16]:
l = 0.5

Z = torch.mm(X, w_regularized) + b_regularized
predictions = softmax_activation(Z)
loss = cross_entropy_loss(y_one_hot, predictions) - l * torch.sum(w_regularized ** 2)
loss.backward()

In [17]:
w_regularized.grad

tensor([[-1.7040,  2.4630, -2.3124],
        [-1.2814,  1.1602, -1.3662],
        [-1.2611,  0.9958, -1.9769],
        [-0.5951, -0.2039, -1.1867]])

In [18]:
-torch.mm(X.transpose(0, 1), y_one_hot - predictions) / X.shape[0] - (2 * l * w_regularized)

tensor([[-1.7040,  2.4630, -2.3124],
        [-1.2814,  1.1602, -1.3662],
        [-1.2611,  0.9958, -1.9769],
        [-0.5951, -0.2039, -1.1867]], grad_fn=<SubBackward0>)

# PyTorch Neural Network Module

In [19]:
# A single fully connected layer mapping 4 input features to 3 output logits,
# wrapped in a Sequential container.
linear_layer = torch.nn.Linear(in_features=4, out_features=3)
model = torch.nn.Sequential(linear_layer)

In [20]:
# Stochastic-gradient-descent optimizer over the model's parameters, using a
# learning rate of 0.1.
sgd_optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [21]:
# NOTE(review): this rebinds the name `cross_entropy_loss`, replacing the
# function of the same name defined earlier in the notebook. The training loop
# below calls it with the raw model outputs first and the integer labels `y`
# second, whereas the earlier function was called with one-hot targets first
# and probabilities second. Re-running any earlier cell after this one would
# therefore hand it the wrong callable; a distinct name would avoid that.
cross_entropy_loss = torch.nn.CrossEntropyLoss()

In [22]:
n_iterations = 100
for i in range(1, n_iterations + 1):
    Z = model(X)
    loss = cross_entropy_loss(Z, y)
    sgd_optimizer.zero_grad()
    loss.backward()
    sgd_optimizer.step()
    
    if i % 10 == 0:
        print("Loss at iteration {}: {}".format(i, loss))

Loss at iteration 10: 1.0347145795822144
Loss at iteration 20: 0.8628233671188354
Loss at iteration 30: 0.779863178730011
Loss at iteration 40: 0.7254924178123474
Loss at iteration 50: 0.6824817657470703
Loss at iteration 60: 0.6448083519935608
Loss at iteration 70: 0.6099228858947754
Loss at iteration 80: 0.5766125321388245
Loss at iteration 90: 0.5442736148834229
Loss at iteration 100: 0.5126192569732666


In [23]:
# Turn the model's raw outputs into per-row class probabilities and show the first ten rows.
class_probabilities = torch.softmax(model(X), dim=1)
class_probabilities[:10]

tensor([[0.8972, 0.1016, 0.0012],
        [0.8358, 0.1614, 0.0028],
        [0.8750, 0.1228, 0.0022],
        [0.8312, 0.1648, 0.0040],
        [0.9060, 0.0928, 0.0012],
        [0.8905, 0.1082, 0.0013],
        [0.8759, 0.1215, 0.0026],
        [0.8739, 0.1242, 0.0019],
        [0.8158, 0.1790, 0.0052],
        [0.8442, 0.1532, 0.0026]], grad_fn=<SliceBackward>)