In [57]:
# we will only import torch, torch.nn and math
import torch 
import torch.nn as nn
import math

In [58]:

# write a softmax function
def softmax(tensor , dim):
    """Numerically stable softmax of `tensor` along dimension `dim`.

    The maximum along `dim` is subtracted before exponentiating, so the
    largest exponent is exp(0) = 1. The shift cancels in the ratio and
    therefore does not change the result.

    Args:
        tensor: torch.Tensor of raw scores.
        dim: dimension along which the values are normalised.

    Returns:
        Tensor of the same shape as `tensor`; for finite inputs the entries
        along `dim` are non-negative and sum to 1.
    """
    # torch.max with `dim` returns (values, indices); only the values matter.
    peak, _ = torch.max(tensor, dim=dim, keepdim=True)
    exps = (tensor - peak).exp()
    return exps / exps.sum(dim=dim, keepdim=True)

In [59]:
#loss function
def crossEntropy(S , y_true):
    """Mean negative log-likelihood of the true classes.

    Args:
        S: tensor of probabilities, indexed as S[i, y_true[i]], so its first
            dimension is the batch.
        y_true: integer class indices, one per row of `S`.

    Returns:
        Scalar tensor: the mean over the batch of -log(S[i, y_true[i]]).
    """
    rows = torch.arange(S.shape[0])
    # Clamp into [1e-12, 1] first so log() never sees an exact 0.
    picked = S.clamp(min=1e-12, max=1.0)[rows, y_true]
    return picked.log().neg().mean()


In [81]:
# Build a one-hot row by hand: start from all zeros, then switch on the
# entry selected by the class index.
y = torch.tensor([2])      # index of the "hot" class
s = torch.zeros((1, 3))    # one row, three classes
s[0, y] = 1.0
s

tensor([[0., 0., 1.]])

In [74]:
import torch

# Single-layer softmax classifier trained with hand-derived gradients.
# Fixed seed: the weights are drawn with torch.randn, so without it every
# run of this cell prints a different loss curve.
torch.manual_seed(0)

input_dim = 3
output_dim = 2
lr = 0.1

dummy = torch.tensor([[2.0 , 3.0 , 4.0]])  # one sample, [1, 3]
y_true = torch.tensor([1])                 # its class index, [1]

# Gradients are computed manually below, so autograd tracking stays off.
weights = torch.randn(input_dim , output_dim , requires_grad = False)  # [3, 2]
bias = torch.zeros(output_dim)  # [2]

# z = xW + b, where x is the data
for epoch in range(50):
    z = torch.matmul(dummy , weights) + bias  # logits, [1, 2]

    # Stable softmax: shift each row by its own maximum (not the global one)
    # so the computation stays correct and stable if more rows are added.
    z_max = torch.max(z , dim = -1 , keepdim = True).values
    exp_z = torch.exp(z - z_max)
    S = exp_z / torch.sum(exp_z, dim=-1, keepdim=True)  # probabilities, [1, 2]

    # Cross-entropy of the single sample: -log p(true class).
    p_true = S[0 , y_true]
    loss = -torch.log(p_true)

    y_hot = torch.zeros_like(S)
    y_hot[0 , y_true] = 1.0

    # For softmax + cross-entropy, dL/dz = S - one_hot(y).
    dz = S - y_hot

    w_ = torch.matmul(dummy.t() , dz)  # dL/dW, [3, 2]
    b_ = dz.sum(dim = 0)               # dL/db, [2]

    # Plain gradient-descent step.
    weights = weights - lr * w_
    bias = bias - lr * b_

    print(f"loss for epoch {epoch} | {loss.item()}")

print(f"final probs {S}")


loss for epoch 0 | 0.0720861479640007
loss for epoch 1 | 0.04807163029909134
loss for epoch 2 | 0.0364856943488121
loss for epoch 3 | 0.02953125163912773
loss for epoch 4 | 0.024858413264155388
loss for epoch 5 | 0.021489759907126427
loss for epoch 6 | 0.018940292298793793
loss for epoch 7 | 0.0169407669454813
loss for epoch 8 | 0.015328949317336082
loss for epoch 9 | 0.014001118019223213
loss for epoch 10 | 0.012887735851109028
loss for epoch 11 | 0.011940314434468746
loss for epoch 12 | 0.011124026961624622
loss for epoch 13 | 0.010413378477096558
loss for epoch 14 | 0.009788942523300648
loss for epoch 15 | 0.009235760197043419
loss for epoch 16 | 0.008742287755012512
loss for epoch 17 | 0.008299242705106735
loss for epoch 18 | 0.007899348624050617
loss for epoch 19 | 0.007536426652222872
loss for epoch 20 | 0.007205695379525423
loss for epoch 21 | 0.006902921479195356
loss for epoch 22 | 0.006624658592045307
loss for epoch 23 | 0.0063682482577860355
loss for epoch 24 | 0.00613091420

In [4]:
import torch
import torch.nn.functional as F

# Network architecture
# Two-layer network (linear -> ReLU -> linear -> softmax) trained on a single
# sample with hand-derived backpropagation.
# Fixed seed: W1 and W2 are drawn with torch.randn, so without it the printed
# losses differ on every run (and a lucky draw can start already converged).
torch.manual_seed(0)

# Network architecture
input_dim = 3
hidden_dim = 4  # neurons in hidden layer
output_dim = 2
lr = 0.1

# Data
dummy = torch.tensor([[2.0, 3.0, 4.0]])  # [1, 3]
y_true = torch.tensor([1])  # [1]

# Initialize weights and biases for both layers.
# Gradients are computed manually below, so autograd tracking stays off.
W1 = torch.randn(input_dim, hidden_dim, requires_grad=False)  # [3, 4]
b1 = torch.zeros(hidden_dim)  # [4]

W2 = torch.randn(hidden_dim, output_dim, requires_grad=False)  # [4, 2]
b2 = torch.zeros(output_dim)  # [2]


for epoch in range(100):
    # ---- forward pass ----
    # Layer 1: input -> hidden
    z1 = torch.matmul(dummy, W1) + b1  # [1, 4]
    h1 = F.relu(z1)  # [1, 4]

    # Layer 2: hidden -> output
    z2 = torch.matmul(h1, W2) + b2  # [1, 2]

    # Softmax, shifted by the per-row max for numerical stability
    z2_max = torch.max(z2, dim=-1, keepdim=True)[0]
    exp_z2 = torch.exp(z2 - z2_max)
    S = exp_z2 / torch.sum(exp_z2, dim=-1, keepdim=True)  # [1, 2]

    # Loss: cross-entropy of the single sample, -log p(true class)
    loss = -torch.log(S[0, y_true])

    # ---- backward pass ----
    y_hot = torch.zeros_like(S)
    y_hot[0, y_true] = 1.0

    # For softmax + cross-entropy, dL/dz2 = S - one_hot(y).
    dz2 = S - y_hot  # [1, 2]

    dW2 = torch.matmul(h1.T, dz2)  # [4, 1] @ [1, 2] = [4, 2]
    db2 = dz2.sum(dim=0)  # [2]

    # Uses W2 from before this epoch's update, so it must precede the step.
    dh1 = torch.matmul(dz2, W2.T)  # [1, 2] @ [2, 4] = [1, 4]

    # ReLU passes gradient only where its input was positive.
    dz1 = dh1.clone()
    dz1[z1 <= 0] = 0

    dW1 = torch.matmul(dummy.T, dz1)  # [3, 1] @ [1, 4] = [3, 4]
    db1 = dz1.sum(dim=0)  # [4]

    # ---- gradient-descent step (in place) ----
    W2 -= lr * dW2
    b2 -= lr * db2
    W1 -= lr * dW1
    b1 -= lr * db1

    if epoch % 20 == 0:
        print(f"Epoch {epoch:3d} | Loss: {loss.item():.6f} | Probs: {S[0].detach().numpy()}")

print("="*50)
print(f"Final probabilities: {S[0].detach().numpy()}")
print(f"Sum: {S.sum().item():.2f}")

Epoch   0 | Loss: 0.000001 | Probs: [9.7624911e-07 9.9999905e-01]
Epoch  20 | Loss: 0.000001 | Probs: [9.7578834e-07 9.9999905e-01]
Epoch  40 | Loss: 0.000001 | Probs: [9.7534735e-07 9.9999905e-01]
Epoch  60 | Loss: 0.000001 | Probs: [9.7491215e-07 9.9999905e-01]
Epoch  80 | Loss: 0.000001 | Probs: [9.7447708e-07 9.9999905e-01]
Final probabilities: [9.7406269e-07 9.9999905e-01]
Sum: 1.00


In [1]:
import torch
import torch.nn.functional as F

# Two-layer network (linear -> ReLU -> linear -> softmax) trained on a single
# sample with hand-derived backpropagation.
# Fixed seed: W1 and W2 are drawn with torch.randn, so without it the printed
# loss curve differs on every run.
torch.manual_seed(0)

input_dim = 3
hidden_dim = 4
output_dim = 2
lr = 0.1

# Data
dummy = torch.tensor([[2.0, 3.0, 4.0]])  # [1, 3]
y_true = torch.tensor([1])  # [1]

# Gradients are computed manually below, so autograd tracking stays off.
W1 = torch.randn(input_dim, hidden_dim, requires_grad=False)  # [3, 4]
b1 = torch.zeros(hidden_dim)  # [4]

W2 = torch.randn(hidden_dim, output_dim, requires_grad=False)  # [4, 2]
b2 = torch.zeros(output_dim)  # [2]

# NOTE(review): with unnormalised inputs and lr = 0.1, one update can push
# every entry of z1 below zero; from then on h1 is all zeros and only b2
# keeps learning. Worth confirming by inspecting h1 if the loss plateaus.
for epoch in range(100):
    # ---- forward pass ----
    z1 = torch.matmul(dummy , W1) + b1 # [1 , 4]
    h1 = F.relu(z1)

    z = torch.matmul(h1 , W2) + b2 # [1 , 2]

    # Stable softmax: shift each row by its own maximum (not the global one)
    # so the computation stays correct and stable if more rows are added.
    z_max = torch.max(z , dim = -1 , keepdim = True).values
    zexp = torch.exp(z - z_max)
    sum_ = torch.sum(zexp , dim = -1 , keepdim = True)

    S = zexp / sum_ #[1 , 2]

    # Cross-entropy of the single sample: -log p(true class).
    loss = -torch.log(S[0 , y_true])

    # ---- backward pass ----
    y_hot = torch.zeros_like(S)
    y_hot[0 , y_true] = 1.0

    # For softmax + cross-entropy, dL/dz = S - one_hot(y).
    dz2 = S - y_hot # [1 , 2]

    dw2 = torch.matmul(h1.T , dz2) # [4 , 2]
    db2 = dz2.sum(dim = 0) # [2]

    # Uses W2 from before this epoch's update, so it must precede the step.
    dh1 = torch.matmul(dz2 , W2.T) # [1 , 4]

    # ReLU passes gradient only where its input was positive. Mask a copy so
    # dh1 keeps meaning "gradient w.r.t. h1" and dz1 "gradient w.r.t. z1".
    dz1 = dh1.clone()
    dz1[z1 <= 0] = 0

    dw1 = torch.matmul(dummy.T , dz1) # [3 , 4]
    db1 = dz1.sum(dim = 0) # [4]

    # ---- gradient-descent step (in place) ----
    W1 -= lr * dw1
    b1 -= lr * db1
    W2 -= lr * dw2
    b2 -= lr * db2

    print(f"loss {loss.item()}")

S

loss 7.105299472808838
loss 0.5982127785682678
loss 0.5586756467819214
loss 0.5229262113571167
loss 0.4905574917793274
loss 0.4612022936344147
loss 0.4345317780971527
loss 0.41025298833847046
loss 0.388106107711792
loss 0.3678610622882843
loss 0.34931495785713196
loss 0.3322887122631073
loss 0.31662413477897644
loss 0.3021819293498993
loss 0.2888389825820923
loss 0.27648669481277466
loss 0.26502883434295654
loss 0.2543801963329315
loss 0.2444651871919632
loss 0.23521646857261658
loss 0.22657445073127747
loss 0.218485489487648
loss 0.210902139544487
loss 0.20378142595291138
loss 0.1970851868391037
loss 0.19077888131141663
loss 0.18483155965805054
loss 0.17921511828899384
loss 0.17390449345111847
loss 0.16887637972831726
loss 0.16411027312278748
loss 0.15958721935749054
loss 0.15529008209705353
loss 0.15120309591293335
loss 0.1473119556903839
loss 0.14360356330871582
loss 0.14006587862968445
loss 0.13668812811374664
loss 0.13345980644226074
loss 0.13037189841270447
loss 0.127415627241134

tensor([[0.0517, 0.9483]])