In [41]:
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
# input  data
X = torch.tensor([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
y = torch.tensor([0.0, 1.0, 1.0, 0.0])  # XOR Problem


In [11]:
# define the model
class LinearModel(nn.Module):
  def __init__(self, in_features, out_features):
    super(LinearModel, self).__init__()
    self.linear = nn.Linear(in_features, out_features)

  def forward(self, x):
    return self.linear(x)

# instantiate the model
model = LinearModel(2, 1)

In [12]:
# randomized by default, right?
output = model(X[0])
print(output)

tensor([-0.2910], grad_fn=<ViewBackward0>)


In [76]:
class BinaryClassifer(nn.Module):
  def __init__(self, hidden_layer=4):
    super(BinaryClassifer, self).__init__()
    self.hidden_layer = hidden_layer
    self.linear = nn.Linear(2, hidden_layer)
    self.hidden = nn.Linear(self.hidden_layer, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    output = self.linear(x)
    output = torch.relu(output) # ReLU helps a lot.
    # Rectifier! Activator!
    output = self.hidden(output)
    output = self.sigmoid(output)
    return output

binary_model = BinaryClassifer()
bc_output = binary_model(X[0])
bc_output

tensor([0.5795], grad_fn=<SigmoidBackward0>)

In [77]:
# my two main questions
# how to write the loss function, checking out the `calc_loss_batch` function
# I think we can make our not batches a batch by doing `.unsqueeze(0)` or variation
# loss = torch.nn.functional.cross_entropy(select_logits, target_batch)
# which means we need the selected_logits and the target_batch

# iterate the samples
# for sample, label in zip(X, y):
#   label = label.unsqueeze(0)
#   print(sample)
#   print(label)
#   output = binary_model(sample)
#   print(output)
#   loss = torch.nn.functional.binary_cross_entropy(output, label)
#   # OR BCELoss
#   print(loss)



In [78]:
# how to write the training loop

# Hyperparameters
learning_rate = 0.01

optimizer = optim.SGD(binary_model.parameters(), lr=learning_rate)
epochs = 1000
# can we just drop an Adam in here?
# optimizer = optim.Adam(binary_model.parameters(), lr=learning_rate)
# epochs = 1000
# optimizer = optim.AdamW(binary_model.parameters(), lr=learning_rate)
# epochs = 1000

# loss_function = nn.BCELoss()

# Training Loop!
for epoch in range(epochs):
  epoch_loss = 0.0
  for sample, label in zip(X, y):
    sample = sample #.unsqueeze(0)
    label = label.unsqueeze(0)

    # Forward pass
    output = binary_model(sample)

    # Compute loss
    loss = torch.nn.functional.binary_cross_entropy(output, label)

    # Backward pass
    optimizer.zero_grad() # 0 the grads out
    loss.backward()
    optimizer.step()

    # Accumlate loss
    epoch_loss += loss.item()

  if (epoch + 1) % 100 == 0:
  # if (epoch + 1) % 10 == 0:
    print(f'Epoch {epoch + 1} loss: {epoch_loss:.4f}')

Epoch 100 loss: 2.7916
Epoch 200 loss: 2.7866
Epoch 300 loss: 2.7845
Epoch 400 loss: 2.7824
Epoch 500 loss: 2.7786
Epoch 600 loss: 2.7729
Epoch 700 loss: 2.7625
Epoch 800 loss: 2.7430
Epoch 900 loss: 2.7057
Epoch 1000 loss: 2.6530


In [75]:
# Test the model
with torch.no_grad():
  for sample, label in zip(X, y):
    output = binary_model(sample)
    prediction = (output > 0.5).float()
    print(f"{prediction.item()=}")

prediction.item()=0.0
prediction.item()=1.0
prediction.item()=1.0
prediction.item()=0.0


In [57]:
# cool, that's pretty neat
# whoa, changed optimizer to Adam and got convergence much faster
# ah, I needed to re-create the model
# it was using SGD's work.
# okay after a reset it was at least still 5x faster.
# result: 0.0434 loss after 1000 epochs with AdamW
# result: 0.0909 loss after 1000 epochs with Adam
# result: 2.6530 loss after 1000 epochs with SGD

# I've read Adam models take up more memory, because they hold two new parameters for each parameter
# that already exists, but boy they are fast and efficient on this super small dataset.
