# Other findings for the day.
# We can now train a reasoning model using https://unsloth.ai/blog/r1-reasoning, with a single  7GB VRAM GPU

In [2]:
# Day 2: Training a simple neural net

import pandas as pd
import numpy as np

data=pd.read_csv("../Day1/bert_embeddings.csv")

data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,label
0,0.314854,-0.315803,0.482772,0.406596,0.651483,-0.514341,0.287409,0.361406,0.113588,-0.435126,...,-0.024764,-0.214943,-0.171342,0.197776,-0.088101,0.018636,0.041526,-0.015001,-0.348425,0
1,0.151626,-0.027887,0.407615,0.026858,0.002005,-0.376765,0.378584,0.799385,-0.485622,-0.581311,...,0.189612,-0.078242,-0.092983,0.178424,-0.132592,-0.271396,-0.030238,0.402832,-0.168941,0
2,0.530745,0.066995,0.319736,0.006169,0.396067,-0.457637,0.494592,0.412954,-0.056115,-0.468745,...,0.007627,-0.090966,-0.280942,-0.050126,-0.263792,-0.210276,-0.273926,0.368495,-0.379933,0
3,0.469785,-0.18799,-0.036101,-0.062842,0.632343,-0.298325,0.766535,0.998686,-0.244634,0.203218,...,-0.067121,-0.180987,-0.7038,0.086969,-0.030878,-0.199462,-0.478008,-0.176455,0.067534,1
4,0.206392,0.141666,0.001321,0.033523,0.189142,-0.165017,0.106561,0.311256,0.091679,-0.673464,...,0.270695,-0.048474,-0.252568,-0.305456,-0.062712,-0.016795,0.013259,0.275179,-0.087495,0


# Basic forward pass i.e actication(WX)+bias , it's just weights be learned during the process.

In [3]:
# A quick revision for the forward pass of the data.
import torch
x=torch.tensor([1,2,3])
#Everything that happens inside pytorch is done using tensors.
weights=torch.randn(3)
bias=torch.randn(1)

weighted_sum=torch.sum(x*weights)+bias
output=torch.relu(weighted_sum)

print("Input:", x)
print("Weights:", weights)
print("Bias:", bias)
print("Weighted sum:", weighted_sum)
print("Output after ReLU:", output)


Input: tensor([1, 2, 3])
Weights: tensor([-0.0696,  0.6583,  1.0663])
Bias: tensor([1.2171])
Weighted sum: tensor([5.6628])
Output after ReLU: tensor([5.6628])


In [4]:
import torch

# Input features
x = torch.tensor([1.0, 2.0, 3.0])

# Let's create a layer with 4 neurons
# Each neuron needs weights for each input (3 inputs in this case)
layer_weights = torch.randn(4, 3)  # 4 neurons x 3 inputs
layer_biases = torch.randn(4)      # 1 bias per neuron

# Forward pass through the layer
# matmul does the weighted sum for all neurons at once
weighted_sums = torch.matmul(layer_weights, x) + layer_biases
layer_outputs = torch.relu(weighted_sums)

print("Input:", x)
print("\nLayer weights (4 neurons, 3 weights each):\n", layer_weights)
print("\nLayer biases:", layer_biases)
print("\nOutputs from each neuron:", layer_outputs)

# Let's see what each neuron computed individually
for i in range(4):
    neuron_weights = layer_weights[i]
    neuron_bias = layer_biases[i]
    neuron_output = torch.relu(torch.sum(neuron_weights * x) + neuron_bias)
    print(f"\nNeuron {i+1} detailed computation:")
    print(f"Weights: {neuron_weights}")
    print(f"Bias: {neuron_bias:.4f}")
    print(f"Output: {neuron_output:.4f}")

Input: tensor([1., 2., 3.])

Layer weights (4 neurons, 3 weights each):
 tensor([[ 1.2774, -1.9667,  1.7751],
        [-0.6306, -0.9692, -1.5649],
        [ 0.1256, -0.2905,  1.3283],
        [-0.1064,  0.7906, -0.2822]])

Layer biases: tensor([ 1.0052, -0.4294,  0.7263,  0.4101])

Outputs from each neuron: tensor([3.6744, 0.0000, 4.2557, 1.0384])

Neuron 1 detailed computation:
Weights: tensor([ 1.2774, -1.9667,  1.7751])
Bias: 1.0052
Output: 3.6744

Neuron 2 detailed computation:
Weights: tensor([-0.6306, -0.9692, -1.5649])
Bias: -0.4294
Output: 0.0000

Neuron 3 detailed computation:
Weights: tensor([ 0.1256, -0.2905,  1.3283])
Bias: 0.7263
Output: 4.2557

Neuron 4 detailed computation:
Weights: tensor([-0.1064,  0.7906, -0.2822])
Bias: 0.4101
Output: 1.0384


In [12]:
# Setup X and Y
X=torch.tensor(data.iloc[:,:-1].values,dtype=torch.float32)
print(X.shape)
print(X)
y=torch.tensor(data.iloc[:,-1].values,dtype=torch.float32)
print(Y.shape)
print(Y)

torch.Size([179, 768])
tensor([[ 0.3149, -0.3158,  0.4828,  ...,  0.0415, -0.0150, -0.3484],
        [ 0.1516, -0.0279,  0.4076,  ..., -0.0302,  0.4028, -0.1689],
        [ 0.5307,  0.0670,  0.3197,  ..., -0.2739,  0.3685, -0.3799],
        ...,
        [ 0.4713, -0.2122,  0.2477,  ...,  0.0108,  0.0847, -0.3637],
        [ 0.1620, -0.3751,  0.4522,  ...,  0.0424, -0.0872, -0.0646],
        [ 0.1286, -0.4601,  0.3153,  ...,  0.1095,  0.1296, -0.1109]])
torch.Size([179])
tensor([0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1.,
        0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0.,
        1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
        0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0.,
        0., 0., 1., 1., 0., 1., 1., 1., 1., 0

In [10]:
import torch

# 1. Define network architecture
input_size = 768    # Size of BERT embeddings
hidden_size = 64    # Size of hidden layer
output_size = 1     # Binary classification
learning_rate = 0.01

# 2. Initialize weights and biases with correct shapes
# First layer (input → hidden)
weights1 = torch.randn(input_size, hidden_size) * 0.01  # Shape: (768, 64)
bias1 = torch.zeros(hidden_size)                        # Shape: (64)

# Second layer (hidden → output)
weights2 = torch.randn(hidden_size, output_size) * 0.01 # Shape: (64, 1)
bias2 = torch.zeros(output_size)                        # Shape: (1)

# Let's inspect our network parameters
print("Weights shapes:")
print("First layer:", weights1.shape)
print("Second layer:", weights2.shape)
print("\nBias shapes:")
print("First layer:", bias1.shape)
print("Second layer:", bias2.shape)

# 3. Test forward pass with one sample
sample = X[0]  # Take first sample
print("\nInput sample shape:", sample.shape)  # Should be (768,)

# Forward pass through first layer
# sample needs to be (768,) to multiply with weights1 (768, 64)
hidden = torch.matmul(sample, weights1) + bias1  # Result: (64,)
hidden_activated = torch.relu(hidden)
print("After first layer:", hidden_activated.shape)  # Should be (64,)

# Forward pass through second layer
# hidden_activated (64,) multiplies with weights2 (64, 1)
output = torch.matmul(hidden_activated, weights2) + bias2  # Result: (1,)
final_output = torch.sigmoid(output)
print("Final output shape:", final_output.shape)
print("Predicted probability:", final_output.item())

# Let's also test with a batch of samples
batch = X[:5]  # Take first 5 samples
print("\nTesting with batch:")
print("Input batch shape:", batch.shape)  # Should be (5, 768)

# Forward pass with batch
hidden_batch = torch.matmul(batch, weights1) + bias1  # Result: (5, 64)
hidden_activated_batch = torch.relu(hidden_batch)
print("After first layer:", hidden_activated_batch.shape)

output_batch = torch.matmul(hidden_activated_batch, weights2) + bias2  # Result: (5, 1)
final_output_batch = torch.sigmoid(output_batch)
print("Final output shape:", final_output_batch.shape)
print("Predicted probabilities for batch:", final_output_batch.squeeze())

Weights shapes:
First layer: torch.Size([768, 64])
Second layer: torch.Size([64, 1])

Bias shapes:
First layer: torch.Size([64])
Second layer: torch.Size([1])

Input sample shape: torch.Size([768])
After first layer: torch.Size([64])
Final output shape: torch.Size([1])
Predicted probability: 0.5000473260879517

Testing with batch:
Input batch shape: torch.Size([5, 768])
After first layer: torch.Size([5, 64])
Final output shape: torch.Size([5, 1])
Predicted probabilities for batch: tensor([0.5000, 0.4992, 0.4992, 0.4976, 0.4988])


In [13]:
import torch

# Assuming we have our weights and biases from before
num_epochs = 100
batch_size = 32
learning_rate = 0.01

# Lists to store metrics
losses = []
accuracies = []

# Training loop
for epoch in range(num_epochs):
    # Shuffle data
    indices = torch.randperm(len(X))
    X_shuffled = X[indices]
    y_shuffled = y[indices]
    
    epoch_losses = []
    epoch_accuracies = []
    
    # Process mini-batches
    for i in range(0, len(X), batch_size):
        # Get mini-batch
        X_batch = X_shuffled[i:i + batch_size]
        y_batch = y_shuffled[i:i + batch_size]
        
        # Forward pass
        # First layer
        hidden = torch.matmul(X_batch, weights1) + bias1
        hidden_activated = torch.relu(hidden)
        
        # Second layer
        output = torch.matmul(hidden_activated, weights2) + bias2
        predictions = torch.sigmoid(output)
        
        # Calculate loss (Binary Cross Entropy)
        loss = -torch.mean(
            y_batch.view(-1, 1) * torch.log(predictions + 1e-10) + 
            (1 - y_batch.view(-1, 1)) * torch.log(1 - predictions + 1e-10)
        )
        
        # Backpropagation
        # Output layer gradients
        output_error = predictions - y_batch.view(-1, 1)
        weights2_grad = torch.matmul(hidden_activated.t(), output_error)
        bias2_grad = torch.sum(output_error, dim=0)
        
        # Hidden layer gradients
        hidden_error = torch.matmul(output_error, weights2.t())
        hidden_error = hidden_error * (hidden_activated > 0).float()  # ReLU gradient
        weights1_grad = torch.matmul(X_batch.t(), hidden_error)
        bias1_grad = torch.sum(hidden_error, dim=0)
        
        # Update weights and biases
        weights1 -= learning_rate * weights1_grad
        bias1 -= learning_rate * bias1_grad
        weights2 -= learning_rate * weights2_grad
        bias2 -= learning_rate * bias2_grad
        
        # Calculate accuracy for this batch
        predicted_labels = (predictions >= 0.5).float()
        accuracy = (predicted_labels == y_batch.view(-1, 1)).float().mean()
        
        epoch_losses.append(loss.item())
        epoch_accuracies.append(accuracy.item())
    
    # Average metrics for this epoch
    avg_loss = sum(epoch_losses) / len(epoch_losses)
    avg_accuracy = sum(epoch_accuracies) / len(epoch_accuracies)
    losses.append(avg_loss)
    accuracies.append(avg_accuracy)
    
    # Print progress every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch {epoch}/{num_epochs}")
        print(f"Loss: {avg_loss:.4f}")
        print(f"Accuracy: {avg_accuracy:.4f}\n")

print("Training completed!")
print(f"Final Loss: {losses[-1]:.4f}")
print(f"Final Accuracy: {accuracies[-1]:.4f}")

Epoch 0/100
Loss: 0.6905
Accuracy: 0.6006

Epoch 10/100
Loss: 0.2037
Accuracy: 0.9652

Epoch 20/100
Loss: 0.0241
Accuracy: 1.0000

Epoch 30/100
Loss: 0.0100
Accuracy: 1.0000

Epoch 40/100
Loss: 0.0049
Accuracy: 1.0000

Epoch 50/100
Loss: 0.0029
Accuracy: 1.0000

Epoch 60/100
Loss: 0.0022
Accuracy: 1.0000

Epoch 70/100
Loss: 0.0017
Accuracy: 1.0000

Epoch 80/100
Loss: 0.0014
Accuracy: 1.0000

Epoch 90/100
Loss: 0.0012
Accuracy: 1.0000

Training completed!
Final Loss: 0.0011
Final Accuracy: 1.0000
