In [2]:
!pip install import-ipynb



In [3]:
import numpy as np
import torch
import torch.nn as nn
import import_ipynb
from google.colab import drive

In [4]:
drive.mount("mnt")
%cd "mnt/My Drive/Colab Notebooks/Advanced ML/PonderNet"

Mounted at mnt
/content/mnt/My Drive/Colab Notebooks/Advanced ML/PonderNet


In [14]:
import ParityPonderNet as Chird

## Dataset Example

In [15]:
demo_data = Chird.CustomData(8, 10, 0)                                                #Create dataset with 10 samples of 8 features each with seed 0

In [16]:
# Report the dataset size and inspect its first item (index 0).
print(f"No. of samples: {len(demo_data)}")
data_sample = demo_data[0]                                                      #Each item is indexed as (features, parity label)
print(f"Sample 0 Features: {data_sample[0]}")
print(f"Sample 0 Parity: {data_sample[1]}")

No. of samples: 10
Sample 10 Fetures: tensor([-1.,  0., -1.,  0.,  0.,  1., -1.,  1.])
Sample 10 Parity: 0


In [17]:
# Wrap the dataset in a DataLoader that yields mini-batches of 2 samples,
# loaded by 2 worker processes.
demo_dataloader = torch.utils.data.DataLoader(demo_data, 
                batch_size=2, num_workers=2)                                    #Create dataloader and stream data in batches


for x,y in demo_dataloader:                                                     #Each iteration yields one batch of up to batch_size samples (fewer on the last batch if the dataset size is not divisible by batch_size)
    print(f"Features are: {x}\n")
    print(f"Target is: {y}")

    break                                                                       #Only the first batch is shown

Features are: tensor([[ 1.,  1., -1.,  1.,  1.,  1., -1., -1.],
        [ 0.,  1.,  0.,  1.,  1., -1.,  1.,  1.]])

Target is: tensor([1, 1])


## Model Example

In [20]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# PonderNet for the parity task with a GRU cell as the step function.
# NOTE(review): the positional arguments look like (input features = 8,
# step-function cell, max ponder steps = 5, hidden size = 64) -- confirm
# against the ParityPonderNetwork definition.
model = Chird.ParityPonderNetwork(
    8,
    nn.GRUCell,
    5,
    64,
    seed=0,
)
model = model.to(device, torch.float32)                                         #Move the model to the chosen device as float32

# Adam optimizer over all model parameters with a learning rate of 0.0003.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

In [21]:
# Push a single batch through the model with demo=True; the recorded output
# below shows that this prints the internals of each ponder step (e.g. the
# hidden states).
for x,y in demo_dataloader:                                                     

    features, labels = x.to(device, torch.float32), y.to(device, 
                        torch.float32)                                          #Move the features and labels to the device as float32
    
    predictions, probabilities, _, _ = model(features, demo=True)               #Run forward propagation for PonderNet (the last two outputs are not used here)
    break                                                                       #One batch is enough for the demo

**********************************************PONDER STEP 0**************************************************

The hidden states are: tensor([[ 0.0744,  0.0085,  0.0140,  0.0910, -0.0647, -0.0382, -0.0912, -0.0809,
          0.0560, -0.0381,  0.0811, -0.1058, -0.1844,  0.1014,  0.0775,  0.0974,
         -0.0413, -0.0663, -0.0702,  0.1249, -0.0238,  0.1550, -0.0129,  0.0297,
         -0.0467, -0.0330,  0.1171,  0.0295, -0.0932, -0.0182, -0.0583,  0.0359,
         -0.0564, -0.0211, -0.1543, -0.0022,  0.0010, -0.1314,  0.0515, -0.1607,
         -0.0400, -0.1312, -0.1036, -0.0315,  0.0851,  0.0578,  0.1147, -0.0645,
         -0.0649, -0.0611, -0.0661,  0.0726, -0.0079, -0.0694, -0.0110,  0.0355,
          0.0476,  0.0510, -0.0137, -0.0473,  0.0501,  0.0377, -0.0847, -0.0301],
        [ 0.0775, -0.0548, -0.0422,  0.0977, -0.0339,  0.0744,  0.0317,  0.1150,
          0.0120, -0.2050,  0.1565,  0.1546, -0.1033,  0.0624, -0.0599,  0.0008,
         -0.0234,  0.0638,  0.0385,  0.0113, -0.0985,  

## Loss Example

In [22]:
# Build the PonderNet loss module and move it to the same device/dtype as the model.
# NOTE(review): the first argument (5) presumably is the maximum number of
# ponder steps, matching the value given to the model -- confirm against PonderLoss.
model_loss = Chird.PonderLoss(5, 
                nn.BCEWithLogitsLoss(reduction='none'), 
                    0.3, 0.01).to(device, torch.float32)                        #Init the model loss with binary cross-entropy (reduction='none' keeps one loss per sample), a lambda probability of 0.3 and beta of 0.01

### Reconstruction Loss

In [23]:
# One pass over the demo batches: forward, loss, and a parameter update each.
# NOTE(review): `i` is the batch index within this single pass, even though
# the printed banner labels it "EPOCH".
for i, (x, y) in enumerate(demo_dataloader):
    # Move the batch to the model's device as float32.
    features = x.to(device, torch.float32)
    labels = y.to(device, torch.float32)

    # Forward propagation for PonderNet (the last two outputs are unused here).
    predictions, probabilities, _, _ = model(features)
    print(f"*****************************EPOCH {i}***********************************")

    # Total loss of the predictions; the trailing positional True appears to
    # enable the loss module's printout of the reconstruction-loss terms.
    loss = model_loss(probabilities, labels, predictions, True)

    # Gradient-descent step: clear old gradients, backpropagate, update parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

*****************************EPOCH 0***********************************
The probabilities are: tensor([[0.4744, 0.4744],
        [0.2449, 0.2526],
        [0.1300, 0.1324],
        [0.0697, 0.0686],
        [0.0810, 0.0719]], grad_fn=<CopySlices>)
BCE loss at step 0 = tensor([0.6594, 0.7280], grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Weighted loss at step 0 = tensor([0.3128, 0.3454], grad_fn=<MulBackward0>)
BCE loss at step 1 = tensor([0.6462, 0.7471], grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Weighted loss at step 1 = tensor([0.1583, 0.1887], grad_fn=<MulBackward0>)
BCE loss at step 2 = tensor([0.6414, 0.7606], grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Weighted loss at step 2 = tensor([0.0834, 0.1007], grad_fn=<MulBackward0>)
BCE loss at step 3 = tensor([0.6405, 0.7705], grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
Weighted loss at step 3 = tensor([0.0446, 0.0529], grad_fn=<MulBackward0>)
BCE loss at step 4 = tensor([0.6408, 0.7781], grad_fn=<BinaryCrossEntrop

### Regularization Loss

In [24]:
#Create the geometric prior over halting steps, truncated to MAX_STEPS entries:
#   p_G(n) = LAMBDA_P * (1 - LAMBDA_P)**(n - 1)   for n = 1..MAX_STEPS
LAMBDA_P = 0.3                                                                  #Per-step halting probability of the prior (same value as passed to PonderLoss above)
MAX_STEPS = 5                                                                   #Number of ponder steps covered by the prior

p_not_halted = 1.                                                               #Helper variable: probability of not having stopped before the current step
geometric_probs = []
for _ in range(MAX_STEPS):
    geometric_probs.append(p_not_halted * LAMBDA_P)                             #Probability of halting exactly at this step
    p_not_halted = p_not_halted * (1 - LAMBDA_P)                                #Update the prob. of not halting; it shrinks geometrically with every step

#The sum stays below 1 because the distribution is cut off after MAX_STEPS
#steps; the remaining mass is what is left in p_not_halted.
print(f"List representation of the geometric probability distribution: {geometric_probs}\n")
print(f"The sum of the geometric prob. distribution: {sum(geometric_probs)}")

List representation geometric_probabaility distributino: [0.3, 0.21, 0.14699999999999996, 0.10289999999999998, 0.07202999999999997]

The sum of the geometric prob. distribution: 0.83193


In [25]:
# One pass over the demo batches: forward, loss, and a parameter update each.
# NOTE(review): `i` is the batch index within this single pass, even though
# the printed banner labels it "EPOCH".
for i, (x, y) in enumerate(demo_dataloader):
    # Move the batch to the model's device as float32.
    features = x.to(device, torch.float32)
    labels = y.to(device, torch.float32)

    # Forward propagation for PonderNet (the last two outputs are unused here).
    predictions, probabilities, _, _ = model(features)
    print(f"*****************************EPOCH {i}***********************************")

    # Total loss of the predictions; demo_reg=True makes the loss module print
    # the regularisation-loss details seen in the recorded output.
    loss = model_loss(probabilities, labels, predictions, demo_reg=True)

    # Gradient-descent step: clear old gradients, backpropagate, update parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

*****************************EPOCH 0***********************************
The probabilities are: tensor([[0.4745, 0.2405, 0.1283, 0.0701, 0.0866],
        [0.4745, 0.2563, 0.1330, 0.0677, 0.0685]],
       grad_fn=<TransposeBackward0>)
Batch geo probs: tensor([[0.3000, 0.2100, 0.1470, 0.1029, 0.0720],
        [0.3000, 0.2100, 0.1470, 0.1029, 0.0720]])

The regularisation loss is: -0.1188916563987732


*****************************EPOCH 1***********************************
The probabilities are: tensor([[0.4745, 0.2599, 0.1332, 0.0667, 0.0657],
        [0.4745, 0.2530, 0.1323, 0.0684, 0.0718]],
       grad_fn=<TransposeBackward0>)
Batch geo probs: tensor([[0.3000, 0.2100, 0.1470, 0.1029, 0.0720],
        [0.3000, 0.2100, 0.1470, 0.1029, 0.0720]])

The regularisation loss is: -0.1177581325173378


*****************************EPOCH 2***********************************
The probabilities are: tensor([[0.4745, 0.2469, 0.1299, 0.0690, 0.0797],
        [0.4745, 0.2640, 0.1343, 0.0659, 0.0613]],


## Halting Example

In [26]:
# Inference demo: put the model in evaluation mode and disable gradient
# tracking, then run every demo batch in prediction mode. With
# demo_halting=True the recorded output shows, per batch, the halting indices
# and the ponder step at which the batch stopped.
# Note: model.eval() is not undone here, so the model stays in eval mode afterwards.
model.eval()
with torch.no_grad():
    for i, (x,y) in enumerate(demo_dataloader):                                                     

        features, labels = x.to(device, torch.float32), y.to(device, 
                            torch.float32)
        
        predictions, probabilities, _, _ = model(features, is_prediction=True, 
                                                demo_halting=True)                 #Run forward propagation for PonderNet in prediction mode

The halted batch indeces are: tensor([5., 3.])
The batch was halted at ponder step: 4

The halted batch indeces are: tensor([1., 1.])
The batch was halted at ponder step: 0

The halted batch indeces are: tensor([1., 1.])
The batch was halted at ponder step: 0

The halted batch indeces are: tensor([2., 3.])
The batch was halted at ponder step: 2

The halted batch indeces are: tensor([2., 2.])
The batch was halted at ponder step: 1

