In [None]:
%load_ext autoreload
%autoreload 2

# TODO: figure out why the manual gradient updates are not working.

import torch
import os,sys
import copy
from datetime import datetime
import numpy as np
import torch.nn as nn
from CCustomTokenizer import CCustomTokenizer
from CCustomInference import CCustomInference
from Decoder import Decoder

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from DifferentialPrivacy.CDP_SGD import CDP_SGD

In [None]:
# CUDA assertions: request synchronous CUDA kernel launches so that a failing
# device-side call is reported at the statement that issued it (debugging aid).
# NOTE(review): `os` is already imported in the first cell; this re-import is redundant but harmless.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
# Print the current working directory: the tokenizer's data path below is
# relative, so it only resolves when the notebook runs from the expected folder.
import os
print(os.getcwd())
# Load the tokenizer from the sample-sentences file and report its vocabulary size
customTokenizer = CCustomTokenizer("../../data/SampleSentencesCorrected.txt")
print("Number of tokens:",customTokenizer.getVocabSize())

In [None]:
# Specify the dimensions for the Transformer model (all are passed to Decoder below)
dimEmbeddings = 64 # embedding width, passed as d_model
# The embedding layer is indexed by token id, so it is sized from the largest token id.
# NOTE(review): if token ids are 0-based this may need max id + 1 - confirm against CCustomTokenizer.
VocabSize = customTokenizer.getMaxTokenId()
maxLen = customTokenizer.getMaxLen() # maximum sequence length, passed as max_len
attentionKeysSize = 16 # size of q, k and v. Attention output size = noOfHeads*attentionKeysSize
noOfHeads = 4 # attention heads per block
noOfTransformerBlocks = 2 # number of stacked transformer blocks (n_layers)

Define the Decoder, set the specific dimensions

In [None]:
# Seed torch's RNG before constructing the model so the initial weights are
# reproducible across runs.
torch.manual_seed(42) # Set seed for reproducibility across runs
# Build the decoder from the dimensions chosen in the previous cell.
model = Decoder(vocab_size = VocabSize,
                 max_len= maxLen, 
                 d_k = attentionKeysSize, 
                 d_model = dimEmbeddings, 
                 n_heads = noOfHeads, 
                 n_layers = noOfTransformerBlocks,
                 dropout_prob = 0.0) # dropout disabled; 0.1 noted as an alternative value


In [None]:
# Report the model's parameter count using its own helper.
# Manual count kept for reference (presumably equivalent to getParamCount() - TODO confirm):
#paramCount = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("# Trainable model parameters:", model.getParamCount())

In [None]:
# Trainable parameter count for every sub-module, printed in cascading order.
# named_modules() walks the module tree recursively, so a container's count
# includes the counts of the modules nested inside it.
# Worked example: embeddings layer, input: 43, output: 64.
# Total params: 43*64 = 2752, embeddings do not have bias.
for name, layer in model.named_modules():
    if name == "":
        # The empty name is the root module itself; skip it.
        continue
    total_params = sum(
        p.numel() for p in layer.parameters() if p.requires_grad
    )
    print(f"Layer: {name:<10} Parameters: {total_params}")

In [None]:
# Report whether a CUDA device is visible to torch.
print ("CUDA:",torch.cuda.is_available())
# The device is hard-coded to the CPU; the commented-out expression would
# select cuda:0 whenever CUDA is available.
device = "cpu" #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Move the model to the chosen device (as the cell's last expression, the
# returned module is also displayed).
model.to(device)

Inference Routines

In [None]:
def Infer(prompt, temperature=0.0, topP=1.0):
    """Generate a continuation for `prompt` with the notebook's model and print it.

    Uses the notebook-level `model`, `customTokenizer` and `device`.

    Args:
        prompt: text to start generation from.
        temperature: forwarded to CCustomInference.getInferenceOutput.
        topP: forwarded to CCustomInference.getInferenceOutput.

    Returns:
        None; the generated output is printed.
    """
    engine = CCustomInference(model, customTokenizer, device, debug=False)
    generated = engine.getInferenceOutput(prompt, temperature=temperature, topP=topP)
    # Original note: all output is lower case.
    print(f"{generated}")


In [None]:
# Test inference
def testInfer_1(temperature=0.0):
    """Print the model's output for the single-word prompt "Romi"."""
    Infer("Romi", temperature)

def testInfer_2(temperature=0.0):
    """Print the model's output for the partial-sentence prompt "Romi is a"."""
    Infer("Romi is a", temperature)

def testInfer_3(temperature=0.0):
    """Print the model's output for an empty prompt."""
    Infer("", temperature)
    
# Check inference with the current model. At this point in the notebook the
# training cell has not run yet, so this shows the baseline output.
testInfer_1(0)
#testInfer_2(0.0)

In [None]:
# Prepare the training data: fetch every tokenized training row and pack the
# rows into a single tensor (one row per sentence).
# NOTE(review): torch.tensor needs rows of equal length, so the tokenizer
# presumably pads them - confirm in CCustomTokenizer.
trainData = customTokenizer.getAllTrainingRows()
trainDataTensor = torch.tensor(trainData)
print("Train data shape:", trainDataTensor.shape)
# Shape is [154, 12]: 154 samples with 12 tokens

Main training loop

In [None]:
# Switch between the manual DP-SGD update (True) and the plain torch optimizer (False).
useDPSGD = True
# NOTE(review): train() does not write to this module-level name, so it stays None.
bestModel = None
# Learning rate and epoch count depend on which update rule is active.
if useDPSGD:
  learningRate = 0.001 # Can implement adaptive learning rate
  noOfEpochs = 30
else:
  learningRate = 0.01
  noOfEpochs = 10
#------ DP_SGD Parameters (passed to CDP_SGD inside train())
eps = .1 # privacy parameter epsilon
delta = 1e-7 # privacy parameter delta (.5 noted as an alternative value)

# A function to encapsulate the training loop
# N - batch size 
# T - sequence length (number of tokens in a sentence)
# V - vocab size
def train(model, criterion, optimizer, epochs):
  """Train `model` one sample per step and return the mean loss of each epoch.

  Shape legend used in the comments below:
    N - batch size (always 1 here: a single row per step)
    T - sequence length (number of tokens in a sentence)
    V - vocab size

  Reads the notebook-level names `trainDataTensor`, `device`,
  `customTokenizer`, `useDPSGD`, `learningRate`, `eps` and `delta`.

  Args:
    model: network to train. It is updated in place; on return it holds the
      weights captured after the epoch with the lowest mean training loss.
    criterion: loss, called as criterion(logits, targets) with logits shaped
      N x V x T and targets shaped N x T.
    optimizer: performs the parameter update only when `useDPSGD` is False.
    epochs: maximum number of passes over the training rows.

  Returns:
    numpy array of length `epochs` holding the mean training loss per epoch.
    Entries for epochs that were skipped because the privacy budget ran out
    keep their initial value of 0.
  """
  train_losses = np.zeros(epochs)
  lowestLoss = sys.float_info.max
  # Snapshot of the best weights so far. Initialised up front so the restore
  # step below is safe even if no epoch ever improves on `lowestLoss`
  # (e.g. epochs == 0 or the loss is NaN throughout).
  bestState = None
  # Create the DP-SGD object here so that it is re-initialised on each call.
  countRowsTrain = trainDataTensor.shape[0]
  dpsgd = CDP_SGD(model=model, learningRate=learningRate,delta=delta,eps=eps,totalSamples=countRowsTrain) # Use defaults

  for epoch in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    # Go through each sample in the training dataset and update the model
    # parameters after each sample, like SGD.

    idxTrain = 0
    for i in range(countRowsTrain):
      # For DP-SGD the values have to be picked with sampling probability L/N.
      # In our case L = 1.
      if useDPSGD:
        # Pick a random sample from the training data
        idxTrain = np.random.randint(0, countRowsTrain)
      else:
        idxTrain = i # Go through all samples sequentially in the training data
      # NOTE(review): temporary override left in place by the author - it
      # discards the random pick above, so rows are always visited in order.
      idxTrain = i # Temporary
      # One row with a leading batch axis: shape 1 x T
      x_t = trainDataTensor[idxTrain].unsqueeze(0).to(device)

      # Clear gradients left over from the previous step
      if not useDPSGD:
        optimizer.zero_grad()
      else:
        dpsgd.setZeroGrads()

      # Shift targets backwards (next-token prediction):
      # Original: <CLS> The cat sat on the mat <SEP>
      # Becomes:  The cat sat on the mat <SEP> <PAD>
      targets = x_t.clone().detach()
      # shifts=-1 rolls the targets left by one position
      targets = torch.roll(targets, shifts=-1, dims=1)
      # The roll wraps the first token round to the end; overwrite it with
      # PAD, which the loss ignores.
      targets[:, -1] = customTokenizer.getPadTokenId()

      # Forward pass. Outputs are N x T x V but the loss expects N x V x T.
      outputs = model(x_t)
      transposedOutputs = outputs.transpose(2, 1)
      loss = criterion(transposedOutputs, targets)

      # Backward pass: populates .grad on the model parameters
      loss.backward()

      # Apply the parameter update
      if useDPSGD:
        dpsgd.singleStep()
      else:
        optimizer.step()
      train_loss.append(loss.item())

    # Mean loss over all steps of this epoch
    train_loss = np.mean(train_loss)
    train_losses[epoch] = train_loss

    if train_loss < lowestLoss:
      # Deep-copy the tensors: state_dict() returns references to the live
      # parameters, which keep changing in later epochs.
      bestState = copy.deepcopy(model.state_dict())
      lowestLoss = train_loss

    dt = datetime.now() - t0
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')
    if useDPSGD:
      (spentEps, spentDelta) = dpsgd.getPrivacySpent()
      print(f"*** Using DPSGD, Spent: ε={spentEps}, δ={spentDelta}  ***")
      if dpsgd.hasReachedPrivacyLimit(): # Checked at the end of each epoch
        print("Privacy budget used up, stopping training")
        break

  # Restore the best weights into the caller's model object. Rebinding the
  # local name `model` would not affect the caller, so load in place instead.
  if bestState is not None:
    print(f"### Best model has loss: {lowestLoss:.4f}")
    model.load_state_dict(bestState)

  return train_losses

In [None]:
# Loss and optimizer
# CrossEntropyLoss is meant for classification problems
# 
# from: https://developers.google.com/machine-learning/glossary/#logits
# logits:The vector of raw (non-normalized) predictions that a classification model generates, 
# which is ordinarily then passed to a normalization function
# 
# input: The input is expected to contain the unnormalized logits for each class (which do not need to be positive or sum to 1, in general)
# hence the input will be a vector
#
# target: class indices in the range [0, C), where C is the number of classes;
# if ignore_index is specified, this loss also accepts this class index 
# (this index may not necessarily be in the class range).
# Here the number of classes is the vocab size

# Cross-entropy over the vocabulary; positions whose target is the PAD token
# are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index= customTokenizer.getPadTokenId()) 
# Set the optimizer
# Use SGD to be consistent with the manual update used when useDPSGD is True
#optimizer = torch.optim.Adam(model.parameters(),lr=learningRate)
optimizer = torch.optim.SGD(model.parameters(),lr=learningRate)

In [None]:
# Start the training loop; the returned array holds the mean training loss
# of each epoch.
train_losses = train(
    model, criterion, optimizer, epochs=noOfEpochs)

The model has now been trained; the following sections deal with model inference and metrics.

In [None]:
# Run the "Romi is a" prompt at temperature 0 now that the training cell has run.
testInfer_2(0)