In [5]:
%load_ext autoreload
%autoreload 2

#from transformers import AutoTokenizer, DataCollatorWithPadding
import torch
import os,sys
from datetime import datetime
import numpy as np
import torch.nn as nn
from CCustomTokenizer import CCustomTokenizer
from CCustomInference import CCustomInference
from Decoder import Decoder

# Put the parent directory on the import path (once) so the package imported
# right after this resolves.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from DifferentialPrivacy.CDP_SGD import CDP_SGD

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# CUDA assertions: run kernel launches synchronously so a failing device-side
# assert is reported at the call that triggered it.
import os
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

In [7]:
# Show the working directory; the corpus path below is relative, so it
# presumably depends on where the notebook is run from.
import os
print(os.getcwd())
# Build the tokenizer from the sample-sentence corpus and report its vocabulary size
corpusPath = "../../data/SampleSentencesCorrected.txt"
customTokenizer = CCustomTokenizer(corpusPath)
print("Number of tokens:", customTokenizer.getVocabSize())

c:\ChaitanyaBelwal\ACTIVE\Development\GitHub\Python\MachineLearning\ReferenceCode\NLP\Transformers\TransformersDP\Decoder
Number of tokens: 43


In [8]:
# Specify the model dimensions (consumed by the Decoder constructor)
dimEmbeddings = 64 # embedding size: 64 dimensions per token
# NOTE(review): nn.Embedding needs num_embeddings > largest token id; confirm
# getMaxTokenId() already returns (largest id + 1) rather than the largest id.
VocabSize = customTokenizer.getMaxTokenId() # The embedding layer is index based, so its size is taken from the token ids
maxLen = customTokenizer.getMaxLen()
attentionKeysSize = 16 # size of q, k and v. Attention output size = noOfHeads*attentionKeysSize
noOfHeads = 4
noOfTransformerBlocks = 2

Define the Decoder and set its dimensions from the values specified above.

In [9]:
# Set the seed first, for reproducibility across runs
torch.manual_seed(42)
model = Decoder(
    vocab_size=VocabSize,
    max_len=maxLen,
    d_model=dimEmbeddings,
    d_k=attentionKeysSize,
    n_heads=noOfHeads,
    n_layers=noOfTransformerBlocks,
    dropout_prob=0.0,  # dropout disabled; 0.1 is noted as the alternative value
)


In [10]:
# Report the trainable parameter count using the model's own helper.
# Manual count kept for reference (presumably equivalent to getParamCount() - confirm):
#paramCount = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("# Trainable model parameters:", model.getParamCount())

# Trainable model parameters: 105643


In [11]:
# Trainable parameter count for every named sub-module, in cascading format:
# a parent module is listed before its children and its total includes theirs.
# Example - embeddings layer, input: 43, output: 64.
# Total params: 43*64 = 2752, embeddings do not have bias.
for name, layer in model.named_modules():
    if name == "":
        continue  # skip the entry with the empty name (the top-level model)
    total_params = 0
    for param in layer.parameters():
        if param.requires_grad:
            total_params += param.numel()
    print(f"Layer: {name:<10} Parameters: {total_params}")

Layer: embedding  Parameters: 2752
Layer: pos_encoding Parameters: 0
Layer: pos_encoding.dropout Parameters: 0
Layer: transformer_blocks Parameters: 99968
Layer: transformer_blocks.0 Parameters: 49984
Layer: transformer_blocks.0.ln1 Parameters: 128
Layer: transformer_blocks.0.ln2 Parameters: 128
Layer: transformer_blocks.0.mha Parameters: 16640
Layer: transformer_blocks.0.mha.key Parameters: 4160
Layer: transformer_blocks.0.mha.query Parameters: 4160
Layer: transformer_blocks.0.mha.value Parameters: 4160
Layer: transformer_blocks.0.mha.fc Parameters: 4160
Layer: transformer_blocks.0.ann Parameters: 33088
Layer: transformer_blocks.0.ann.0 Parameters: 16640
Layer: transformer_blocks.0.ann.1 Parameters: 0
Layer: transformer_blocks.0.ann.2 Parameters: 16448
Layer: transformer_blocks.0.ann.3 Parameters: 0
Layer: transformer_blocks.0.dropout Parameters: 0
Layer: transformer_blocks.1 Parameters: 49984
Layer: transformer_blocks.1.ln1 Parameters: 128
Layer: transformer_blocks.1.ln2 Parameters: 

In [12]:
# Report whether CUDA is available, then pick the device used by the model and the data.
print ("CUDA:",torch.cuda.is_available())
# Pinned to CPU; the commented-out expression would select cuda:0 when available.
device = "cpu" #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Last expression of the cell, so the notebook also displays the module tree.
model.to(device)

CUDA: True
cpu


Decoder(
  (embedding): Embedding(43, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.0, inplace=False)
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, el

Inference Routines

In [13]:
def Infer(prompt, temperature=0.0, topP=1.0):
    """Generate text for `prompt` and print it.

    A new CCustomInference is built on every call from the notebook-level
    `model`, `customTokenizer` and `device`, with debug output disabled.

    Args:
        prompt: text handed to getInferenceOutput as the starting prompt.
        temperature: forwarded unchanged to getInferenceOutput.
        topP: forwarded unchanged to getInferenceOutput.
    """
    engine = CCustomInference(model, customTokenizer, device, debug=False)
    generated = engine.getInferenceOutput(prompt, temperature=temperature, topP=topP)
    print(f"{generated}") # All are lower case


In [32]:
# Test inference
def testInfer_1(temperature=0.0):
    prompt = "Romi"
    Infer(prompt, temperature)

def testInfer_2(temperature=0.0):
    prompt = "" 
    Infer(prompt, temperature)
    
# Check inference with current model (prompt "Romi", temperature 0).
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# run was stopped by hand - confirm that generation terminates at this point.
testInfer_1(0)
# Empty-prompt variant, currently disabled:
#testInfer_2(0.0)

KeyboardInterrupt: 

In [15]:
# Prepare the training data: fetch every training row from the tokenizer and
# pack the rows into one tensor (one row per sample).
trainData = customTokenizer.getAllTrainingRows()
# assumes every row has the same length and holds integer token ids - TODO confirm
trainDataTensor = torch.tensor(trainData)
print("Train data shape:", trainDataTensor.shape)
# Shape is [154, 12]: 154 samples with 12 tokens

Train data shape: torch.Size([154, 12])


Main training loop

In [29]:
# Learning rate handed to CDP_SGD below (the Adam optimizer sets its own lr).
learningRate = 0.01
# True: train() steps with dpsgd.singleStep(); False: it uses the passed optimizer.
useDPSGD = True
#------ DP_SGD Parameters
# NOTE(review): eps and delta are defined here but not passed to CDP_SGD, which
# is built with its defaults - confirm whether they were meant to be wired in.
eps = .5
delta = 1e-7 #.5
dpsgd = CDP_SGD(model, learningRate) # Use defaults

# A function to encapsulate the training loop
# N - batch size
# T - sequence length (number of tokens in a sentence)
# V - vocab size
def train(model, criterion, optimizer, epochs):
  """Train `model` one sample at a time and return the mean train loss per epoch.

  Besides its arguments, the function reads these notebook-level names:
  trainDataTensor, device, customTokenizer, useDPSGD and dpsgd.

  When useDPSGD is True a random row is drawn for every step and the update is
  done by dpsgd.singleStep(); training stops completely as soon as
  dpsgd.hasReachedPrivacyLimit() reports that the budget is spent. Otherwise
  the rows are visited in order and `optimizer` performs the update.

  Args:
    model: called as model(x) on a 1 x T batch of token ids; expected to
      return N x T x V outputs.
    criterion: loss called as criterion(outputs as N x V x T, targets as N x T).
    optimizer: optimizer over the model parameters; only used when useDPSGD
      is False.
    epochs: number of passes over the training rows.

  Returns:
    numpy array of length `epochs` with the mean loss of each epoch. Epochs
    in which no step was recorded (for example after the privacy budget ran
    out) are left as NaN.
  """
  # NaN marks epochs that never recorded a training step
  train_losses = np.full(epochs, np.nan)
  countRowsTrain = trainDataTensor.shape[0]

  for epoch in range(epochs):
    model.train()
    t0 = datetime.now()
    step_losses = []
    budgetExhausted = False

    # Go through each row of the training data and update the model
    # parameters after each sample, like SGD
    for i in range(countRowsTrain):
      # For DP-SGD the values have to be picked with sampling probability L/N
      # In our case L = 1
      if useDPSGD:
        # Pick a random sample from the training data
        idxTrain = np.random.randint(0, countRowsTrain)
      else:
        idxTrain = i # Go through all samples sequentially in the training data
      x_t = trainDataTensor[idxTrain].unsqueeze(0).to(device)

      if useDPSGD:
        (spentEps, spentDelta) = dpsgd.getPrivacySpent()
        print(f"*** Using DPSGD, Spent: eps={spentEps}, delta={spentDelta}  ***")

        if dpsgd.hasReachedPrivacyLimit():
          print("Privacy budget used up, stopping training")
          budgetExhausted = True
          break
        # backward() accumulates into .grad, so clear the gradients here as
        # well; otherwise every DP step would see the sum of all previous
        # samples' gradients (harmless if singleStep() already clears them)
        model.zero_grad()
      else:
        optimizer.zero_grad() # set all grads to 0

      # shift targets backwards
      # Original: <CLS> The cat sat on the mat <SEP>
      # Becomes: The cat sat on the mat <SEP> <PAD>
      targets = x_t.clone().detach()
      # shifts = -1, will shift the target to left by 1
      targets = torch.roll(targets, shifts=-1, dims=1)
      # PAD token is ignored in the loss so set last token to PAD
      targets[:, -1] = customTokenizer.getPadTokenId()

      # Forward pass
      outputs = model(x_t)
      # outputs are N x T x V
      # but PyTorch expects N x V x T
      transposedOutputs = outputs.transpose(2, 1)
      loss = criterion(transposedOutputs, targets)

      # Backward and optimize
      # This will update the grad values in the model parameters
      loss.backward()

      # Apply DP-SGD here
      if useDPSGD:
        dpsgd.singleStep()
      else:
        optimizer.step() # update the parameters
      step_losses.append(loss.item())

    # Record and report the epoch only if at least one step ran; this avoids
    # taking the mean of an empty list
    if step_losses:
      train_loss = np.mean(step_losses)
      train_losses[epoch] = train_loss
      dt = datetime.now() - t0
      print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

    # Leave the epoch loop too: once the budget is spent no further step is allowed
    if budgetExhausted:
      break
  return train_losses

In [17]:
# Loss and optimizer
# CrossEntropyLoss is meant for classification problems.
#
# From: https://developers.google.com/machine-learning/glossary/#logits
# logits: the vector of raw (non-normalized) predictions that a classification
# model generates, which is ordinarily then passed to a normalization function.
#
# input: expected to contain the unnormalized logits for each class (which do
# not need to be positive or sum to 1, in general), hence the input is a vector.
#
# target: class indices in the range [0, C), where C is the number of classes;
# if ignore_index is specified, this loss also accepts that class index
# (the index may not necessarily be in the class range).
# Here the number of classes is the vocab size.
#
# The PAD token id is passed as ignore_index, so padded positions do not
# contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index= customTokenizer.getPadTokenId())
# Set the optimizer. train() only steps it when useDPSGD is False; its lr
# (0.001) is independent of the learningRate value handed to CDP_SGD.
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
#print(list(model.parameters()))
# Start the training loop; the result holds one mean train loss per epoch.
# NOTE(review): the saved output of this cell is
# "TypeError: sqrt(): argument 'input' (position 1) must be Tensor, not float".
# train() never calls sqrt itself, so the failure presumably comes from
# CDP_SGD - confirm before relying on the cells that follow.
train_losses = train(
    model, criterion, optimizer, epochs=10)

TypeError: sqrt(): argument 'input' (position 1) must be Tensor, not float

The model has now been trained; the following sections deal with model inference and metrics.

In [None]:
# Run inference after training: fixed prompt "Romi", temperature 1
testInfer_1(1)

<CLS> romi is a cat <SEP>
