Training an Encoder Decoer Model in different ways

1. Masked LM - Used for Encoder-Decoder type models (t5,flan-t5 etc)
2. Causal LM - Used for Decoder type models (gpt2,bloom,palm etc)
3. Teacher Forcing - Can be used for both


In [100]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import numpy as np
import torch
from torch.utils.data import Dataset

import re
from tqdm import tqdm
import os
import json

 ## Masked LM/ denoising training

https://huggingface.co/docs/transformers/main/model_doc/t5#training

In [3]:
# utility class for denoised training, taken from hugging face library
class FlaxDataCollatorForT5MLM:
    """
    From https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/run_t5_mlm_flax.py
    """
    def __init__(self,tokenizer,noise_density,mean_noise_span_length) -> None:
        self.tokenizer = tokenizer
        self.noise_density = noise_density
        self.mean_noise_span_length =mean_noise_span_length

    def create_sentinel_ids(self, mask_indices):
        """
        Sentinel ids creation given the indices that should be masked.
        The start indices of each mask are replaced by the sentinel ids in increasing
        order. Consecutive mask indices to be deleted are replaced with `-1`.
        """
        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
        start_indices[:, 0] = mask_indices[:, 0]

        sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
        sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
        sentinel_ids -= mask_indices - start_indices

        return sentinel_ids

    def filter_input_ids(self, input_ids, sentinel_ids):
        """
        Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
        """
        batch_size = input_ids.shape[0]

        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
        # masked tokens coming after sentinel tokens and should be removed
        input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1))
        input_ids = np.concatenate(
            [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1
        )
        return input_ids

    def random_spans_noise_mask(self, length):
        """This function is copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .
        # with the correction of this https://github.com/huggingface/transformers/pull/22938/files
        Noise mask consisting of random spans of noise tokens.
        The number of noise tokens and the number of noise spans and non-noise spans
        are determined deterministically as follows:
        num_noise_tokens = round(length * noise_density)
        num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
        Spans alternate between non-noise and noise, beginning with non-noise.
        Subject to the above restrictions, all masks are equally likely.
        Args:
            length: an int32 scalar (length of the incoming token sequence)
            noise_density: a float - approximate density of output mask
            mean_noise_span_length: a number
        Returns:
            a boolean tensor with shape [length]
        """

        orig_length = length

        num_noise_tokens = int(np.round(length * self.noise_density))
        num_nonnoise_tokens = length - num_noise_tokens
        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
        # num_noise_tokens should be less than num_noise_tokens and num_nonnoise_tokens
        num_noise_spans = int(np.round(min(num_noise_tokens, num_nonnoise_tokens) / self.mean_noise_span_length))

        # avoid degeneracy by ensuring positive number of noise spans
        num_noise_spans = max(num_noise_spans, 1)

        # pick the lengths of the noise spans and the non-noise spans
        def _random_segmentation(num_items, num_segments):
            """Partition a sequence of items randomly into non-empty segments.
            Args:
                num_items: an integer scalar > 0
                num_segments: an integer scalar in [1, num_items]
            Returns:
                a Tensor with shape [num_segments] containing positive integers that add
                up to num_items
            """
            mask_indices = np.arange(num_items - 1) < (num_segments - 1)
            np.random.shuffle(mask_indices)
            first_in_segment = np.pad(mask_indices, [[1, 0]])
            segment_id = np.cumsum(first_in_segment)
            # count length of sub segments assuming that list is sorted
            _, segment_length = np.unique(segment_id, return_counts=True)
            return segment_length

        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
        nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans)

        interleaved_span_lengths = np.reshape(
            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2]
        )
        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
        span_start_indicator = np.zeros((length,), dtype=np.int8)
        span_start_indicator[span_starts] = True
        span_num = np.cumsum(span_start_indicator)
        is_noise = np.equal(span_num % 2, 1)

        return is_noise[:orig_length]


def get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt):
    encoded = tokenizer(prompt, truncation=False, padding=False, return_tensors="pt")
    batch_size =1
    input_length = encoded.input_ids.shape[1]
    denoiser = FlaxDataCollatorForT5MLM(tokenizer,.55,1.5)
    mask_indices = np.asarray([denoiser.random_spans_noise_mask(input_length) for i in range(batch_size)])
    labels_mask = ~mask_indices
    input_ids_sentinel = denoiser.create_sentinel_ids(mask_indices.astype(np.int8))
    labels_sentinel = denoiser.create_sentinel_ids(labels_mask.astype(np.int8))
    input_ids = denoiser.filter_input_ids(encoded.input_ids, input_ids_sentinel)
    labels  =  denoiser.filter_input_ids(encoded.input_ids, labels_sentinel)
    return labels,input_ids



def print_token_id(tokenizer,token):
  # Encode the token
  encoded = tokenizer.encode(token)
  # Print the id
  print(token,encoded[0])
  return encoded[0]

def print_special_tokens(tokenizer):
    # Special tokens and their ids
    special_tokens = {}
    for attr in tokenizer.special_tokens_map:
        special_tokens[attr] = tokenizer.convert_tokens_to_ids(tokenizer.special_tokens_map[attr])

    # Print special tokens
    print(special_tokens)


In [4]:
def shift_tokens_right(input_ids, pad_token_id, eos_token_id):
  """ Shift input ids one token to the right, and add pad token at the first position, and eos token to the last """
  # Create a larger tensor that includes space for the EOS token
  shifted_input_ids = torch.zeros((input_ids.shape[0], input_ids.shape[1] + 1), dtype=input_ids.dtype)

  # Shift input_ids one step to the right
  shifted_input_ids[:, 1:] = input_ids

  # Set the first token to the pad_token_id
  shifted_input_ids[:, 0] = pad_token_id

  # Set the last token to the eos_token_id
  shifted_input_ids[:, -1] = eos_token_id

  return shifted_input_ids

In [5]:
 arr = np.array([[1, 2,3, 4,5]])
 arr = torch.tensor(arr)
 print(shift_tokens_right(arr,0,6))

tensor([[0, 1, 2, 3, 4, 6]])


## Using Masked LM for Seq to Seq model

In [6]:
#from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)# or T5Tokenizer
len_tokenizer =len(tokenizer) # 32100 to get the sentinel ids
print(f"len_tokenizer={len_tokenizer}")

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

len_tokenizer=32100


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [37]:
# This below is what happens in the denoised training
prompt = "The <extra_id_0> walks in <extra_id_1> park"
encoded_prompt = tokenizer(prompt, truncation=False, padding=False, return_tensors="pt").input_ids
print(f"encoded_prompt ={encoded_prompt}")
labels ="<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"
encoded_labels = tokenizer(labels, truncation=False, padding=False, return_tensors="pt").input_ids
print(f"encoded_labels ={encoded_labels}")
print(f"encoded_prompt.shape=encoded_labels.shape {encoded_prompt.shape} ={encoded_labels.shape}")

# simulating the above

print("\n"*2)

prompt = "The cute dog walks in the green park"
labels, input_ids = get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt)
print(f"denoised input_ids decoded = {tokenizer.decode(*input_ids,skip_special_tokens=False)}")
print(f"denoised labels decoded   = {tokenizer.decode(*labels,skip_special_tokens=False)}")
print(f"input_ids.shape {input_ids.shape} labels.shape {labels.shape}") # todo should this be equal

encoded_prompt =tensor([[   37, 32099, 10681,    16, 32098,  2447,     1]])
encoded_labels =tensor([[32099,  5295,  1782, 32098,     8, 32097,     1]])
encoded_prompt.shape=encoded_labels.shape torch.Size([1, 7]) =torch.Size([1, 7])



denoised input_ids decoded = The<extra_id_0> dog<extra_id_1> green park<extra_id_2></s>
denoised labels decoded   = <extra_id_0> cute<extra_id_1> walks in the<extra_id_2></s></s>
input_ids.shape (1, 8) labels.shape (1, 9)


In [39]:
labels2 ="<extra_id_0> cute dog <extra_id_1> the <extra_id_2>"
tokenizer(labels2).input_ids

[32099, 5295, 1782, 32098, 8, 32097, 1]

In [34]:
labels

array([[32099,  5295, 32098, 10681,    16, 32097,  2447,     1,     1]])

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large") # or T5ForConditionalGeneration
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [22]:
def load_json(file, path='../NumEval - Task 3/'):
    with open(os.path.join(path, file), 'r') as json_file:
        data = json.load(json_file)
    return data

'''
read train and dev files
'''
train_data = load_json('Train_Numerical_Reasoning.json')
dev_data = load_json('Dev_Numerical_Reasoning.json')

In [58]:
def process_data(sample, replace_token='mask', task='train'):
    '''
    teacher forcing only during training, hence reasoning prompt would be prepended only to the train samples
    '''
    
    news = sample['news']
    masked_headline = sample['masked headline']
    calculation = sample['calculation']
    ans = " " + str(sample['ans']) + " "
    
    if replace_token=='mask':
        replace_token = "<extra_id_0>"
        input_prompt = news + " " + masked_headline.replace('____', replace_token)
    else:
        input_prompt = news + " " + masked_headline.replace('____', ans)
    
    return input_prompt

def tokenize(sentence):
    return tokenizer.encode_plus(sentence,
                                 max_length=512,
                                 padding='max_length',
                                 truncation="only_first",
                                 return_tensors='pt',
                                 return_attention_mask=True)

In [28]:
train_processed = []
dev_processed = []

for i, sample in tqdm(enumerate(train_data)):
    if i==10575:
        continue
    else:
        train_processed.append(process_data(sample))

for i, sample in tqdm(enumerate(dev_data)):
    dev_processed.append(process_data(sample))

21157it [00:00, 337171.77it/s]
2572it [00:00, 314099.57it/s]


In [52]:
class NumDataset(Dataset):
    
    def __init__(self, prompts):
        'Initialization'
        self.prompts = prompts
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.prompts)

    def __getitem__(self, idx):
        'Generates one sample of data'
        return self.prompts[idx]

In [53]:
params = {
    'batch_size':16,
    'shuffle':False
}

train_set = NumDataset(train_processed)
training_generator = torch.utils.data.DataLoader(train_set, **params)

dev_set = NumDataset(dev_processed)
dev_generator = torch.utils.data.DataLoader(dev_set, **params)

In [55]:
for epoch in range(100):
    for ind, batch in tqdm(enumerate(training_generator)):
        prompts = batch
        
        labels, input_ids = get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt)
        denoised_input_ids = torch.from_numpy(input_ids)
        denoised_labels = torch.from_numpy(labels)
        denoised_attention_mask = torch.ones(input_ids.shape)
        
        outputs = model(input_ids=denoised_input_ids,attention_mask=denoised_attention_mask,
                      labels=denoised_labels)
        loss = outputs.loss
        if ind % 20 == 0:
            print(f"Epoch {epoch}  Loss {loss}")
            loss.backward()
        optimizer.step()
        optimizer.zero_grad()

0it [00:00, ?it/s]

Epoch 0  Loss 0.07789400219917297


20it [00:25,  1.20s/it]

Epoch 0  Loss 0.06955459713935852


40it [00:50,  1.21s/it]

Epoch 0  Loss 0.9799178242683411


60it [01:15,  1.27s/it]

Epoch 0  Loss 0.11499893665313721


80it [01:40,  1.23s/it]

Epoch 0  Loss 0.011922817677259445


100it [02:06,  1.31s/it]

Epoch 0  Loss 0.009716815315186977


120it [02:31,  1.26s/it]

Epoch 0  Loss 0.03928535804152489


140it [02:57,  1.30s/it]

Epoch 0  Loss 0.0045724892988801


160it [03:25,  1.32s/it]

Epoch 0  Loss 0.026859100908041


180it [03:52,  1.30s/it]

Epoch 0  Loss 0.33264219760894775


200it [04:18,  1.31s/it]

Epoch 0  Loss 0.01629958674311638


220it [04:45,  1.35s/it]

Epoch 0  Loss 0.6690669655799866


240it [05:11,  1.31s/it]

Epoch 0  Loss 0.06689233332872391


260it [05:37,  1.29s/it]

Epoch 0  Loss 0.011732911691069603


280it [06:02,  1.27s/it]

Epoch 0  Loss 0.4229007959365845


300it [06:29,  1.26s/it]

Epoch 0  Loss 0.5053970813751221


320it [06:55,  1.28s/it]

Epoch 0  Loss 0.016468189656734467


340it [07:21,  1.32s/it]

Epoch 0  Loss 0.0010785376653075218


360it [07:47,  1.28s/it]

Epoch 0  Loss 0.012188335880637169


380it [08:14,  1.36s/it]

Epoch 0  Loss 0.004730473272502422


400it [08:41,  1.38s/it]

Epoch 0  Loss 0.019742175936698914


420it [09:08,  1.32s/it]

Epoch 0  Loss 0.003066584700718522


440it [09:35,  1.38s/it]

Epoch 0  Loss 0.0064338697120547295


460it [10:02,  1.25s/it]

Epoch 0  Loss 0.0029338602907955647


480it [10:28,  1.27s/it]

Epoch 0  Loss 0.000575218815356493


500it [10:55,  1.29s/it]

Epoch 0  Loss 0.004621327854692936


520it [11:22,  1.34s/it]

Epoch 0  Loss 0.002053274307399988


540it [11:49,  1.36s/it]

Epoch 0  Loss 0.007949749939143658


560it [12:17,  1.41s/it]

Epoch 0  Loss 0.010626201517879963


580it [12:47,  1.46s/it]

Epoch 0  Loss 0.002431541681289673


600it [13:16,  1.47s/it]

Epoch 0  Loss 0.002975636161863804


620it [13:46,  1.45s/it]

Epoch 0  Loss 0.0036247691605240107


640it [14:16,  1.53s/it]

Epoch 0  Loss 0.003343170741572976


660it [14:47,  1.56s/it]

Epoch 0  Loss 0.002862487453967333


680it [15:18,  1.49s/it]

Epoch 0  Loss 0.0016629482852295041


700it [15:49,  1.50s/it]

Epoch 0  Loss 0.007615310605615377


720it [16:20,  1.47s/it]

Epoch 0  Loss 0.0026677215937525034


740it [16:50,  1.49s/it]

Epoch 0  Loss 0.0006694476469419897


760it [17:19,  1.45s/it]

Epoch 0  Loss 0.0009715721826069057


780it [17:48,  1.46s/it]

Epoch 0  Loss 0.001226431573741138


800it [18:17,  1.47s/it]

Epoch 0  Loss 0.001245873630978167


820it [18:46,  1.39s/it]

Epoch 0  Loss 0.00022867444204166532


840it [19:14,  1.38s/it]

Epoch 0  Loss 0.0017261299071833491


860it [19:42,  1.41s/it]

Epoch 0  Loss 0.0009141184855252504


880it [20:11,  1.39s/it]

Epoch 0  Loss 0.0006275668856687844


900it [20:39,  1.39s/it]

Epoch 0  Loss 0.001120431930758059


920it [21:07,  1.43s/it]

Epoch 0  Loss 0.0007306902552954853


940it [21:36,  1.39s/it]

Epoch 0  Loss 0.000612205418292433


960it [22:04,  1.40s/it]

Epoch 0  Loss 0.000849278992973268


980it [22:33,  1.47s/it]

Epoch 0  Loss 0.0011348326224833727


1000it [23:02,  1.39s/it]

Epoch 0  Loss 0.00032187538454309106


1020it [23:30,  1.39s/it]

Epoch 0  Loss 0.0006287314463406801


1040it [23:58,  1.41s/it]

Epoch 0  Loss 0.0003821659192908555


1060it [24:26,  1.39s/it]

Epoch 0  Loss 0.00016275572124868631


1080it [24:55,  1.43s/it]

Epoch 0  Loss 0.0006891119992360473


1100it [25:23,  1.38s/it]

Epoch 0  Loss 0.00037699550739489496


1120it [25:51,  1.38s/it]

Epoch 0  Loss 0.0010686098830774426


1140it [26:19,  1.38s/it]

Epoch 0  Loss 0.0001311562373302877


1160it [26:47,  1.40s/it]

Epoch 0  Loss 0.0004070055147167295


1180it [27:15,  1.37s/it]

Epoch 0  Loss 0.0007928189006634057


1200it [27:43,  1.39s/it]

Epoch 0  Loss 0.00038377640885300934


1220it [28:11,  1.37s/it]

Epoch 0  Loss 0.0006537500885315239


1240it [28:39,  1.39s/it]

Epoch 0  Loss 0.0003150069678667933


1260it [29:07,  1.39s/it]

Epoch 0  Loss 0.00019131542649120092


1280it [29:35,  1.37s/it]

Epoch 0  Loss 0.00016814906848594546


1300it [30:03,  1.43s/it]

Epoch 0  Loss 0.0005395199405029416


1320it [30:32,  1.39s/it]

Epoch 0  Loss 0.0002108645421685651


1323it [30:36,  1.39s/it]
0it [00:00, ?it/s]

Epoch 1  Loss 0.0003030502994079143


20it [00:28,  1.40s/it]

Epoch 1  Loss 0.00018077527056448162


40it [00:56,  1.37s/it]

Epoch 1  Loss 0.00024764114641584456


60it [01:24,  1.46s/it]

Epoch 1  Loss 0.00023361039347946644


80it [01:53,  1.44s/it]

Epoch 1  Loss 8.824485848890617e-05


100it [02:22,  1.38s/it]

Epoch 1  Loss 0.0004252936923876405


120it [02:51,  1.47s/it]

Epoch 1  Loss 0.00016344586038030684


140it [03:19,  1.38s/it]

Epoch 1  Loss 9.261619561584666e-05


160it [03:47,  1.38s/it]

Epoch 1  Loss 0.0008391097653657198


180it [04:16,  1.38s/it]

Epoch 1  Loss 0.0004090219154022634


200it [04:44,  1.39s/it]

Epoch 1  Loss 0.00033194359275512397


220it [05:12,  1.38s/it]

Epoch 1  Loss 0.00024051112995948642


240it [05:41,  1.38s/it]

Epoch 1  Loss 6.818146357545629e-05


260it [06:08,  1.37s/it]

Epoch 1  Loss 0.0001686817704467103


280it [06:37,  1.38s/it]

Epoch 1  Loss 4.097905184607953e-05


300it [07:05,  1.38s/it]

Epoch 1  Loss 7.71735722082667e-05


320it [07:33,  1.38s/it]

Epoch 1  Loss 6.650095019722357e-05


340it [08:01,  1.37s/it]

Epoch 1  Loss 0.00031529448460787535


360it [08:29,  1.38s/it]

Epoch 1  Loss 6.105688953539357e-05


380it [08:57,  1.42s/it]

Epoch 1  Loss 0.0002668470551725477


400it [09:26,  1.38s/it]

Epoch 1  Loss 4.166790677118115e-05


420it [09:54,  1.36s/it]

Epoch 1  Loss 2.4264816602226347e-05


440it [10:22,  1.38s/it]

Epoch 1  Loss 3.705896597239189e-05


460it [10:50,  1.36s/it]

Epoch 1  Loss 4.754846668220125e-05


480it [11:18,  1.39s/it]

Epoch 1  Loss 2.9907134376117028e-05


500it [11:47,  1.39s/it]

Epoch 1  Loss 6.367819878505543e-05


520it [12:15,  1.41s/it]

Epoch 1  Loss 0.00011960459960391745


540it [12:43,  1.38s/it]

Epoch 1  Loss 2.617196696519386e-05


560it [13:11,  1.39s/it]

Epoch 1  Loss 2.2847654690849595e-05


580it [13:40,  1.41s/it]

Epoch 1  Loss 0.00015735754277557135


600it [14:08,  1.38s/it]

Epoch 1  Loss 0.00022422974871005863


620it [14:36,  1.38s/it]

Epoch 1  Loss 0.00013689971819985658


640it [15:04,  1.38s/it]

Epoch 1  Loss 2.1072955860290676e-05


660it [15:31,  1.38s/it]

Epoch 1  Loss 3.19998616760131e-05


680it [15:59,  1.37s/it]

Epoch 1  Loss 5.469996904139407e-05


700it [16:27,  1.36s/it]

Epoch 1  Loss 1.7364440282108262e-05


720it [16:54,  1.38s/it]

Epoch 1  Loss 8.243259799201041e-05


740it [17:23,  1.39s/it]

Epoch 1  Loss 1.4874393855279777e-05


760it [17:51,  1.40s/it]

Epoch 1  Loss 0.0001179239188786596


780it [18:19,  1.37s/it]

Epoch 1  Loss 2.329816015844699e-05


800it [18:47,  1.37s/it]

Epoch 1  Loss 9.209606650983915e-05


820it [19:15,  1.38s/it]

Epoch 1  Loss 3.560205368557945e-05


840it [19:43,  1.37s/it]

Epoch 1  Loss 0.0001031850406434387


860it [20:10,  1.36s/it]

Epoch 1  Loss 0.000416235881857574


880it [20:38,  1.38s/it]

Epoch 1  Loss 0.00017554954683873802


900it [21:06,  1.37s/it]

Epoch 1  Loss 1.0702197869250085e-05


920it [21:34,  1.38s/it]

Epoch 1  Loss 0.0001513587194494903


940it [22:02,  1.43s/it]

Epoch 1  Loss 0.00018704681133385748


960it [22:30,  1.36s/it]

Epoch 1  Loss 0.0001174847930087708


980it [23:08,  2.73s/it]

Epoch 1  Loss 0.00012433278607204556


1000it [23:36,  1.38s/it]

Epoch 1  Loss 0.00013235994265414774


1020it [24:11,  1.38s/it]

Epoch 1  Loss 0.00010571414895821363


1040it [24:39,  1.37s/it]

Epoch 1  Loss 0.0002294821315445006


1060it [25:07,  1.36s/it]

Epoch 1  Loss 8.318039363075513e-06


1080it [25:35,  1.37s/it]

Epoch 1  Loss 7.80232367105782e-05


1100it [26:03,  1.36s/it]

Epoch 1  Loss 8.894873462850228e-05


1120it [26:31,  1.39s/it]

Epoch 1  Loss 7.205451311165234e-06


1140it [26:59,  1.38s/it]

Epoch 1  Loss 6.7286287048773374e-06


1160it [27:26,  1.35s/it]

Epoch 1  Loss 3.315221692901105e-05


1180it [27:54,  1.39s/it]

Epoch 1  Loss 5.9069534472655505e-05


1200it [28:29,  1.37s/it]

Epoch 1  Loss 6.056840720702894e-05


1220it [28:56,  1.36s/it]

Epoch 1  Loss 4.97595829074271e-05


1240it [29:24,  1.39s/it]

Epoch 1  Loss 3.827763794106431e-05


1260it [30:08,  6.06s/it]

Epoch 1  Loss 0.00016697643150109798


1280it [30:39,  1.40s/it]

Epoch 1  Loss 5.655773293256061e-06


1300it [31:25,  5.22s/it]

Epoch 1  Loss 6.778639362892136e-05


1320it [31:53,  1.37s/it]

Epoch 1  Loss 0.00020157432300038636


1323it [31:57,  1.45s/it]
0it [00:00, ?it/s]

Epoch 2  Loss 4.88754449179396e-06


20it [00:28,  1.39s/it]

Epoch 2  Loss 0.00016037265595514327


40it [00:56,  1.39s/it]

Epoch 2  Loss 3.999896580353379e-05


60it [01:24,  1.40s/it]

Epoch 2  Loss 6.879684224259108e-05


80it [01:53,  1.37s/it]

Epoch 2  Loss 3.084736454184167e-05


100it [02:21,  1.38s/it]

Epoch 2  Loss 1.7337870303890668e-05


120it [02:49,  1.39s/it]

Epoch 2  Loss 3.888478750013746e-05


140it [03:17,  1.37s/it]

Epoch 2  Loss 2.4609251340734772e-05


160it [03:45,  1.36s/it]

Epoch 2  Loss 2.9243923563626595e-05


180it [04:13,  1.39s/it]

Epoch 2  Loss 4.931075091008097e-05


200it [04:41,  1.36s/it]

Epoch 2  Loss 9.277691424358636e-05


220it [05:08,  1.35s/it]

Epoch 2  Loss 2.0370975107653067e-05


240it [05:36,  1.37s/it]

Epoch 2  Loss 0.0001217878088937141


260it [06:04,  1.37s/it]

Epoch 2  Loss 3.023868521268014e-05


280it [06:32,  1.39s/it]

Epoch 2  Loss 2.2370419173967093e-05


300it [07:00,  1.38s/it]

Epoch 2  Loss 1.9642153347376734e-05


320it [07:28,  1.38s/it]

Epoch 2  Loss 3.3378430543962168e-06


340it [07:56,  1.38s/it]

Epoch 2  Loss 1.6966818293440156e-05


360it [08:24,  1.37s/it]

Epoch 2  Loss 1.593396154930815e-05


380it [08:52,  1.39s/it]

Epoch 2  Loss 1.455649544368498e-05


400it [09:20,  1.39s/it]

Epoch 2  Loss 3.0994272037787596e-06


420it [09:48,  1.39s/it]

Epoch 2  Loss 0.00013523278175853193


440it [10:16,  1.38s/it]

Epoch 2  Loss 6.729603774147108e-05


460it [10:44,  1.38s/it]

Epoch 2  Loss 9.711771417642012e-05


480it [11:12,  1.38s/it]

Epoch 2  Loss 1.1629351320152637e-05


500it [11:41,  1.39s/it]

Epoch 2  Loss 1.2278231224627234e-05


520it [12:09,  1.38s/it]

Epoch 2  Loss 2.64769605564652e-05


540it [12:37,  1.38s/it]

Epoch 2  Loss 2.7815383418783313e-06


560it [13:05,  1.37s/it]

Epoch 2  Loss 5.4343356168828905e-05


580it [13:33,  1.39s/it]

Epoch 2  Loss 4.808076027984498e-06


600it [14:01,  1.41s/it]

Epoch 2  Loss 4.463527875486761e-05


620it [14:29,  1.39s/it]

Epoch 2  Loss 2.092777549478342e-06


640it [14:57,  1.41s/it]

Epoch 2  Loss 4.664849257096648e-05


660it [15:24,  1.36s/it]

Epoch 2  Loss 2.145727921742946e-05


680it [15:52,  1.38s/it]

Epoch 2  Loss 1.154988058260642e-05


700it [16:20,  1.37s/it]

Epoch 2  Loss 1.0331243174732663e-05


720it [16:48,  1.38s/it]

Epoch 2  Loss 8.529965271009132e-06


740it [17:16,  1.38s/it]

Epoch 2  Loss 1.4066406947677024e-05


760it [17:44,  1.38s/it]

Epoch 2  Loss 8.834630534693133e-06


780it [18:12,  1.36s/it]

Epoch 2  Loss 7.695527528994717e-06


800it [18:40,  1.38s/it]

Epoch 2  Loss 6.649170245509595e-06


820it [19:08,  1.39s/it]

Epoch 2  Loss 2.237115040770732e-05


840it [19:36,  1.36s/it]

Epoch 2  Loss 1.4569989161827834e-06


860it [20:04,  1.36s/it]

Epoch 2  Loss 0.0001030268394970335


880it [20:31,  1.39s/it]

Epoch 2  Loss 3.713897240231745e-05


900it [20:59,  1.35s/it]

Epoch 2  Loss 1.837117270042654e-05


920it [21:27,  1.40s/it]

Epoch 2  Loss 2.4435892555629835e-05


940it [21:55,  1.38s/it]

Epoch 2  Loss 3.6822177662543254e-06


960it [22:23,  1.36s/it]

Epoch 2  Loss 1.6715539459255524e-05


980it [22:51,  1.37s/it]

Epoch 2  Loss 8.504263678332791e-05


1000it [23:19,  1.36s/it]

Epoch 2  Loss 7.45007855584845e-05


1020it [23:47,  1.39s/it]

Epoch 2  Loss 6.265057891141623e-06


1040it [24:15,  1.38s/it]

Epoch 2  Loss 2.529875928303227e-06


1060it [24:43,  1.39s/it]

Epoch 2  Loss 3.283447585999966e-05


1080it [25:11,  1.40s/it]

Epoch 2  Loss 6.17232808508561e-06


1100it [25:39,  1.37s/it]

Epoch 2  Loss 1.6821713870740496e-06


1120it [26:07,  1.37s/it]

Epoch 2  Loss 4.927285317535279e-06


1140it [26:36,  1.42s/it]

Epoch 2  Loss 6.794829459977336e-06


1160it [27:04,  1.35s/it]

Epoch 2  Loss 5.404119292506948e-06


1180it [27:32,  1.37s/it]

Epoch 2  Loss 6.27827103016898e-06


1200it [28:00,  1.38s/it]

Epoch 2  Loss 4.1325652091472875e-06


1220it [28:28,  1.39s/it]

Epoch 2  Loss 6.563895294675604e-05


1240it [28:56,  1.37s/it]

Epoch 2  Loss 2.8543203370645642e-05


1260it [29:24,  1.34s/it]

Epoch 2  Loss 5.1392016757745296e-06


1280it [29:51,  1.39s/it]

Epoch 2  Loss 1.5099806205398636e-06


1300it [30:19,  1.37s/it]

Epoch 2  Loss 4.265021743776742e-06


1320it [30:47,  1.38s/it]

Epoch 2  Loss 1.4066521544009447e-05


1323it [30:52,  1.40s/it]
0it [00:00, ?it/s]

Epoch 3  Loss 5.5348868045257404e-05


20it [00:27,  1.36s/it]

Epoch 3  Loss 3.5232817481301026e-06


40it [00:55,  1.37s/it]

Epoch 3  Loss 1.3139351722202264e-05


60it [01:23,  1.35s/it]

Epoch 3  Loss 1.7219068695339956e-06


80it [01:51,  1.39s/it]

Epoch 3  Loss 4.6899418521206826e-05


83it [01:56,  1.40s/it]


KeyboardInterrupt: 

In [59]:
train_for_eval = []
dev_for_eval = []
for i, sample in tqdm(enumerate(train_data)):
    if i==10575:
        continue
    else:
        train_for_eval.append(process_data(sample))

for i, sample in tqdm(enumerate(dev_data)):
    dev_for_eval.append(process_data(sample))

21157it [00:00, 308200.34it/s]
2572it [00:00, 319551.82it/s]


In [60]:
train_for_eval[0]

"(Oct 7, 2014  12:40 PM CDT) As of Jan. 1, Walmart will no longer offer 30,000 of its employees health insurance. Bloomberg notes that's about 2% of its workforce. The move comes as a reaction to the company's rising health care costs as far more of its employees and their families enrolled in its health care plans than it had expected following the ObamaCare rollout. The AP reports those costs will surge $500 million this fiscal year, $170 million more than had been estimated. Those affected are employees who average fewer than 30 hours of work per week; the Wall Street Journal explains they were grandfathered in when Walmart in 2012 stopped offering insurance to new hires who didn't exceed the 30-hour threshold. A benefits expert says Walmart is actually late to the game in terms of cutting insurance to some part-time workers; Target, the Home Depot, and others have already done so. Meanwhile, Walmart's full time workers will see their premiums rise in 2015. Premiums for the basic pl

In [76]:
# After  training
model.eval()
test_prompt = train_for_eval[0]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=5)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

After Training: (Oct 7, 2014  12:40 PM CDT) As of Jan. 1, Walmart will no longer offer 30,000 of its employees health insurance. Bloomberg notes that's about 2% of its workforce. The move comes as a reaction to the company's rising health care costs as far more of its employees and their families enrolled in its health care plans than it had expected following the ObamaCare rollout. The AP reports those costs will surge $500 million this fiscal year, $170 million more than had been estimated. Those affected are employees who average fewer than 30 hours of work per week; the Wall Street Journal explains they were grandfathered in when Walmart in 2012 stopped offering insurance to new hires who didn't exceed the 30-hour threshold. A benefits expert says Walmart is actually late to the game in terms of cutting insurance to some part-time workers; Target, the Home Depot, and others have already done so. Meanwhile, Walmart's full time workers will see their premiums rise in 2015. Premiums f

In [64]:
# After  training
model.eval()
test_prompt = train_for_eval[1]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=125)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"Original: {train_processed[1]} \n\n")
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

Original: (Oct 29, 2013  8:15 AM CDT) Dax Shepard and Kristen Bell got married at the Beverly Hills courthouse, in a ceremony about as different from Kim Kardashian's last wedding extravaganza as it is possible to be. As Shepard revealed last night on Jimmy Kimmel Live, the whole thing—including the fuel it took to get to the courthouse—cost $142.  It was just Kristen and I at this lonely courthouse,  he said, so friends showed up afterward with a cake reading, in icing,  The World's Worst Wedding.   How many people can say they threw the world's worst wedding?  Shepard asked. Dax Shepard: Wedding to Kristen Bell Cost $ 142  


After Training: (Oct 29, 2013  8:15 AM CDT) Dax Shepard and Kristen Bell got married at the Beverly Hills courthouse, in a ceremony about as different from Kim Kardashian's last wedding extravaganza as it is possible to be. As Shepard revealed last night on Jimmy Kimmel Live, the whole thing—including the fuel it took to get to the courthouse—cost $142.  It was 

In [65]:
# After  training
model.eval()
test_prompt = train_for_eval[2]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=125)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"Original: {train_processed[2]} \n\n")
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

Original: (Mar 6, 2016  10:50 AM) Nancy Reagan, the helpmate, backstage adviser, and fierce protector of Ronald Reagan in his journey from actor to president—and finally during his 10-year battle with Alzheimer's disease—died Sunday at the age of 94, reports the AP, via CBS News. The cause was congestive heart failure, notes ABC News. In addition to her famous campaign against drugs, the one-time actress promoted several causes while she was in the White House and even in the years after. She was a passionate advocate for lifting restrictions on stem cell research and promoting better treatment of America's veterans. But above all, Nancy Reagan was a fiercely devoted wife.  My life began with Ronnie,  she told Vanity Fair magazine in 1998. The first lady's public life had its share of controversy but also earned the respect of the nation, making Nancy Reagan one of America's most admired women in the 1980s and beyond. Anne Frances  Nancy  Robbins was born on July 6, 1921 in New York Ci

In [74]:
# After  training
model.eval()
test_prompt = train_for_eval[3]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=3)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"Original: {train_processed[3]} \n\n")
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

Original: (Aug 15, 2008  5:11 AM CDT) American Airlines faces FAA fines of more than $7 million for a series of safety and maintenance violations and for deficiencies in its drug and alcohol testing, the Wall Street Journal reports. In proposing one of its biggest fines ever, the FAA accuses American of knowingly flying planes that needed safety repairs, including one MD-80 that flew several times in 2007 with a faulty autopilot. American claims the violations were largely technical and plans to contest the  excessive  penalty.  We do not agree with the FAA's findings and characterizations of American's action in these cases,  said a spokesman. It's the latest of example of the FAA's growing aggressiveness on maintenance supervision, the Journal notes. American Airlines Faces $ 7 M Fine for Safety Violations 


After Training: (Aug 15, 2008  5:11 AM CDT) American Airlines faces FAA fines of more than $7 million for a series of safety and maintenance violations and for deficiencies in i

In [98]:
# After  training
test_prompt = dev_for_eval[3]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=3)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"Original: {dev_processed[3]} \n\n")
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

Original: (Dec 24, 2014  11:19 AM) Turns out you won't even have to leave your house if you want to watch controversial Sony flick The Interview. The company has announced that, as of 1pm Eastern today, the Seth Rogen and James Franco movie about a plot to assassinate Kim Jong Un is available on YouTube, Google Play, Xbox, and possibly Sony's site SeetheInterview.com. It'll cost you $14.99 to buy or $5.99 to rent, Business Insider reports. The movie was initially pulled from release after the Sony hack and ensuing threats, but it will also be released in some theaters tomorrow after all. You Can Watch The Interview at  1 pm 


After Training: (Dec 24, 2014  11:19 AM) Turns out you won't even have to leave your house if you want to watch controversial Sony flick The Interview. The company has announced that, as of 1pm Eastern today, the Seth Rogen and James Franco movie about a plot to assassinate Kim Jong Un is available on YouTube, Google Play, Xbox, and possibly Sony's site SeetheInt

In [86]:
def _filter(output, end_token='<extra_id_0>'):
    # The first token is <unk> (inidex at 0) and the second token is <extra_id_0> (indexed at 32099)
    _txt = tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)
    if end_token in _txt:
        _end_token_index = _txt.index(end_token)
        return _result_prefix + _txt[:_end_token_index] + _result_suffix
    else:
        return _result_prefix + _txt + _result_suffix

In [87]:
test_prompt = train_for_eval[3]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
outputs = model.generate(input_ids=encoded.input_ids, 
                          num_beams=200, num_return_sequences=1,
                          max_length=5)
_0_index = test_prompt.index('<extra_id_0>')
_result_prefix = test_prompt[:_0_index]
_result_suffix = test_prompt[_0_index+12:] # 12 is the length of <extra_id_0>
results = list(map(_filter, outputs))
results

["(Aug 15, 2008  5:11 AM CDT) American Airlines faces FAA fines of more than $7 million for a series of safety and maintenance violations and for deficiencies in its drug and alcohol testing, the Wall Street Journal reports. In proposing one of its biggest fines ever, the FAA accuses American of knowingly flying planes that needed safety repairs, including one MD-80 that flew several times in 2007 with a faulty autopilot. American claims the violations were largely technical and plans to contest the  excessive  penalty.  We do not agree with the FAA's findings and characterizations of American's action in these cases,  said a spokesman. It's the latest of example of the FAA's growing aggressiveness on maintenance supervision, the Journal notes. American Airlines Faces $7.5<extra_id_1>7.5M Fine for Safety Violations"]

In [90]:
test_prompt = train_for_eval[0]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
outputs = model.generate(input_ids=encoded.input_ids, 
                          num_beams=200, num_return_sequences=1,
                          max_length=5)
_0_index = test_prompt.index('<extra_id_0>')
_result_prefix = test_prompt[:_0_index]
_result_suffix = test_prompt[_0_index+12:] # 12 is the length of <extra_id_0>
results = list(map(_filter, outputs))
results

["(Oct 7, 2014  12:40 PM CDT) As of Jan. 1, Walmart will no longer offer 30,000 of its employees health insurance. Bloomberg notes that's about 2% of its workforce. The move comes as a reaction to the company's rising health care costs as far more of its employees and their families enrolled in its health care plans than it had expected following the ObamaCare rollout. The AP reports those costs will surge $500 million this fiscal year, $170 million more than had been estimated. Those affected are employees who average fewer than 30 hours of work per week; the Wall Street Journal explains they were grandfathered in when Walmart in 2012 stopped offering insurance to new hires who didn't exceed the 30-hour threshold. A benefits expert says Walmart is actually late to the game in terms of cutting insurance to some part-time workers; Target, the Home Depot, and others have already done so. Meanwhile, Walmart's full time workers will see their premiums rise in 2015. Premiums for the basic p

In [108]:
# After  training
# model.eval()
test_prompt = train_for_eval[4]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=5)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"Original: {train_processed[4]} \n\n")
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

Original: (Apr 18, 2016  1:02 PM CDT) Ingrid Lyne, the Seattle mom allegedly murdered while on a date, left behind three daughters—and a GoFundMe campaign set up to help the girls has raised more than $222,000 so far, Us reports. A friend of the family set up the campaign, and says that all the money raised will go into a trust for the girls, who are ages 12, 10, and 7. Lyne's date was charged with her murder last week. $ 222 K Raised for Kids of Mom Dismembered on Date 


After Training: (Apr 18, 2016  1:02 PM CDT) Ingrid Lyne, the Seattle mom allegedly murdered while on a date, left behind three daughters—and a GoFundMe campaign set up to help the girls has raised more than $222,000 so far, Us reports. A friend of the family set up the campaign, and says that all the money raised will go into a trust for the girls, who are ages 12, 10, and 7. Lyne's date was charged with her murder last week. $<extra_id_0>K Raised for Kids of Mom Dismembered on Date 


Prediction: 2.2.


In [103]:
# After  training
# model.eval()
test_prompt = train_for_eval[5]
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=5)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"Original: {train_processed[5]} \n\n")
print(f"After Training: {test_prompt} \n\n")
print(f"Prediction: {test_answer}")

Original: (Sep 19, 2014  12:58 PM CDT) Ezekiel Emanuel is a healthy 57-year-old in all respects, as his recent hike up Mount Kilimanjaro would suggest. Which is why it might be disconcerting to read his essay in the Atlantic laying out the reasons why he hopes to be dead in 18 years. To Emanuel, 75 is the right age at which to die.  I will have lived a complete life,  he writes. He will have seen his grandkids begin their own lives and will have  made whatever contributions, important or not, I am going to make.  And, with luck, the inevitable mental and physical declines of old age will not have set in yet. Emanuel rejects what he calls the  American immortal —the concept that drives people to obsessively exercise, pop vitamins, do mental puzzles, etc., in the hope of cheating death. Yes, death is a loss, he writes.  But here is a simple truth that many of us seem to resist: living too long is also a loss. It renders many of us, if not disabled, then faltering and declining, a state t

In [101]:
pattern = re.compile(r'\d{1,3}(?:,\d{3})+|\d+[\/\.]{0,1}\d+|\d+')

In [122]:
pred_gt = []
num_gt = []
print(f"len of dev_processed = {len(dev_processed)}")
for i in tqdm(range(len(dev_processed))):
    test_prompt = dev_for_eval[i]
    encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
    test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=5)
    test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
    
    generated_num_list = pattern.findall(test_answer)
    num_gt.append(dev_data[i]['ans'])
    pred_gt += generated_num_list
    
    
    
    # print(f"Original: {dev_processed[3]} \n\n")
    # print(f"After Training: {test_prompt} \n\n")
    # print(f"Prediction: {test_answer}")

len of dev_processed = 2572


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2572/2572 [23:27<00:00,  1.83it/s]


In [114]:
torch.save(model, 't5MLM.pth')

In [119]:
from sklearn.metrics import accuracy_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [125]:
len(pred_gt)

2516

In [130]:
accuracy_score(pred_gt[:1000], num_gt[:1000])

0.05

In [15]:
# After  training
model.eval()
test_prompt = "The  <extra_id_0> dog  walks in the <extra_id_2>"
encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=125)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"After Training:'{test_prompt}'-->'{test_answer}'")

After Training:'The  <extra_id_0> dog  walks in the <extra_id_2>'-->'cute park'


## Masked LM for Generative Models  

Let's try masked  LM on Decoder only model

GPT-2 is a causal language model, designed to predict the next token in a sequence given the previous tokens. It's usually trained using a standard language modeling objective, without the use of denoising.

As you can see from output, this sort of training is not effective for Generative models

In [None]:
  from transformers import AutoModelForCausalLM,AutoTokenizer
  model = AutoModelForCausalLM.from_pretrained("gpt2")
  tokenizer = AutoTokenizer.from_pretrained("gpt2")
  optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
  prompt = "The cute dog walks in the green park"
  labels, input_ids = get_denoised(FlaxDataCollatorForT5MLM, tokenizer, prompt)
  print(f"denoised input_ids decoded = {tokenizer.decode(*input_ids,skip_special_tokens=False)}")
  print(f"denoised labels decoded   = {tokenizer.decode(*labels,skip_special_tokens=False)}")
  print(f"input_ids.shape {input_ids.shape} labels.shape {labels.shape}") # this should be equal for CausalLM models
  denoised_input_ids = torch.from_numpy(input_ids)
  denoised_labels = torch.from_numpy(labels)
  denoised_attention_mask = torch.ones(input_ids.shape)

  model.train()
  for epoch in range(100):
      outputs = model(input_ids=denoised_input_ids,attention_mask=denoised_attention_mask,
                      labels=denoised_labels)
      loss = outputs.loss
      if epoch % 20 == 0:
          print(f"Epoch {epoch}  Loss {loss}")
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
  print(f"Epoch {epoch}  Loss {loss}")

denoised input_ids decoded = The<|endoftext|> dog gazed the green informants<|endoftext|>
denoised labels decoded   = <|endoftext|> cute gazed walks in informants park<|endoftext|>
input_ids.shape (1, 8) labels.shape (1, 8)
Epoch 0  Loss 11.180167198181152
Epoch 20  Loss 0.0005788219859823585
Epoch 40  Loss 0.0019290262134745717
Epoch 60  Loss 0.000539803528226912
Epoch 80  Loss 5.3726726036984473e-05
Epoch 99  Loss 0.8452759385108948


In [None]:
  # After  training
  model.eval()
  test_prompt = "The  <extra_id_0> dog  walks in the <extra_id_2>"
  encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
  test_output = model.generate(input_ids = encoded.input_ids,num_return_sequences=1,max_length=125)
  test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
  print(f"After Training:'{test_prompt}'-->'{test_answer}'")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


After Training:'The  <extra_id_0> dog  walks in the <extra_id_2>'-->'The  <extra_id_0> dog  walks in the <extra_id_2> gazed in."'


## Causal LM training - teacher forced for Sequence to Sequence Models

We are using Teacher forcing for Language Modeling; Basically predicting the next word, from the previous workds.

To force the labels to show the correct ground truth, we shift the label which is the ground truth to the right as shown below.

This type of training can be used best for CausalLM


In [None]:
print_token_id(tokenizer,"<\s>" )
print_special_tokens(tokenizer)

3
{'eos_token': 1, 'unk_token': 2, 'pad_token': 0, 'additional_special_tokens': [32099, 32098, 32097, 32096, 32095, 32094, 32093, 32092, 32091, 32090, 32089, 32088, 32087, 32086, 32085, 32084, 32083, 32082, 32081, 32080, 32079, 32078, 32077, 32076, 32075, 32074, 32073, 32072, 32071, 32070, 32069, 32068, 32067, 32066, 32065, 32064, 32063, 32062, 32061, 32060, 32059, 32058, 32057, 32056, 32055, 32054, 32053, 32052, 32051, 32050, 32049, 32048, 32047, 32046, 32045, 32044, 32043, 32042, 32041, 32040, 32039, 32038, 32037, 32036, 32035, 32034, 32033, 32032, 32031, 32030, 32029, 32028, 32027, 32026, 32025, 32024, 32023, 32022, 32021, 32020, 32019, 32018, 32017, 32016, 32015, 32014, 32013, 32012, 32011, 32010, 32009, 32008, 32007, 32006, 32005, 32004, 32003, 32002, 32001, 32000]}


In [None]:
model_name ="t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
test_prompt = "The cute dog walks in the green park"
encoded = tokenizer(test_prompt, truncation=False, padding=True, return_tensors="pt")
label_input_ids = shift_tokens_right(encoded.input_ids,model.config.pad_token_id,model.config.eos_token_id)

print(f"teacher_forced input_ids    = {(encoded.input_ids.squeeze())}")
print(f"teacher_forced input_ids decoded = {tokenizer.decode(encoded.input_ids.squeeze(),skip_special_tokens=False)}")# .squeeze() as it takes a batch
print(f"teacher_forced labels    = {(label_input_ids.squeeze())}")
print(f"teacher_forced labels decoded   = {tokenizer.decode(label_input_ids.squeeze(),skip_special_tokens=False)}")

teacher_forced input_ids    = tensor([   37,  5295,  1782, 10681,    16,     8,  1442,  2447,     1])
teacher_forced input_ids decoded = The cute dog walks in the green park</s>
teacher_forced labels    = tensor([    0,    37,  5295,  1782, 10681,    16,     8,  1442,  2447,     1])
teacher_forced labels decoded   = <pad> The cute dog walks in the green park</s>


In [None]:
model.train()
for epoch in range(200):
    outputs = model(input_ids=encoded.input_ids,attention_mask=encoded.attention_mask,
                    labels=label_input_ids)
    loss = outputs.loss
    if epoch % 40 == 0:
        print(f"Epoch {epoch}  Loss {loss}")
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(f"Epoch {epoch}  Loss {loss}")
  #-------------------------------------------------------------

Epoch 0  Loss 2.868603229522705
Epoch 40  Loss 0.622085690498352
Epoch 80  Loss 0.21471920609474182
Epoch 120  Loss 0.19337992370128632
Epoch 160  Loss 0.2608181834220886
Epoch 199  Loss 0.4793122708797455


In [None]:
  model.eval()
  test_prompt = "The cute dog walks in the"
  encoded = tokenizer(test_prompt, truncation=False, padding=True, return_tensors="pt")
  test_output = model.generate(input_ids = encoded.input_ids,max_new_tokens=125)
  test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
  print(f"After Training:'{test_prompt}'-->'{test_answer}'")

After Training:'The cute dog walks in the'-->'The cute dog walks in the green park'


## Causal LM training -Teacher forced for Generative models

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(model.config.pad_token_id,model.config.eos_token_id)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# GPT does not have the pad token set, so we add a new token
pad_token_id = print_token_id(tokenizer,"[PAD]")
tokenizer.pad_token = pad_token_id
# and update the model with the token
model = AutoModelForCausalLM.from_pretrained("gpt2")
#model.resize_token_embeddings(len(tokenizer))


None 50256
[PAD] 50257


In [None]:
test_prompt = "The cute dog walks in the green park"

encoded = tokenizer(test_prompt, truncation=False, padding=True, return_tensors="pt")
#from torch.nn.functional import pad
# Pad the input_ids tensor on the right (at the end)
#encoded.input_ids = pad(encoded.input_ids, (0, 1), value=tokenizer.pad_token_id)
labels = shift_tokens_right(encoded.input_ids,pad_token_id,tokenizer.eos_token_id)

print(f"teacher_forced input_ids decoded = {tokenizer.decode(encoded.input_ids.squeeze(),skip_special_tokens=False)}")# .squeeze() as it takes a batch
print(f"teacher_forced labels decoded   = {tokenizer.decode(labels.squeeze(),skip_special_tokens=False)}")

# we need the sizes to match

# add a pad token to input ids to match the size
new_token = torch.tensor([tokenizer.eos_token_id])
new_token = new_token.view(1, -1)
# Append the new token
encoded.input_ids = torch.cat((encoded.input_ids, new_token),dim=1)
print(f"teacher_forced input_ids decoded = {tokenizer.decode(encoded.input_ids.squeeze(),skip_special_tokens=False)}")#

assert encoded.input_ids.size() == labels.size()

attention_mask = torch.ones(encoded.input_ids.shape)
print(attention_mask.shape)

# Note that training with the above the loss do not decrease

teacher_forced input_ids decoded = The cute dog walks in the green park
teacher_forced labels decoded   = [PAD]The cute dog walks in the green<|endoftext|>
teacher_forced input_ids decoded = The cute dog walks in the green park<|endoftext|>
torch.Size([1, 9])


In [None]:
model.train()
for epoch in range(10):
    outputs = model(input_ids=encoded.input_ids,attention_mask=attention_mask,
                    labels=labels)
    loss = outputs.loss
    if epoch % 20 == 0:
        print(f"Epoch {epoch}  Loss {loss}")
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(f"Epoch {epoch}  Loss {loss}")


Epoch 0  Loss 7.023041248321533
Epoch 9  Loss 8.018503189086914


In [None]:
# Genertaive models need the input shape and target shape to be exact; I guess the shifting cell to right is automatically happening here
# yes it is - https://discuss.huggingface.co/t/shifting-ids-to-the-right-when-training-gpt-2-on-text-generation/5308/2
# no need for things in above cell

# so lets try without shifting right; in the assumption that it is already taken care by the model
 #Note there is no padding for this tokenizeer-see above on how to set pad token for the tokenizer and model

from transformers import AutoModelForCausalLM,AutoTokenizer,GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2") #GPT2LMHeadModel
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)


In [None]:
test_prompt = "The cute dog walks in the green park"
#test_prompt = ["The cute", "cute dog", "dog walks", "walks in" ,"in the", "the green", "green park"]
encoded = tokenizer(test_prompt, truncation=False, padding='longest', return_tensors="pt") #
label_input_ids = encoded.input_ids

model.train()
for epoch in range(50):
    outputs = model(input_ids=encoded.input_ids,attention_mask=encoded.attention_mask,
                    labels=label_input_ids)
    loss = outputs.loss
    if epoch % 20 == 0:
        print(f"Epoch {epoch}  Loss {loss}")
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(f"Epoch {epoch}  Loss {loss}")

Epoch 0  Loss 5.569321155548096
Epoch 20  Loss 1.2724746465682983
Epoch 40  Loss 0.00023454656184185296
Epoch 49  Loss 8.242394869739655e-06


In [None]:
model.eval()
test_prompt = "The"
encoded = tokenizer(test_prompt, truncation=False, padding=True, return_tensors="pt")
test_output = model.generate(encoded.input_ids,max_new_tokens=25)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"After Training:'{test_prompt}'-->'{test_answer}'")
test_prompt = "Where does the cute dog walk"
encoded = tokenizer(test_prompt, truncation=False, padding=True, return_tensors="pt")
test_output = model.generate(encoded.input_ids,max_new_tokens=25)
test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
print(f"After Training:'{test_prompt}'-->'{test_answer}'")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


After Training:'The'-->'The cute dog walks in the green park park park park park green park park park in the green park park park park in the green'
After Training:'Where does the cute dog walk'-->'Where does the cute dog walk in the green park park park park green park park park park in the green park park park park park in the green park park'


##  Teacher Forcing for Tasks

The target here is not the input ids, but the labels.

The label  is the ground truth (actual next word/sequence).

 During the forward pass, the model makes a prediction, and the difference
  between the prediction and this ground truth is calculated to compute the loss.

However here the training can be used for any arbitary tasks, like translation or QA etc

In [None]:
model_name ="t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
tokenizer = T5Tokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
  test_prompt = "Q:Where does the cute dog walk"
  label_prompt = "A:In the green park"
  encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
  label = tokenizer(label_prompt, truncation=False, padding=False, return_tensors="pt")
  model.train()
  for epoch in range(50):
      outputs = model(input_ids=encoded.input_ids,attention_mask=encoded.attention_mask,
                      labels=label.input_ids)
      loss = outputs.loss
      if epoch % 20 == 0:
          print(f"Epoch {epoch}  Loss {loss}")
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
  print(f"Epoch {epoch}  Loss {loss}")



Epoch 0  Loss 4.423830986022949
Epoch 20  Loss 1.126334309577942
Epoch 40  Loss 0.24285714328289032
Epoch 49  Loss 0.27664294838905334


In [None]:
  #-------------------------------------------------------------
  # After  training
  model.eval()
  test_prompt = "Q:Where does the cute dog walk"
  encoded = tokenizer(test_prompt, truncation=False, padding=False, return_tensors="pt")
  test_output = model.generate(input_ids = encoded.input_ids,max_length=125)
  test_answer = tokenizer.decode(test_output[0], skip_special_tokens=True)
  print(f"After Training:'{test_prompt}'-->'{test_answer}'")

After Training:'Q:Where does the cute dog walk'-->'A:In the green park'
