# DATA PREPARATION

In [1]:
# preparing the dataset for pre-training
# openwebtxt data

In [2]:
!pip install datasets
!pip install tiktoken
!pip install tqdm
!pip install numpy

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-a

In [3]:
! nvidia-smi

Wed May  1 18:17:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
# CNN DAILY MAIL DATASET

import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

num_proc = 8
# 28k

num_proc_load_dataset = num_proc

PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep1|>'
SEP_3_TOKEN = '<|sep1|>'
SEP_4_TOKEN = '<|sep1|>'

enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
     special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258,
        SEP_3_TOKEN : 50259,
        SEP_4_TOKEN : 50260,
    }
)

if __name__ == '__main__':
    dataset = load_dataset("cnn_dailymail", '3.0.0', num_proc=num_proc_load_dataset)

    def process(example):
        article_ids = enc.encode_ordinary(example['article']) # encode_ordinary ignores any special tokens
        article_ids.append(50257)

        highlights_ids = enc.encode_ordinary(example['highlights'])
        highlights_ids.append(enc.eot_token)

        data = article_ids + highlights_ids
        if len(data) > 1024:
            text = [0]
        else:
            # Pad sequences to length 1024
            text = [enc.eot_token]*1024
            text[:len(data)] = data

        out = {'data': text, 'data_len': len(data), 'article_lens': [len(article_ids)]}

        return out

    # tokenize the dataset
    tokenized = dataset.map(
        process,
        remove_columns=['article','highlights','id'],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )

    # Drop examples with input sequence lengths < 1024
    tokenized['train'] = tokenized['train'].filter(lambda data: len(data['data']) == 1024)
    tokenized['validation'] = tokenized['validation'].filter(lambda data: len(data['data']) == 1024)
    tokenized['test'] = tokenized['test'].filter(lambda data: len(data['data']) == 1024)
    data_dir = '/content/cnn'
    # concatenate all the ids in each dataset into one large file we can use for training
    for split, dset in tokenized.items():
        filename = os.path.join(os.path.dirname(data_dir), f'cnn_{split}')
        np.save(filename, np.array(dset['data']))

        filename = os.path.join(os.path.dirname(data_dir), f'cnn_{split}_lens')
        np.save(filename, np.array(dset['article_lens']))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Setting num_proc from 8 to 3 for the train split as it only contains 3 shards.


Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/287113 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/13368 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/11490 [00:00<?, ? examples/s]

Filter:   0%|          | 0/287113 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13368 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [5]:
# STANFORD QUESTION AND ANSWERING

import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

#87599
num_proc = 8

num_proc_load_dataset = num_proc

PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep1|>'
SEP_3_TOKEN = '<|sep1|>'
SEP_4_TOKEN = '<|sep1|>'

enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258,
        SEP_3_TOKEN : 50259,
        SEP_4_TOKEN : 50260,

    }
)

if __name__ == '__main__':
    dataset = load_dataset("squad", num_proc=num_proc_load_dataset)

    def process(example):
        context_ids = enc.encode_ordinary(example['context']) # encode_ordinary ignores any special tokens
        context_ids.append(50258)

        question_ids = enc.encode_ordinary(example['question'])
        question_ids.append(50258)

        answers_ids = enc.encode_ordinary(example['answers']['text'][0])
        answers_ids.append(enc.eot_token)

        data = context_ids + question_ids + answers_ids
        if len(data) > 1024:
            text = [0]
        else:
            # Pad input sequences to length of 1024
            text = [enc.eot_token]*1024
            text[:len(data)] = data

        out = {'data': text, 'data_len': len(data), 'context_lens': [len(context_ids)+ len(question_ids)]}

        return out

    # tokenize the dataset
    tokenized = dataset.map(
        process,
        remove_columns=['id', 'title', 'context', 'question', 'answers'],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )

    # Drop examples with input lengths greater than 1024
    tokenized['train'] = tokenized['train'].filter(lambda data: len(data['data']) == 1024)
    tokenized['validation'] = tokenized['validation'].filter(lambda data: len(data['data']) == 1024)

    # concatenate all the ids in each dataset into one large file we can use for training
    data_dir = 'fine_tuned_data'
    for split, dset in tokenized.items():
        filename = os.path.join(os.path.dirname(data_dir), f'squad_{split}')
        print(np.array(dset['data']).shape)
        np.save(filename, np.array(dset['data']))

        filename = os.path.join(os.path.dirname(data_dir), f'squad_{split}_lens')
        np.save(filename, np.array(dset['context_lens']))

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Setting num_proc from 8 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/87599 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/10570 [00:00<?, ? examples/s]

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

(87598, 1024)
(10570, 1024)


In [6]:
# SENTIMENT ANALYSIS

from datasets import DatasetDict
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

num_proc = 8
#4k

num_proc_load_dataset = num_proc

PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep1|>'
SEP_3_TOKEN = '<|sep1|>'
SEP_4_TOKEN = '<|sep1|>'

enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258,
        SEP_3_TOKEN : 50259,
        SEP_4_TOKEN : 50260,
    }
)

if __name__ == '__main__':
    dataset = load_dataset("financial_phrasebank", "sentences_66agree")

    def process(example):
        sentence_ids = enc.encode_ordinary(example['sentence']) # encode_ordinary ignores any special tokens
        sentence_ids.append(50259)

        # label_ids = enc.encode_ordinary(example['label'])
        label_ids = enc.encode_ordinary(str(example['label']))
        label_ids.append(enc.eot_token)

        data = sentence_ids + label_ids
        if len(data) > 1024:
            text = [0]
        else:
            # Pad sequences to length 1024
            text = [enc.eot_token]*1024
            text[:len(data)] = data

        out = {'data': text, 'data_len': len(data), 'sentence_lens': [len(sentence_ids)]}

        return out

    # tokenize the dataset
    tokenized = dataset.map(
        process,
        remove_columns=['sentence','label'],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )

    # Drop examples with input sequence lengths < 1024
    full_length_data = tokenized['train'].filter(lambda data: len(data['data']) == 1024)
    # Now, split the filtered dataset to create a test dataset from 5% of it
    train_test_split = full_length_data.train_test_split(test_size=0.05)

    # Update the 'tokenized' DatasetDict with the new train and test datasets
    tokenized = DatasetDict({
        'train': train_test_split['train'],
        'test': train_test_split['test']
    })

    # concatenate all the ids in each dataset into one large file we can use for training
    data_dir = 'fine_tuned_data'
    for split, dset in tokenized.items():
        filename = os.path.join(os.path.dirname(data_dir), f'sentiment_analysis_{split}')
        np.save(filename, np.array(dset['data']))

        filename = os.path.join(os.path.dirname(data_dir), f'sentiment_analysis_{split}_lens')
        np.save(filename, np.array(dset['sentence_lens']))

Downloading data:   0%|          | 0.00/339k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4217 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/4217 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4217 [00:00<?, ? examples/s]

In [80]:
# NER
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset

num_proc = 8
# 14k

num_proc_load_dataset = num_proc

PAD_TOKEN = '<|pad|>'
SEP_1_TOKEN = '<|sep1|>'
SEP_2_TOKEN = '<|sep1|>'
SEP_3_TOKEN = '<|sep1|>'
SEP_4_TOKEN = '<|sep1|>'

enc = tiktoken.get_encoding("gpt2")
enc = tiktoken.Encoding(
    name="gpt2_with_sp_tokens",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens={
        **enc._special_tokens,
        SEP_1_TOKEN : 50257,
        SEP_2_TOKEN : 50258,
        SEP_3_TOKEN : 50259,
        SEP_4_TOKEN : 50260,

    }
)

if __name__ == '__main__':
    dataset = load_dataset('conll2003', num_proc=num_proc_load_dataset)

    def process(example):
      tokens_str = ", ".join(example['tokens'])
      token_ids = enc.encode_ordinary(tokens_str)
      token_ids.append(50260)

      ner_tags_str =  ", ".join(map(str, example['ner_tags']))
      ner_tags_ids = enc.encode_ordinary(ner_tags_str)
      ner_tags_ids.append(enc.eot_token)
      data = token_ids + ner_tags_ids
      if len(data) > 1024:
          text = [0]
      else:
          text = [enc.eot_token]*1024
          text[:len(data)] = data

      out = {'data': text, 'data_len': len(data), 'tokens_lens': [len(token_ids)]}

      return out

    # tokenize the dataset
    tokenized = dataset.map(
    process,
    remove_columns=['id','tokens','pos_tags','chunk_tags','ner_tags'],
    desc="tokenizing the splits",
    num_proc=num_proc)

    # Drop examples with input sequence lengths < 1024
    tokenized['train'] = tokenized['train'].filter(lambda data: len(data['data']) == 1024)
    tokenized['validation'] = tokenized['validation'].filter(lambda data: len(data['data']) == 1024)
    tokenized['test'] = tokenized['test'].filter(lambda data: len(data['data']) == 1024)

    # concatenate all the ids in each dataset into one large file we can use for training
    directory_path = 'fine_tuned_dataset'
    for split, dset in tokenized.items():
        filename = os.path.join(directory_path, f'conll_{split}')
        print(np.array(dset['data']).shape)
        np.save(filename, np.array(dset['data']))


        filename = os.path.join(directory_path, f'conll_{split}_lens')
        np.save(filename, np.array(dset['tokens_lens']))

(14041, 1024)
(3250, 1024)
(3453, 1024)


In [None]:
# enc = tiktoken.get_encoding("gpt2")
# enc = tiktoken.Encoding(
#     name="gpt2_with_sp_tokens",
#     pat_str=enc._pat_str,
#     mergeable_ranks=enc._mergeable_ranks,
#     special_tokens={
#         **enc._special_tokens,
#         SEP_1_TOKEN : 50257,
#         # SEP_2_TOKEN : 50258,
#         # SEP_3_TOKEN : 50259,
#         # SEP_4_TOKEN : 50260,

#     }
# )

In [None]:
# array = ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."]
# text = "["+", ".join(array) + "]"
# tokenized_texts = enc.encode_ordinary(text)
# print(tokenized_texts)

In [None]:
# array= [3, 0, 7, 0, 0, 0, 7, 0, 0]
# tokenized_texts = [enc.encode_ordinary(str(text)) for text in array]
# print(tokenized_texts )

In [9]:
!pip install datasets
!pip install tiktoken
!pip install tqdm
!pip install numpy



PRE-TRAINING THE MODEL


In [81]:
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class SelfAttention(nn.Module):

    def __init__(self, n_embd, n_head, block_size, dropout, bias):
        super().__init__()

        # key, query, value projections
        self.c_attn_q = nn.Linear(n_embd, n_embd, bias=bias)
        self.c_attn_v = nn.Linear(n_embd, n_embd, bias=bias)
        self.c_attn_k = nn.Linear(n_embd, n_embd, bias=bias)

        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)

        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout

        self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
                                        .view(1, 1, block_size, block_size))

    def forward(self, x):
        B, T, C = x.shape

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn_q(x), self.c_attn_k(x), self.c_attn_v(x)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, n_embd, bias, dropout):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd, bias=bias),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd, bias=bias),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.mlp(x)

class Block(nn.Module):

    def __init__(self, block_size, vocab_size, n_layer, n_head, n_embd, dropout, bias):
        super().__init__()
        self.ln_1 = LayerNorm(n_embd, bias=bias)
        self.attn = SelfAttention(n_embd, n_head, block_size, dropout, bias)
        self.ln_2 = LayerNorm(n_embd, bias=bias)
        self.mlp = MLP(n_embd, bias, dropout)

    def forward(self, x):
        residual = x
        x = self.attn(self.ln_1(x))
        x = x + residual

        residual = x
        x = self.mlp(self.ln_2(x))
        x = x + residual

        return x

class GPT(nn.Module):

    def __init__(self, block_size=1024, vocab_size=50304, n_layer=12, n_head=12, n_embd=768,
                dropout=0, bias=True):

        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(vocab_size, n_embd),
            wpe = nn.Embedding(block_size, n_embd),
            drop = nn.Dropout(dropout),
            h = nn.ModuleList([Block(block_size, vocab_size, n_layer, n_head, n_embd, dropout, bias) for _ in range(n_layer)]),
            ln_f = LayerNorm(n_embd, bias=bias),
        ))
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Weight Tying
        self.transformer.wte.weight = self.lm_head.weight

        # weight initialisation
        self.apply(self._init_weights)

        # Scaled initialisation from GPT paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * n_layer))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None, return_all_logits=False):
        device = idx.device
        b, t = idx.shape
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        out = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            out = block(out)
        out = self.transformer.ln_f(out)

        if targets is not None:
            # Return cross entropy loss during training.
            logits = self.lm_head(out)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            if return_all_logits:
                # For fine-tuning tasks return all logits and compute custom loss.
                logits = self.lm_head(out)
            else:
                # Return just the last timestep during inference
                logits = self.lm_head(out[:, [-1], :])
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}

        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        kwargs = dict(fused=True)
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **kwargs)

        return optimizer

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [82]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [83]:
!pip install wandb --quiet

In [84]:
import os
import time
import math
import pickle

import numpy as np
import torch

In [85]:
config = {
    # General training flags
    'eval_interval' : 50,
    'log_interval' : 1,
    'eval_iters' : 5,
    'out_dir' : 'result',
    'eval_only' : False,
    'always_save_checkpoint' : True,
    'init_from' : 'scratch',

    # Wand flags
    'wandb_log' : True,
    'wandb_project' : 'hw5',
    'wandb_run_name' : 'reduce loss more',

    # Model gflags
    'gradient_accumulation_steps' : 5,
    'batch_size' : 6,
    'block_size' : 1024,
    'n_layer' : 8,
    'n_head' : 12,
    'n_embd' : 768,
    'dropout' : 0.0,
    'bias' : False,

    'learning_rate' : 6e-4,
    'max_iters' : 500000,
    'weight_decay' : 1e-1,
    'beta1' : 0.9,
    'beta2' : 0.95,
    'grad_clip' : 1.0,

    # Learning rate scheduler gflags
    'decay_lr' : True,
    'warmup_iters': 0,
    'lr_decay_iters': 50000,
    'min_lr' : 1e-5,
}


In [86]:
# config = ModelConfig()
import gc
gc.collect() # These commands help you when you face CUDA OOM error
torch.cuda.empty_cache()

In [87]:
def save_checkpoint(model, optimizer, iter_num, filename='checkpoint.pth'):
    torch.save({
        'iter_num': iter_num,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, filename)
    print(f"Checkpoint saved at iteration {iter_num}")

FINE TUNING


In [98]:
!pip install pytorch-ignite
!pip install wandb --quiet



In [99]:
CHK_PT_PATH = '/content/checkpoint_iter_30000.pth'
OUTPUT_DIR = '/content/finetune-complete/'
EPOCHS = 2
SUMMARY_ROOT = '/content/fine_tuned_dataset/cnn'
SQUAD_ROOT = '/content/fine_tuned_dataset/squad'
SENTIMENT_ROOT = '/content/fine_tuned_dataset/sentiment'
NER_ROOT = '/content/fine_tuned_dataset/ner'

BATCH_SIZE = 1
IGNORE_INDEX = -1

fine_tune_config = {
# adamw optimizer
"learning_rate": 6e-4, # max learning rate
"max_iters": 600000, # total number of training iterations
"weight_decay": 1e-1,
"beta1" : 0.9,
"beta2" : 0.95,
"grad_clip" : 1.0, # clip gradients at this value, or disable if == 0.0
# learning rate decay settings
"decay_lr" : True, # whether to decay the learning rate
"warmup_iters" :  0, # how many steps to warm up for
"lr_decay_iters" : 600000, # should be ~= max_iters per Chinchilla
"min_lr" : 6e-5, # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
"gradient_accumulation_steps" : 5,  # used to simulate larger batch sizes
"wandb_project" : 'HW5',
"wandb_run_name" : 'finetune-complete-1',
"scaler_enabled" :  True,
"dropout" : 0.0 ,
}

In [90]:
# !tar -cvf "/content/fine_tuned_dataset.tar" "/content/fine_tuned_dataset"

In [100]:
import numpy as np
import tiktoken
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from torchtext.data.metrics import bleu_score
from ignite.metrics import Rouge
from transformers import AdamW
from datetime import datetime
import os
import random
import gc
import wandb
import warnings
import math

torch.cuda.empty_cache()
gc.collect()

# Ignore all warnings
warnings.filterwarnings("ignore")

## Dataset Class
Summarise_Index = 0
QA_Index = 0
Entity_Index = 0
Sentiment_Index = 0

class MergedDataset(Dataset):
    def __init__(self, summary_root, squad_root, ner_root, sentiment_root, file, length=None):
        # Merge the datasets for the summarizer and QA tasks
        self.summarise_data = np.load(os.path.join(summary_root, file+'.npy'), mmap_mode='r')[:length]
        self.summarise_lens = np.load(os.path.join(summary_root, file+'_lens.npy'), mmap_mode='r')[:length]

        self.qa_data = np.load(os.path.join(squad_root, file+'.npy'), mmap_mode='r')[:length]
        self.qa_lens = np.load(os.path.join(squad_root, file+'_lens.npy'), mmap_mode='r')[:length]

        self.ner_data = np.load(os.path.join(ner_root, file+'.npy'), mmap_mode='r')[:length]
        self.ner_lens = np.load(os.path.join(ner_root, file+'_lens.npy'), mmap_mode='r')[:length]

        self.sentiment_data = np.load(os.path.join(sentiment_root, file+'.npy'), mmap_mode='r')[:length]
        self.sentiment_lens = np.load(os.path.join(sentiment_root, file+'_lens.npy'), mmap_mode='r')[:length]

        print(self.summarise_lens.shape)
        print(self.qa_lens.shape)
        print(self.ner_lens.shape)
        print(self.sentiment_lens.shape)

        self.data = np.concatenate([self.summarise_data, self.qa_data , self.ner_data , self.sentiment_data])

        self.data_lens = np.concatenate([self.summarise_lens, self.qa_lens, self.ner_lens, self.sentiment_lens])
        self.length = self.data.shape[0]

        Summarise_Index = len(self.summarise_data)
        QA_Index = len(self.summarise_data)+len(self.qa_data)
        Entity_Index = len(self.summarise_data)+len(self.qa_data) + len(self.ner_data)
        Sentiment_Index  = len(self.summarise_data)+len(self.qa_data) + len(self.ner_data) + len(self.sentiment_data)

        print("printing the lengths of all indices")
        print(Summarise_Index)
        print(QA_Index)
        print(Entity_Index)
        print(Sentiment_Index)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        d = self.data[idx]
        l = self.data_lens[idx]

        # summarise
        if idx<len(self.summarise_data):
           dataTag = 0
        # squad
        elif ((idx>=len(self.summarise_data)) and (idx<=(len(self.summarise_data)+len(self.qa_data)))):
          dataTag = 1
        # NER Dataa
        elif ((idx>=(len(self.summarise_data)+len(self.qa_data))) and (idx<=(len(self.summarise_data)+len(self.qa_data)+len(self.ner_data)))):
          dataTag = 2
        # sentiment analysis
        elif ((idx>=(len(self.summarise_data)+len(self.qa_data)+len(self.ner_data))) and (idx<=(len(self.summarise_data)+len(self.qa_data)+len(self.ner_data) + len(self.sentiment_data)))):
          dataTag = 3

        return d, l, dataTag


train_dataset = MergedDataset(SUMMARY_ROOT, SQUAD_ROOT, NER_ROOT , SENTIMENT_ROOT, 'train', length=75000)
val_dataset = MergedDataset(SUMMARY_ROOT, SQUAD_ROOT, NER_ROOT , SENTIMENT_ROOT, 'validation', length=300)

## Intiialize dataloader
train_dataloader =  DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4)

val_dataloader =  DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4)


## Load pretrained model ##
checkpoint = torch.load(CHK_PT_PATH, map_location=device)

model = GPT(n_layer = config['n_layer'], n_head=config['n_head'], n_embd=config['n_embd'], block_size=config['block_size'], bias=config['bias'], dropout= config['dropout'])
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

model.load_state_dict(checkpoint['model_state_dict'])
iter_num = checkpoint['iter_num']

# Setup training functions
loss_fct = CrossEntropyLoss(ignore_index=IGNORE_INDEX)
val_loss_fct = CrossEntropyLoss(ignore_index=IGNORE_INDEX)
optimizer = torch.optim.AdamW(model.parameters(), lr=fine_tune_config['learning_rate'], betas=(fine_tune_config['beta1'], fine_tune_config['beta2']))
scaler = torch.cuda.amp.GradScaler(enabled=fine_tune_config['scaler_enabled'])


(75000, 1)
(75000, 1)
(14041, 1)
(4006, 1)
printing the lengths of all indices
75000
150000
164041
168047
(300, 1)
(300, 1)
(300, 1)
(211, 1)
printing the lengths of all indices
300
600
900
1111
96.056064 M parameters


In [101]:
## Init wandb
wandb.login(key="176799c69976f0bd0ea0f167e585f7f3f2fbcd9f")
wandb_config = {
    'BATCH_SIZE': BATCH_SIZE,
    'learning_rate': fine_tune_config['learning_rate'],
    'gradient_accumulation_steps': fine_tune_config['gradient_accumulation_steps']
}
wandb_run_name = 'FineTunev1'
wandb.init(
    project=fine_tune_config['wandb_project'],
    reinit = True,
    name='FineTunev1',
    config=wandb_config
    )
model = model.to(device)
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < fine_tune_config['warmup_iters']:
        return fine_tune_config['learning_rate'] * it / fine_tune_config['warmup_iters']
    # 2) if it > lr_decay_iters, return min learning rate
    if it > fine_tune_config['lr_decay_iters']:
        return fine_tune_config['min_lr']
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - fine_tune_config['warmup_iters']) / (fine_tune_config['lr_decay_iters'] - fine_tune_config['warmup_iters'])
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return fine_tune_config['min_lr'] + coeff * (fine_tune_config['learning_rate'] - fine_tune_config['min_lr'])



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval_F1_scores,▁▆▆▅▂▁▆▅▆██▅▇▆▅▇▇▇▂███▄
eval_accuracy_scores,▁▁▂▄▁▄▂▄▂▄▂▁█▁▂▁▂▄▂▁▅▁▄
eval_bleu_scores,█▇▆▆▄▄▁▄▂▃▂▁▂▂▃▂▂▂▃▂▂▁▂
eval_rouge_scores,█▅▆▄▂▃▂▂▂▂▁▂▂▁▂▁▁▂▁▁▁▂▁
iter,▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇██
lr,██████▇▇▇▇▆▆▆▅▅▅▄▄▃▃▂▂▁
train/loss,▂▁▁▃▁▁▁▁▁▁█▂▁▂▁▃▁▂▁▁▂▁▁
val/loss,▁▃▃▅▆▆▇▆▇▆▇▇▇▇▇▇▇▇█▇███

0,1
eval_F1_scores,0.21238
eval_accuracy_scores,0.98438
eval_bleu_scores,0.018
eval_rouge_scores,0.00889
iter,460.0
lr,0.0006
train/loss,0.00237
val/loss,0.27516


In [102]:
def generate_sample(index):
    data_sample, art_len_sample, _ = val_dataset[index]
    data_sample = torch.tensor(data_sample[None,:]).to(device)
    idx = art_len_sample.item()

    logits = model(data_sample, return_all_logits=True)[0]
    preds = logits[0, idx:-1, :].argmax(dim=-1).tolist()

    labels = data_sample[0, idx+1:].tolist()
    print("The index coming here is")
    print(index)
    if index == 0: # index or tag?
        print("Pred Summary:\n %s \n" % enc.decode(preds))
        print("True Summary:\n %s \n\n" % enc.decode(labels))
    elif index == 310:
        print("Pred Answer:\n %s \n" % enc.decode(preds))
        print("True Answer:\n %s \n\n" % enc.decode(labels))
    elif index == 902:
      print("Pred Entities:\n %s \n" % enc.decode(preds))
      print("True Entities:\n %s \n\n" % enc.decode(labels))
    else:
        print("Pred Sentiment:\n %s \n" % enc.decode(preds))
        print("True Sentiment:\n %s \n\n" % enc.decode(labels))

In [103]:
! pip install torchmetrics



In [104]:
# from torchmetrics import F1Score
from sklearn.metrics import f1_score

def evaluate(model, global_step=None, lr=None, tr_loss=None):
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
    eval_output_dir = OUTPUT_DIR

    results = {}

    eval_loss = 0.0
    eval_bleu_scores = 0.0
    eval_rouge_scores = 0.0
    eval_f1_score = 0.0
    eval_accuracy_score = 0.0
    nb_eval_steps = 0
    model.eval()

    for (data, article_len, dataTag) in val_dataloader:
        inputs, labels = torch.tensor(data).to(device), torch.tensor(data).to(device)
        with torch.no_grad():
            logits = model(inputs, return_all_logits=True)[0]
            shift_logits = []
            shift_labels = []

            avg_eval_bleu = 0.0
            avg_rouge_score = 0.0
            avg_f1_score = 0.0
            avg_accuracy_score = 0.0

            m = Rouge(variants=["L",1,2], multiref="best")
            sentiment_correct = 0
            total_sentiment = 0

            for batch_idx in range(logits.shape[0]):
                idx = article_len[batch_idx].item() # index of separator token

                shift_logits.append(logits[batch_idx, idx:-1, :])
                shift_labels.append(labels[batch_idx, idx+1:])

                greedy_labels = labels[batch_idx, idx+1:].tolist()
                index = greedy_labels.index(enc.eot_token)
                greedy_labels  = greedy_labels[:index]
                references = [[enc.decode(greedy_labels).split()]]

                greedy_preds = logits[batch_idx, idx:-1, :].argmax(dim=-1).tolist()
                greedy_preds = greedy_preds[:index]
                try:
                  hypotheses = [enc.decode(greedy_preds).split()]
                except Exception as e:
                   print(f"Error decoding tokens: {e}")

                if dataTag[batch_idx].item() == 0:
                    bleu4 = bleu_score(hypotheses, references, max_n=2, weights=[0.5, 0.5])
                    avg_eval_bleu += bleu4

                elif dataTag[batch_idx].item() == 1:
                    m.update((hypotheses, references))
                    rouge = m.compute()
                    avg_rouge_score += max(rouge.values())

                elif dataTag[batch_idx].item() == 2:  # NER
                    # predictions = logits[batch_idx, :idx, :].argmax(-1)
                    # true_labels = labels[batch_idx, :idx]
                    # print("greedy labels")
                    # print(greedy_labels)
                    # print("greedy preds")
                    # print(greedy_preds)
                    F1_score = f1_score(greedy_labels, greedy_preds, average='micro')
                    avg_f1_score += F1_score

                elif dataTag[batch_idx].item() == 3:  # Sentiment Analysis
                    pred_label = logits[batch_idx].argmax(-1)
                    true_label = labels[batch_idx]
                    # print("pred label")
                    # print(pred_label)
                    # print("true label")
                    # print(true_label)
                    sentiment_correct += (pred_label == true_label).sum().item()
                    total_sentiment += len(labels[batch_idx])
                else:
                    print(" Here at Sentiment Eval\n")


            shift_logits = torch.cat(shift_logits, dim=0)
            shift_labels = torch.cat(shift_labels, dim=0)

            lm_loss = loss_fct(shift_logits, shift_labels)
            eval_loss += lm_loss.mean().item()

            eval_bleu_scores += avg_eval_bleu/logits.shape[0]
            eval_rouge_scores += avg_rouge_score/logits.shape[0]
            eval_f1_score += avg_f1_score/logits.shape[0]

        del inputs, labels
        torch.cuda.empty_cache()
        gc.collect()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_bleu_scores = 2* eval_bleu_scores / nb_eval_steps #bigram model
    eval_rouge_scores = 2* eval_rouge_scores / nb_eval_steps # bigram model
    eval_f1_score = eval_f1_score / nb_eval_steps
    avg_accuracy_score = sentiment_correct / total_sentiment
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity,
        'eval_bleu_scores': eval_bleu_scores,
        'eval_rouge_scores': eval_rouge_scores,
        'eval_accuracy_scores': avg_accuracy_score,
        'eval_F1_scores': eval_f1_score
    }

    print("perplexity:", perplexity.item())
    print('eval_bleu_scores: ', eval_bleu_scores)
    print('eval_rouge_scores: ', eval_rouge_scores)
    print('eval_accuracy_scores: ', avg_accuracy_score)
    print('eval_F1_scores: ', eval_f1_score)


    global best_bleu_score
    global best_rouge_score
    if global_step:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as f:
            for key in sorted(result.keys()):
                f.write('\n\n')
                f.write("time = %s, %s = %s, step = %s\n" % (datetime.now().strftime("%d/%m/%Y %H:%M:%S"), key, str(result[key]), str(global_step)))

        wandb.log({
                "iter": global_step,
                "train/loss": tr_loss,
                "val/loss": eval_loss,
                'eval_bleu_scores': eval_bleu_scores,
                'eval_rouge_scores': eval_rouge_scores,
                'eval_accuracy_scores': avg_accuracy_score,
                'eval_F1_scores': eval_f1_score,
                "lr": lr,
            })

        if eval_bleu_scores >= best_bleu_score:
            best_bleu_score = eval_bleu_scores
            checkpoint = {
                'iter_num': iter_num,
                'model': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }
            print(f"saving checkpoint to {eval_output_dir}")
            torch.save(checkpoint, os.path.join(eval_output_dir, 'bleu_ckpt.pt'))

        if eval_rouge_scores >= best_rouge_score:
            best_rouge_score = eval_rouge_scores
            checkpoint = {
                'iter_num': iter_num,
                'model': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }
            print(f"saving checkpoint to {eval_output_dir}")
            torch.save(checkpoint, os.path.join(eval_output_dir, 'rouge_ckpt.pt'))

    return result

In [105]:
def train(model):
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    set_seed(1337)
    for _ in np.arange(EPOCHS):
        for step, (data, article_len, dataTag) in enumerate(train_dataloader):
            inputs, labels = torch.tensor(data), torch.tensor(data)
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.train()
            logits = model(inputs, return_all_logits=True)[0]

            # only consider loss on reference summary just like seq2seq models
            shift_logits = []
            shift_labels = []
            for batch_idx in range(logits.shape[0]):
                idx = article_len[batch_idx].item() # index of separator token
                shift_logits.append(logits[batch_idx, idx:-1, :])
                shift_labels.append(labels[batch_idx, idx+1:])
            shift_logits = torch.cat(shift_logits, dim=0)
            shift_labels = torch.cat(shift_labels, dim=0)

            loss = loss_fct(shift_logits, shift_labels)
            loss = loss/fine_tune_config['gradient_accumulation_steps']
            scaler.scale(loss).backward()

            tr_loss += loss.item()
            if (step + 1) % fine_tune_config['gradient_accumulation_steps'] == 0:
                lr = get_lr(step)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), fine_tune_config['grad_clip'])

                scaler.step(optimizer)
                scaler.update()

                model.zero_grad()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1
                logging_loss = tr_loss
                print("loss:", loss.item(), end='\n\n')

                if (step + 1)/fine_tune_config['gradient_accumulation_steps'] == 1.0:
                    print('After 1st update: ', end='\n\n')
                    generate_sample(0) #Summary
                    generate_sample(299) #Sentiment
                    generate_sample(902) #NER
                    generate_sample(310) #Q&A


            if (step + 1) % (20*fine_tune_config['gradient_accumulation_steps'] )== 0:
                results = evaluate(model, global_step, lr, loss.item())
                print('After', global_step+1,'updates: ', end='\n\n')
                generate_sample(0) #Summary
                generate_sample(299) #Sentiment
                generate_sample(902) #NER
                generate_sample(310) #Q&A

            del inputs, labels
            torch.cuda.empty_cache()
            gc.collect()

In [106]:

train(model)

loss: 1.1077860593795776

After 1st update: 

The index coming here is
0
Pred Summary:
 <|endoftext|><|endoftext|>roussard<|endoftext|> to give<|endoftext|> kidney to a stranger<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>ants<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>

PanicException: no entry found for key