In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
from transformer import Transformer

## Finetuning on unsupervised IMDB text

In [6]:
from datasets import load_dataset

In [7]:
# Load the Kwaai/IMDB_Sentiment dataset through the `datasets` library.
dataset = load_dataset("Kwaai/IMDB_Sentiment")

In [8]:
# Inspect the available splits, their columns, and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [10]:
# Peek at the first example of the unsupervised split (its 'label' is -1 in the output shown).
dataset['unsupervised'][0]

{'text': 'This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie "Leon" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please "Frankie Starlight", she\'s speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the "Point of no return" and especially the "La femme Nikita" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which "translate" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you\'ll regret later :)',
 'label': -1}

In [11]:
# Gather the raw review text of every example in the unsupervised split.
l = [example['text'] for example in dataset['unsupervised']]

In [12]:
# Concatenate all reviews into a single string, separated by one space,
# so the corpus can be tokenized in one pass further down.
text = " ".join(l)

In [2]:
# data loading
def get_batch(split):
    """Draw a random batch of next-token-prediction examples.

    Args:
        split: 'train' reads from `train_data`; any other value reads from `val_data`.

    Returns:
        (x, y) on `device`, each stacked from `batch_size` windows of
        `block_size` tokens; y is x shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets, chosen so the shifted target window stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [3]:
class LanguageModel(nn.Module):
    """Language model: token + learned position embeddings, a `Transformer`
    stack, and a linear head that produces vocabulary logits.

    All sizes (`vocab_size`, `d_model`, `block_size`, `num_layers`, `nhead`,
    `dim_feedforward`, `dropout`) are read from the notebook-level
    hyperparameters when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(block_size, d_model)
        self.transformer = Transformer(num_layers, d_model, nhead, dim_feedforward, dropout=dropout)
        self.final = nn.Linear(d_model, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear and Embedding weights from zero-mean normals; zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=(2/d_model)**0.5)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=(2/vocab_size)**0.5)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: (B, T) tensor of token ids, with T <= block_size.

        Returns:
            (B, T, vocab_size) tensor of logits.
        """
        B, T = x.shape
        tok_emb = self.embedding_layer(x) # (B,T,C)
        # Build positions on the input's own device rather than the global `device`.
        pos_emb = self.position_embedding(torch.arange(T, device=x.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        # Crop the (block_size, block_size) lower-triangular mask to the current
        # length so inputs shorter than block_size (e.g. during generation) work.
        # Identical to the full mask when T == block_size.
        x = self.transformer(x, mask=mask[:T, :T].to(x.device)) # (B,T,C)
        logits = self.final(x) # (B,T,vocab_size)
        return logits

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample tokens and append them to `idx`.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor containing the context followed by
            the sampled tokens.
        """
        was_training = self.training
        self.eval()  # disable dropout while sampling
        try:
            for _ in range(max_new_tokens):
                # Position embeddings only cover block_size positions, so keep
                # at most the last block_size tokens as context.
                context = idx[:, -block_size:]
                next_logits = self(context)[:, -1, :]  # logits at the last position
                probs = F.softmax(next_logits.float(), dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)  # (B,1)
                idx = torch.cat((idx, next_token), dim=1)
        finally:
            self.train(was_training)  # restore the caller's train/eval mode
        return idx

In [4]:
from transformers import GPT2Tokenizer
# Pretrained GPT-2 tokenizer; its `vocab_size` sets the model's vocabulary size below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [15]:
# Hyperparameters for language-model fine-tuning on the IMDB text.
batch_size = 16  # sequences per micro-batch
accumulation_steps = 8  # micro-batches whose gradients are accumulated before each optimizer step
block_size = 512  # context length in tokens; also the size of the position-embedding table
num_iters = 30000  # training iterations (one micro-batch each)
print_interval = 100  # report losses every this many iterations
val_iters = 8  # validation batches averaged per report
lr = 5e-4  # AdamW learning rate
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'  # first CUDA GPU when available, otherwise CPU
d_model = 256  # embedding / model width
nhead = 8  # passed to Transformer — presumably the number of attention heads
num_layers = 8  # passed to Transformer — presumably the number of stacked layers
dropout = 0.01
dim_feedforward = 2048  # passed to Transformer — presumably the feed-forward hidden width
mask = torch.tril(torch.ones(block_size,block_size)).to(device=device)  # lower-triangular (block_size, block_size) mask of ones
vocab_size = tokenizer.vocab_size

In [18]:
from torch.amp import autocast, GradScaler

# Gradient scaler shared by the mixed-precision training loops in this notebook.
scaler = GradScaler()  # Helps prevent underflow issues

In [13]:
# Train and test splits
data = tokenizer(text, return_tensors="pt")["input_ids"][0]
n = int(0.96*len(data))
train_data = data[:n]
val_data = data[n:]

Token indices sequence length is longer than the specified maximum sequence length for this model (15022801 > 1024). Running this sequence through the model will result in indexing errors


In [14]:
# Total number of tokens in the tokenized corpus.
len(data)

15022801

In [26]:
# Override the iteration count from the hyperparameter cell for this fine-tuning run.
num_iters = 10000

In [19]:
# Build the language model and resume model + optimizer state from
# `wiki103_checkpoint.pth`.
model = LanguageModel()
# map_location lets a checkpoint saved on a GPU load on whatever `device` is
# (including the CPU fallback); weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state-dict checkpoint needs.
checkpoint = torch.load("wiki103_checkpoint.pth", map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state"])
model = model.to(device)
# The optimizer is created after the model is on `device` so that its restored
# state lives alongside the parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
optimizer.load_state_dict(checkpoint["optimizer_state"])
# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

36.427857 M parameters


In [29]:
# Fine-tune the language model with mixed precision and gradient accumulation.
# Every `print_interval` iterations (and on the last one) report the mean train
# loss since the previous report and the mean loss over `val_iters` validation
# batches.
train_loss = 0
val_loss = 0
steps_since_report = 0  # training iterations folded into train_loss so far
for n in range(num_iters):
    x, y = get_batch('train')
    with autocast(device_type="cuda"):
        logits = model(x)
        B, T, C = logits.shape
        # Note: the loss is not divided by accumulation_steps, so gradients of
        # the accumulated micro-batches are summed rather than averaged.
        loss = F.cross_entropy(logits.view(B*T, C), y.view(B*T))
    scaler.scale(loss).backward()
    if (n + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
    # detach() keeps the running sum free of the autograd graph.
    train_loss += loss.detach()
    steps_since_report += 1
    if (n % print_interval == 0 or n == num_iters - 1):
        model.eval()
        with torch.no_grad():
            for _ in range(val_iters):
                x, y = get_batch('val')
                logits = model(x)
                B, T, C = logits.shape
                val_loss += F.cross_entropy(logits.view(B*T, C), y.view(B*T))
        model.train()
        # Divide by the number of iterations actually accumulated: 1 at step 0,
        # print_interval at regular reports, fewer at the final report.
        print(f"step {n}: train loss {train_loss/steps_since_report:.4f}, val loss {val_loss/val_iters:.4f}")
        train_loss = 0
        val_loss = 0
        steps_since_report = 0

step 0: train loss 7.8739, val loss 7.9724
step 100: train loss 6.7880, val loss 5.8183
step 200: train loss 5.5548, val loss 5.3416
step 300: train loss 5.2586, val loss 5.2070
step 400: train loss 5.1132, val loss 5.1065
step 500: train loss 5.0190, val loss 4.9744
step 600: train loss 4.9713, val loss 4.9203
step 700: train loss 4.9073, val loss 4.8497
step 800: train loss 4.8601, val loss 4.8543
step 900: train loss 4.8508, val loss 4.8648
step 1000: train loss 4.8065, val loss 4.8360
step 1100: train loss 4.7738, val loss 4.7851
step 1200: train loss 4.7746, val loss 4.7549
step 1300: train loss 4.7224, val loss 4.8008
step 1400: train loss 4.7378, val loss 4.7828
step 1500: train loss 4.7170, val loss 4.7392
step 1600: train loss 4.6827, val loss 4.7427
step 1700: train loss 4.6818, val loss 4.7445
step 1800: train loss 4.6542, val loss 4.6890
step 1900: train loss 4.6450, val loss 4.7248
step 2000: train loss 4.6229, val loss 4.6693
step 2100: train loss 4.6114, val loss 4.7000


In [30]:
# Persist the fine-tuned language model together with its optimizer state.
checkpoint_state = {
    "model_state": model.state_dict(),
    "optimizer_state": optimizer.state_dict(),
}
torch.save(checkpoint_state, "imdb_wiki103_checkpoint_10k.pth")

In [25]:
# Start from a single token with id 0 (batch of one) and print the decoded
# result of generating 500 new tokens.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(tokenizer.decode(model.generate(context, max_new_tokens=500)[0].tolist()))

! Larry Silent OK, however, given is full shots of magic, most decor. Especially through the whole movie, despite one scene like a slasher films; one has constant squers falsehoods(under discomfort below.) In Swiss Borepa cooking buffs illustrates this glorious 1970's acted and native whashes of bearded, images. (Romails, violence, intellectual appeal, whatever time he flopging its characters were every by setting's home in the world) when my teenage wife, the timid Races of her parents grow murderous their family "heart of Mississippi settlers." What George was the case of 'Fanny' is a big station and by closing his eyes of a shifty knife. A ravishing father and neurotic and prayed on ideas should have guessed it...<br /><br />The characters are all cute and funny at times very dangerous. There is also little of each other, naked.... that's it was boring, if this strolling off thirties and used each of which a mean I know all the segments would say to the one where Gina walks around t

## Finetuning on supervised IMDB sentiment (using an FC layer on the final token for binary classification)

In [21]:
from transformers import GPT2TokenizerFast
# Rebind `tokenizer` to the fast GPT-2 tokenizer for the batched, padded encoding used below.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [22]:
# Shuffle the labeled splits. No seed is passed, so the order is expected to
# differ between runs.
# NOTE: this rebinds `train_data`, which earlier held the language-model token
# tensor; the first `get_batch` definition no longer applies after this cell.
train_data = dataset['train'].shuffle()
test_data = dataset['test'].shuffle()

In [23]:
# Use the token with id 50256 as the end-of-sequence token and reuse it as the
# padding token, so that batches can be padded to a fixed length.
tokenizer.eos_token = tokenizer.decode(50256)
tokenizer.pad_token = tokenizer.eos_token

In [24]:
# data loading
def get_batch(split):
    """Return one random mini-batch of tokenized reviews and their labels.

    Args:
        split: 'train' reads from `train_data`; any other value reads from `test_data`.

    Returns:
        (x, y) on `device`: x holds the token ids of `batch_size` consecutive
        rows, padded/truncated to `block_size` tokens each; y is a float
        tensor with one label per row.
    """
    data = train_data if split == 'train' else test_data
    # Pick one of the len(data)//batch_size full, non-overlapping batches.
    ix = torch.randint(len(data)//batch_size, (1,)).item()
    # Slice the dataset once and reuse the result for both text and labels.
    batch = data[ix*batch_size:(ix+1)*batch_size]
    x = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=block_size, return_tensors="pt")["input_ids"]
    y = torch.tensor(batch['label'], dtype=torch.float)
    x, y = x.to(device), y.to(device)
    return x, y

In [26]:
# Hyperparameters for sentiment fine-tuning. Same values as the language-model
# hyperparameter cell except `num_iters`; `vocab_size` now comes from the
# tokenizer bound at this point.
batch_size = 16  # reviews per micro-batch
accumulation_steps = 8  # micro-batches whose gradients are accumulated before each optimizer step
block_size = 512  # reviews are padded/truncated to this many tokens
num_iters = 500  # reassigned before the training loop further down
print_interval = 100  # reassigned before the training loop further down
val_iters = 8  # reassigned before the training loop further down
lr = 5e-4  # reassigned before the optimizer is created further down
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'  # first CUDA GPU when available, otherwise CPU
d_model = 256  # embedding / model width
nhead = 8  # passed to Transformer — presumably the number of attention heads
num_layers = 8  # passed to Transformer — presumably the number of stacked layers
dropout = 0.01
dim_feedforward = 2048  # passed to Transformer — presumably the feed-forward hidden width
mask = torch.tril(torch.ones(block_size,block_size)).to(device=device)  # lower-triangular (block_size, block_size) mask of ones
vocab_size = tokenizer.vocab_size

In [27]:
# Sanity-check one training batch: padded token ids and float labels.
get_batch('train')

(tensor([[ 8241,  5690,   287,  ..., 50256, 50256, 50256],
         [ 1212,   318,   257,  ..., 50256, 50256, 50256],
         [ 1890,   257,  1877,  ..., 50256, 50256, 50256],
         ...,
         [ 1212,   318,   281,  ..., 50256, 50256, 50256],
         [ 2514,   502,   428,  ..., 50256, 50256, 50256],
         [ 2025,  4044,    11,  ..., 50256, 50256, 50256]], device='cuda:0'),
 tensor([1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1.],
        device='cuda:0'))

In [28]:
class LanguageModelSentiment(nn.Module):
    """Binary sentiment classifier: token + learned position embeddings, a
    `Transformer` stack, and a single-logit linear head (`final1`) applied to
    the representation at the final sequence position.

    All sizes are read from the notebook-level hyperparameters when the model
    is constructed.
    """

    def __init__(self):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(block_size, d_model)
        self.transformer = Transformer(num_layers, d_model, nhead, dim_feedforward, dropout=dropout)
        self.final1 = nn.Linear(d_model, 1)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear and Embedding weights from zero-mean normals; zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=(2/d_model)**0.5)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=(2/vocab_size)**0.5)

    def forward(self, x):
        """Compute one sentiment logit per sequence.

        Args:
            x: (B, T) tensor of token ids, with T <= block_size.

        Returns:
            (B, 1) tensor of logits.
        """
        B, T = x.shape
        tok_emb = self.embedding_layer(x) # (B,T,C)
        # Build positions on the input's own device rather than the global `device`.
        pos_emb = self.position_embedding(torch.arange(T, device=x.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        # Crop the (block_size, block_size) lower-triangular mask to the current
        # length; identical to the full mask when T == block_size.
        x = self.transformer(x, mask=mask[:T, :T].to(x.device)) # (B,T,C)
        # Classify from the final position (index T-1). With inputs padded to a
        # fixed length this is usually a padding position.
        # NOTE(review): relies on the mask letting that position attend to the
        # whole review — confirm against the Transformer implementation.
        logits = self.final1(x[:,-1,:]) # Using last token (B,1)
        return logits

In [36]:
# Build the sentiment classifier and initialize its shared layers from the
# IMDB-fine-tuned language-model checkpoint.
model = LanguageModelSentiment()
# map_location lets a GPU-saved checkpoint load on whatever `device` is;
# weights_only=True restricts unpickling to tensors and plain containers.
checkpoint = torch.load("imdb_wiki103_checkpoint_10k.pth", map_location=device, weights_only=True)
# strict=False is needed because the heads differ (`final` in the checkpoint,
# `final1` here). Verify that nothing else was silently skipped.
load_result = model.load_state_dict(checkpoint["model_state"], strict=False)
assert set(load_result.missing_keys) <= {"final1.weight", "final1.bias"}, load_result.missing_keys
assert set(load_result.unexpected_keys) <= {"final.weight", "final.bias"}, load_result.unexpected_keys
model = model.to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

23.512065 M parameters


In [37]:
# Re-create the sentiment classifier from the same checkpoint, discarding any
# previous `model` so the freezing and training cells start from a clean state.
model = LanguageModelSentiment()
# map_location lets a GPU-saved checkpoint load on whatever `device` is;
# weights_only=True restricts unpickling to tensors and plain containers.
checkpoint = torch.load("imdb_wiki103_checkpoint_10k.pth", map_location=device, weights_only=True)
# strict=False is needed because the heads differ (`final` in the checkpoint,
# `final1` here). Verify that nothing else was silently skipped.
load_result = model.load_state_dict(checkpoint["model_state"], strict=False)
assert set(load_result.missing_keys) <= {"final1.weight", "final1.bias"}, load_result.missing_keys
assert set(load_result.unexpected_keys) <= {"final.weight", "final.bias"}, load_result.unexpected_keys
model = model.to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

23.512065 M parameters


In [38]:
# Freeze every parameter, then re-enable gradients for the transformer stack
# and the classification head; the two embedding tables stay frozen.
model.requires_grad_(False)
model.transformer.requires_grad_(True)
model.final1.requires_grad_(True)

In [39]:
# List each parameter with its trainability status to confirm the freezing above.
for name, param in model.named_parameters():
    status = 'Trainable' if param.requires_grad else 'Frozen'
    print(f"{name}: {status}")

embedding_layer.weight: Frozen
position_embedding.weight: Frozen
transformer.layers.0.self_attn.w_q.weight: Trainable
transformer.layers.0.self_attn.w_k.weight: Trainable
transformer.layers.0.self_attn.w_v.weight: Trainable
transformer.layers.0.self_attn.fc_out.weight: Trainable
transformer.layers.0.self_attn.fc_out.bias: Trainable
transformer.layers.0.ffn.0.weight: Trainable
transformer.layers.0.ffn.0.bias: Trainable
transformer.layers.0.ffn.2.weight: Trainable
transformer.layers.0.ffn.2.bias: Trainable
transformer.layers.0.norm1.weight: Trainable
transformer.layers.0.norm1.bias: Trainable
transformer.layers.0.norm2.weight: Trainable
transformer.layers.0.norm2.bias: Trainable
transformer.layers.1.self_attn.w_q.weight: Trainable
transformer.layers.1.self_attn.w_k.weight: Trainable
transformer.layers.1.self_attn.w_v.weight: Trainable
transformer.layers.1.self_attn.fc_out.weight: Trainable
transformer.layers.1.self_attn.fc_out.bias: Trainable
transformer.layers.1.ffn.0.weight: Trainable


In [40]:
# Overrides for the sentiment run: new learning rate, less frequent reports,
# and more validation batches per report.
lr = 1e-4
print_interval = 500
val_iters = 500

In [41]:
# Optimize only the parameters that were left trainable.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=lr)

In [42]:
# Number of training iterations (micro-batches) for the sentiment run.
num_iters = 6000

In [43]:
# Fine-tune the sentiment classifier with mixed precision and gradient
# accumulation. Every `print_interval` iterations (and on the last one) report
# the mean train loss since the previous report plus the mean loss and
# accuracy over `val_iters` held-out batches.
train_loss = 0
val_loss = 0
accuracy = 0
steps_since_report = 0  # training iterations folded into train_loss so far
for n in range(num_iters):
    x, y = get_batch('train')
    with autocast(device_type="cuda"):
        logits = model(x)
        # Labels are reshaped to (B, 1) to match the logits.
        loss = F.binary_cross_entropy_with_logits(logits, y.to(dtype=torch.float).unsqueeze(-1))
    scaler.scale(loss).backward()
    if (n + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
    # detach() keeps the running sum free of the autograd graph.
    train_loss += loss.detach()
    steps_since_report += 1
    if (n % print_interval == 0 or n == num_iters - 1):
        # Evaluate with dropout disabled, then switch back to training mode.
        model.eval()
        with torch.no_grad():
            for _ in range(val_iters):
                # Any split name other than 'train' makes get_batch read the held-out data.
                x, y = get_batch('val')
                logits = model(x)
                val_loss += F.binary_cross_entropy_with_logits(logits, y.to(dtype=torch.float).unsqueeze(-1))
                logits = logits.squeeze(-1)
                # A prediction is the sigmoid probability rounded to 0 or 1.
                accuracy += torch.isclose(torch.sigmoid(logits).round(), y).to(dtype=torch.float).mean()
        model.train()
        # Divide by the number of iterations actually accumulated: 1 at step 0,
        # print_interval at regular reports, fewer at the final report.
        print(f"step {n}: train loss {train_loss/steps_since_report:.4f}, val loss {val_loss/val_iters:.4f}, val accuracy {accuracy/val_iters:.4f}")
        train_loss = 0
        val_loss = 0
        accuracy = 0
        steps_since_report = 0

step 0: train loss 1.4332, val loss 0.8732, val accuracy 0.5115
step 500: train loss 0.7387, val loss 0.6446, val accuracy 0.6819
step 1000: train loss 0.4313, val loss 0.3091, val accuracy 0.8711
step 1500: train loss 0.2916, val loss 0.3271, val accuracy 0.8599
step 2000: train loss 0.2600, val loss 0.3087, val accuracy 0.8688
step 2500: train loss 0.2447, val loss 0.2749, val accuracy 0.8865
step 3000: train loss 0.2229, val loss 0.2878, val accuracy 0.8821
step 3500: train loss 0.2204, val loss 0.2892, val accuracy 0.8821
step 4000: train loss 0.1897, val loss 0.2681, val accuracy 0.8940
step 4500: train loss 0.1725, val loss 0.2817, val accuracy 0.8891
step 5000: train loss 0.1429, val loss 0.2957, val accuracy 0.8886
step 5500: train loss 0.1495, val loss 0.2724, val accuracy 0.8973
step 5999: train loss 0.1432, val loss 0.2952, val accuracy 0.8916


In [44]:
# Persist the fine-tuned sentiment classifier together with its optimizer state.
sentiment_checkpoint_state = {
    "model_state": model.state_dict(),
    "optimizer_state": optimizer.state_dict(),
}
torch.save(sentiment_checkpoint_state, "sentiment_imdb_wiki103_checkpoint.pth")