<a href="https://colab.research.google.com/github/bilnazir/LLM_Assignment3/blob/main/LLM_Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [2]:
# Configure the global git identity; a later cell clones and pushes a Hub repo via git.
!git config --global user.email "rajabilalnazir@example.com"
!git config --global user.name "bilnazir"

In [4]:
from huggingface_hub import notebook_login

# Show the Hub login widget; later cells push models to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [63]:
from transformers import AutoModelForMaskedLM

# Checkpoint name reused below when loading the matching tokenizer.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint with a masked-language-modeling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [58]:
# Express the parameter count in millions and print it beside the hard-coded BERT figure.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print("'>>> DistilBERT number of parameters: {}M'".format(round(distilbert_num_parameters)))
print("'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [64]:
# Prompt with one [MASK] slot; the next cells print the model's top candidates for it.
text = "This is a great [MASK]."

In [65]:
from transformers import AutoTokenizer

# `model_max_length` is the tokenizer-level length limit. A bare `max_length`
# keyword here is not a tokenizer init option and does not set that limit.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)


In [66]:
import torch

# Encode just the prompt. Padding one short sentence out to 512 tokens makes the
# forward pass process hundreds of pad positions for the same prediction.
inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
# Inference only: no gradients are needed for reading off the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Substitute each candidate back into the prompt for display.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [11]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, default_data_collator
from datasets import load_dataset

In [19]:
# Load SST-2 and drop the 'idx' column, leaving 'sentence' and 'label'.
dataset = load_dataset("stanfordnlp/sst2")
column_to_drop = 'idx'  # name of the column removed from every split
dataset = dataset.remove_columns(column_to_drop)

In [20]:
# Show the training-split schema, then the split summary as the cell output.
print(dataset['train'].features)
dataset

{'sentence': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None)}


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1821
    })
})

In [21]:
# Peek at three training rows picked with a fixed shuffle seed.
sample = dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['sentence']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: klein , charming in comedies like american pie and dead-on in election , '
'>>> Label: 1'

'>>> Review: be fruitful '
'>>> Label: 1'

'>>> Review: soulful and '
'>>> Label: 1'


In [22]:
def tokenize_function(examples):
    """Tokenize a batch of sentences, attaching word ids when available.

    Args:
        examples: Batch mapping that holds the raw texts under ``"sentence"``.

    Returns:
        The tokenizer output for the batch. When the tokenizer is a fast one,
        a ``"word_ids"`` entry is added containing ``word_ids(i)`` for every
        encoded sequence ``i``.
    """
    encoded = tokenizer(examples["sentence"])
    if not tokenizer.is_fast:
        return encoded
    n_sequences = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(seq) for seq in range(n_sequences)]
    return encoded


# Use batched=True to activate fast multithreading!
# The raw 'sentence' and 'label' columns are dropped, so only the columns
# returned by tokenize_function remain.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["sentence", "label"]
)
tokenized_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1821
    })
})

In [23]:
# Length (in tokens) of each chunk produced by group_texts below.
chunk_size = 128

In [24]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

# Print the token count of each of the three sliced reviews.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 10'
'>>> Review 1 length: 11'
'>>> Review 2 length: 15'


In [25]:
# Flatten each feature's per-review lists into one long list.
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 36'


In [26]:
# Cut every flattened feature into consecutive slices of at most chunk_size items.
# The final slice is kept here even when it is shorter than chunk_size.
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 36'


In [27]:
def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Every column of ``examples`` is flattened into one long sequence and cut
    into consecutive pieces of ``chunk_size`` items (``chunk_size`` is read
    from module scope). A trailing remainder shorter than ``chunk_size`` is
    dropped.

    Args:
        examples: Batch mapping each column name to a list of per-example lists.

    Returns:
        dict with the same columns, each a list of ``chunk_size``-long lists,
        plus a ``"labels"`` column that is a shallow copy of the list of
        ``"input_ids"`` chunks.
    """
    from itertools import chain

    # Concatenate all texts. chain.from_iterable flattens in linear time, whereas
    # sum(lists, []) re-copies the growing list on every addition (quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [28]:
# Apply group_texts batch-wise to every split; the result adds a 'labels' column.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 6971
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 171
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 353
    })
})

In [29]:
# Decode the second training chunk to see how several reviews were packed together.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

"[SEP] [CLS] are more deeply thought through than in most ` right - thinking'films [SEP] [CLS] goes to absurd lengths [SEP] [CLS] for those moviegoers who complain that ` they don't make movies like they used to anymore [SEP] [CLS] the part where nothing's happening, [SEP] [CLS] saw how bad this movie was [SEP] [CLS] lend some dignity to a dumb story [SEP] [CLS] the greatest musicians [SEP] [CLS] cold movie [SEP] [CLS] with his usual intelligence and subtlety [SEP] [CLS] redundant concept [SEP] [CLS] swimming is above all about a young woman's face, and by casting an actress whose face projects that woman's doubts and yearnings, it succeeds."

In [30]:
from transformers import DataCollatorForLanguageModeling

# Collator used for the masked-LM batches below, configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [31]:
# Take the first two training chunks and strip 'word_ids' before collating
# (presumably because this collator cannot batch that column — confirm).
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

# Decode the collated input ids to show where [MASK] tokens were inserted.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] hide [MASK] [MASK]ions from the [MASK] units [SEP] [CLS] contains [MASK] wit [MASK] [MASK] labored gags [SEP] [CLS] that loves its characters and communicate [MASK] something rather beautiful about human nature [SEP] [CLS] remains utterly satisfied to remain the same throughout [SEP] [CLS] on the worst revenge - of - the - [MASK]rds cliches the filmmakers could dredge up [SEP] [CLS] that's far [MASK] [MASK] to merit such superficial treatment [SEP] [CLS] demonstrates that the director of such hollywood blockbusters [MASK] patriot snout [MASK] still turn out videos small, personal film [MASK] an emotional [MASK]op. [SEP] [CLS] of sa [MASK]y [SEP] [CLS] a depressed fifteen [MASK] year - old's suicidal poetry'

'>>> [SEP] [CLS] are more deeply thought through than in most ` right - thinking'films [SEP] [CLS] goes [MASK] absurd lengths [SEP] [CLS] for those moviegoers who complain that ` they don't make movies like they used to anymore [SEP] [CLS] the part where nothing'[MASK] 

In [32]:
import collections
import numpy as np

from transformers import default_data_collator

# Probability with which each word is selected for masking.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Mask whole words (all tokens of a word together) and collate the batch.

    For every feature, tokens are grouped into words using ``word_ids``; each
    word is independently selected with probability ``wwm_probability`` and
    all of its tokens are replaced by ``tokenizer.mask_token_id``. Labels keep
    the original token id at masked positions and are ``-100`` elsewhere.

    The input feature dicts are not modified: ``word_ids`` is removed from, and
    masking is applied to, per-feature copies. ``input_ids`` is copied with
    ``list(...)``, so it is assumed to be a plain sequence of ints.

    Args:
        features: List of dicts with ``"input_ids"``, ``"labels"`` and
            ``"word_ids"`` entries (other entries are passed through).

    Returns:
        Whatever ``default_data_collator`` returns for the masked features.
    """
    masked_features = []
    for feature in features:
        # Shallow-copy so popping "word_ids" does not alter the caller's dict
        # (the original in-place pop made a second call on the same features fail).
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # Special tokens separate sentences inside a concatenated chunk.
                # Reset the tracker so a one-word sentence (word id 0) followed
                # by a new sentence (also starting at word id 0) is not merged
                # into a single "word".
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy before writing mask tokens so the caller's list stays intact.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [33]:
# Run the whole-word-masking collator on the first two training chunks.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode the masked input ids for inspection.
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] hide new secretions from the parental units [SEP] [CLS] contains [MASK] wit, only labored [MASK] [MASK] [SEP] [CLS] that loves its characters and [MASK] [MASK] something rather beautiful about human nature [SEP] [CLS] remains [MASK] satisfied to remain the same [MASK] [SEP] [CLS] on the worst revenge [MASK] [MASK] - the [MASK] nerds cliches the filmmakers could dredge up [SEP] [CLS] that's [MASK] too tragic to merit such superficial treatment [SEP] [CLS] demonstrates [MASK] the director of such hollywood blockbusters [MASK] [MASK] [MASK] can still [MASK] out a small [MASK] [MASK] film with an emotional wallop. [SEP] [CLS] of saucy [SEP] [CLS] a depressed [MASK] - [MASK] [MASK] old's suicidal poetry'

'>>> [SEP] [CLS] [MASK] more [MASK] thought through than [MASK] most ` right - thinking'films [SEP] [CLS] goes to absurd lengths [SEP] [CLS] for those [MASK] [MASK] [MASK] [MASK] complain [MASK] ` they do [MASK]'t make movies like they used to anymore [SEP] [CLS] the part [MASK

In [34]:
# Work on a subset: 5,000 training chunks and a test set one tenth that size.
train_size = 5_000
test_size = int(0.1 * train_size)

# Both subsets are drawn from the training split with a fixed seed.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 500
    })
})

In [35]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Keep only the part of the checkpoint name after the last "/".
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): newer transformers releases appear to name `evaluation_strategy`
# as `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-sst",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=False,
    logging_steps=logging_steps,
)

In [36]:
from transformers import Trainer

# Wire the model, the downsampled train/test subsets and the masking collator
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [37]:
import math

# Evaluate on the Trainer's eval split and report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 85.48


In [38]:
# NOTE(review): no `trainer.train()` call is visible before this push, so the
# uploaded weights look like the unmodified pretrained checkpoint — confirm.
trainer.push_to_hub()

events.out.tfevents.1714707296.04de0b0c45da.377.0:   0%|          | 0.00/297 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rajabilalnazir/distilbert-base-uncased-finetuned-sst/commit/71e39f882f2e753ef75ada5dc05e6624d611f172', commit_message='End of training', commit_description='', oid='71e39f882f2e753ef75ada5dc05e6624d611f172', pr_url=None, pr_revision=None, pr_num=None)

In [39]:
def insert_random_mask(batch):
    """Run the data collator over a columnar batch and return the masked columns.

    Args:
        batch: Mapping of column name to a list of per-example values.

    Returns:
        dict holding, for each key ``k`` of the collator's output, a
        ``"masked_" + k`` entry with that value converted via ``.numpy()``.
    """
    column_names = list(batch)
    # Turn the column-oriented batch into one dict per example.
    rows = [dict(zip(column_names, row)) for row in zip(*batch.values())]
    collated = data_collator(rows)
    # Create a new "masked" column for each column in the dataset
    masked_columns = {}
    for key, tensor in collated.items():
        masked_columns["masked_" + key] = tensor.numpy()
    return masked_columns

In [40]:
# NOTE(review): this reassigns downsampled_dataset without its 'word_ids' column,
# so re-running the cell on the same kernel state asks to remove a column that
# is already gone — likely to fail; confirm.
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# Mask the test split once via insert_random_mask, replacing the original
# columns with the "masked_*" ones it returns.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the standard names.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [41]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches go through the masking collator at load time.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# eval_dataset was already masked by insert_random_mask, so it is collated as-is.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [42]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [43]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the returned
# objects replace the originals from here on.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [44]:
from transformers import get_scheduler

num_train_epochs = 3
# One optimizer update per training batch (measured on the prepared dataloader).
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup, spanning every training step.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [47]:
from huggingface_hub import get_full_repo_name

# Note: this overwrites the `model_name` set in the TrainingArguments cell.
model_name = "distilbert-base-uncased-finetuned-sst-accelerate"
# Resolve the full Hub repo id for that model name.
repo_name = get_full_repo_name(model_name)
repo_name

'rajabilalnazir/distilbert-base-uncased-finetuned-sst-accelerate'

In [48]:
from huggingface_hub import Repository

# Clone the Hub repo into a local directory named after the model; the training
# loop saves checkpoints there and pushes them.
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/rajabilalnazir/distilbert-base-uncased-finetuned-sst-accelerate into local empty directory.


In [49]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss batch_size times before gathering, so the
        # collected tensor has one entry per example slot.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Trim to the dataset size (presumably to drop the extra entries produced
    # for a final, shorter batch — confirm).
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # exp() overflowed: report an infinite perplexity instead of failing.
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process writes the tokenizer and pushes; the push is
    # non-blocking so training continues while it uploads.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/237 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 19.758912780394912
>>> Epoch 1: Perplexity: 17.516072481506498
>>> Epoch 2: Perplexity: 16.86200307342919


In [50]:
from transformers import pipeline

# Build a fill-mask pipeline from the model repo pushed by the training loop.
mask_filler = pipeline(
    "fill-mask", model="rajabilalnazir/distilbert-base-uncased-finetuned-sst-accelerate"
)



config.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [55]:
# Query the fine-tuned model with a sample prompt and print each completed sentence.
test = "Paris is the [MASK] of France."
preds = mask_filler(test)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> paris is the capital of france.
>>> paris is the birthplace of france.
>>> paris is the center of france.
>>> paris is the heart of france.
>>> paris is the centre of france.
