In [2]:
!pip install numpy pandas datasets transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m90.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [3]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data.dataloader import DataLoader

seed = 1999

from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BatchEncoding,
    DataCollator,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    EvalPrediction,
    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed
)

In [5]:
# check torch is using GPU acceleration
print(torch.cuda.is_available())
print(torch.backends.cuda.is_built())

True
True


In [6]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [7]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Text generation

We can just use `gpt2` model from the Hub as a starting point...

In [101]:
model_name = 'distilgpt2'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorWithPadding(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name)

Using pad_token, but it is not set yet.


In [102]:
tokenizer

GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [104]:
tokenizer.cls_token is None

Using cls_token, but it is not set yet.


True

In [10]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
type(output)

transformers.modeling_outputs.CausalLMOutputWithCrossAttentions

In [11]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(42)
generator("The White man worked as a", max_length=10, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The White man worked as a journalist for Time magazine'},
 {'generated_text': 'The White man worked as a taxi driver for 25'},
 {'generated_text': 'The White man worked as a constructionist for 18'},
 {'generated_text': 'The White man worked as a waiter at a restaurant'},
 {'generated_text': 'The White man worked as a prostitute, according to'}]

In [12]:
generator("The White man worked as a", num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The White man worked as a volunteer for the Black Lives Matter movement in Florida.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nIt looks like the government is using the'},
 {'generated_text': 'The White man worked as a waitress at a pizza service in Memphis before being convicted of a hate crime in May.\n\n\n\n\n'},
 {'generated_text': "The White man worked as a clerk in the US Navy, and was stationed as a judge in China's Heilongjiang province for over 30 years. The military has denied the claims, and it seems clear the man’s name and family"},
 {'generated_text': 'The White man worked as a waitress at the time, her manager said.'},
 {'generated_text': 'The White man worked as a construction worker at an apartment building for a year and has been hired as a manager.\n\n\n\n\n\nHe was fired after the death of another worker in a fatal car crash.'}]

## Fine-tuning on new data

In [109]:
feature_name = "content"
cls_token_avail = tokenizer.cls_token is not None

# set padding to False for now - when we train, we'll use dynamic padding...
tokenizer_args = {"padding": False, "truncation": True}
def tokenize_function(dataset):
    return tokenizer(
        dataset[feature_name],
        **tokenizer_args,
    )

def tokenize_decoder(dataset):
    tokens = []
    for i in range(len(dataset["input_ids"])):
        ind = torch.where(torch.tensor(dataset["attention_mask"][i]) == 1)[0].tolist()
        # if no tokens (i.e. empty string), then just return the first embedding (CLS token)
        if len(ind) == 0 and cls_token_avail:
            ind = [0]
        tokens.append(
            tokenizer.convert_ids_to_tokens(
                torch.tensor(dataset["input_ids"][i])[ind]
                )
            )
    return {"tokens": tokens}

Using cls_token, but it is not set yet.


In [86]:
def split_dataset(dataset, train_size=0.8, valid_size=0.5, shuffle=False, seed=42):
    if valid_size is None:
        print(
            "[INFO] Splitting up dataset into train / test sets"
        )
    else:
        print(
            "[INFO] Splitting up dataset into train / validation / test sets"
        )

    # first split data into train set, test/valid set
    train_testvalid = dataset.train_test_split(
        train_size=train_size, shuffle=shuffle, seed=seed
    )
    if valid_size is not None:
        # further split the test set into a test, valid set
        test_valid = train_testvalid["test"].train_test_split(
            train_size=valid_size, shuffle=shuffle, seed=seed
        )
        # gather everyone if you want to have a single DatasetDict
        dataset_split = DatasetDict(
            {
                "train": train_testvalid["train"],
                "test": test_valid["test"],
                "validation": test_valid["train"],
            }
        )
    else:
        dataset_split = DatasetDict(
            {
                "train": train_testvalid["train"],
                "test": train_testvalid["test"],
                "validation": None,
            }
        )
    return dataset_split

In [87]:
def group_texts(examples):
    # examples should be a DatasetDict object
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [88]:
from google.colab import files
import io

## Seinfeld

In [125]:
dataset_name = "seinfeld"

In [12]:
uploaded = files.upload()

Saving scripts.csv to scripts.csv


In [13]:
seinfeld = pd.read_csv(io.BytesIO(uploaded['scripts.csv']))

In [14]:
# seinfeld = pd.read_csv("the-seinfeld-chronicles/scripts.csv")
seinfeld["Character"] = seinfeld["Character"].astype(str)
seinfeld["Dialogue"] = seinfeld["Dialogue"].astype(str)
seinfeld[feature_name] = seinfeld["Character"] + ": " + seinfeld["Dialogue"]
seinfeld.head(10)

NameError: ignored

In [None]:
dataset = Dataset.from_pandas(seinfeld[[feature_name]])
dataset

## Game of Thrones

In [126]:
dataset_name = "GoT"

In [22]:
uploaded = files.upload()

Saving got1.txt to got1.txt
Saving got2.txt to got2.txt
Saving got3.txt to got3.txt
Saving got4.txt to got4.txt
Saving got5.txt to got5.txt


In [28]:
uploaded.keys()

dict_keys(['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt', 'got5.txt'])

In [89]:
def read_script(filename):
    with open(filename, 'r') as f:
        return(f.read().split('\n\n'))

GoT = pd.DataFrame({feature_name: [read_script(x) for x in uploaded.keys()]}).explode(feature_name).reset_index(drop=True)
GoT.head(10)

Unnamed: 0,content
0,PROLOGUE
1,"The comet’s tail spread across the dawn, a red..."
2,The maester stood on the windswept balcony out...
3,The maester did not believe in omens. And yet ...
4,"Such folly. He leaned against the battlement, ..."
5,And yet . . . and yet . . . the comet burned e...
6,"“Maester Cressen, we have visitors.” Pylos spo..."
7,"The old man turned away from the dawn, keeping..."
8,"Taking his arm, Pylos led him inside. In his y..."
9,He let the younger man settle him behind his b...


In [90]:
dataset = Dataset.from_pandas(GoT[[feature_name]])
dataset

Dataset({
    features: ['content'],
    num_rows: 45422
})

## Data preparation for fine-tuning

In [91]:
dataset = dataset.map(tokenize_function,
                      batched=True,
                      batch_size=1000,
                      num_proc=4)
dataset

Map (num_proc=4):   0%|          | 0/45422 [00:00<?, ? examples/s]

Dataset({
    features: ['content', 'input_ids', 'attention_mask'],
    num_rows: 45422
})

In [110]:
dataset = dataset.map(
    tokenize_decoder,
    batched=True,
    batch_size=1000,
    num_proc=1
)
dataset

Map:   0%|          | 0/45422 [00:00<?, ? examples/s]

Dataset({
    features: ['content', 'input_ids', 'attention_mask', 'tokens'],
    num_rows: 45422
})

In [111]:
dataset[feature_name][0]

'PROLOGUE'

In [112]:
dataset["tokens"][0][0:20]

['PR', 'OLOG', 'UE']

In [113]:
dataset = dataset.remove_columns([feature_name, "tokens"])
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 45422
})

In [114]:
dataset = split_dataset(dataset, 0.7)
dataset

[INFO] Splitting up dataset into train / validation / test sets


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 31795
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6814
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6813
    })
})

In [116]:
# block_size = tokenizer.model_max_length
block_size = 128
lm_dataset = dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/31795 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6814 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6813 [00:00<?, ? examples/s]

In [117]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3097
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2894
    })
})

In [118]:
[tokenizer.decode(x) for x in dataset["train"][0:5]["input_ids"]]

['PROLOGUE',
 'The comet’s tail spread across the dawn, a red slash that bled above the crags of Dragonstone like a wound in the pink and purple sky.',
 'The maester stood on the windswept balcony outside his chambers. It was here the ravens came, after long flight. Their droppings speckled the gargoyles that rose twelve feet tall on either side of him, a hellhound and a wyvern, two of the thousand that brooded over the walls of the ancient fortress. When first he came to Dragonstone, the army of stone grotesques had made him uneasy, but as the years passed he had grown used to them. Now he thought of them as old friends. The three of them watched the sky together with foreboding.',
 'The maester did not believe in omens. And yet... old as he was, Cressen had never seen a comet half so bright, nor yet that color, that terrible color, the color of blood and flame and sunsets. He wondered if his gargoyles had ever seen its like. They had been here so much longer than he had, and would st

In [119]:
[tokenizer.decode(lm_dataset["train"][i]["input_ids"]) for i in range(5)]

['PROLOGUEThe comet’s tail spread across the dawn, a red slash that bled above the crags of Dragonstone like a wound in the pink and purple sky.The maester stood on the windswept balcony outside his chambers. It was here the ravens came, after long flight. Their droppings speckled the gargoyles that rose twelve feet tall on either side of him, a hellhound and a wyvern, two of the thousand that brooded over the walls of the ancient fortress. When first he came to Dragonstone, the army of stone grotesques had made him uneasy,',
 ' but as the years passed he had grown used to them. Now he thought of them as old friends. The three of them watched the sky together with foreboding.The maester did not believe in omens. And yet... old as he was, Cressen had never seen a comet half so bright, nor yet that color, that terrible color, the color of blood and flame and sunsets. He wondered if his gargoyles had ever seen its like. They had been here so much longer than he had, and would still be her

In [129]:
fine_tuned_model_name = f"ds-summer-school-{dataset_name}"

In [121]:
training_args = TrainingArguments(output_dir=fine_tuned_model_name,
                                  evaluation_strategy="epoch",
                                  num_train_epochs=10,
                                  per_device_train_batch_size=128,
                                  disable_tqdm=False,
                                  save_strategy="steps",
                                  save_steps=10000,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  push_to_hub=True,
                                  seed=seed)

In [127]:
data_collator_for_LM = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                       mlm=False)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=lm_dataset["train"],
                  eval_dataset=lm_dataset["validation"],
                  data_collator=data_collator_for_LM,
                  tokenizer=tokenizer)

/content/ds-summer-school-distilgpt2 is already a clone of https://huggingface.co/rchan26/ds-summer-school-distilgpt2. Make sure you pull the latest changes with `repo.git_pull()`.


In [128]:
trainer.train()



OutOfMemoryError: ignored

In [62]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 47.92


In [38]:
# trainer.save_model(fine_tuned_model_name)
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/318M [00:00<?, ?B/s]

Upload file runs/Apr24_16-40-54_0d1eec507afb/1682354470.2915497/events.out.tfevents.1682354470.0d1eec507afb.41…

Upload file runs/Apr24_16-40-54_0d1eec507afb/events.out.tfevents.1682355013.0d1eec507afb.419.2:   0%|         …

Upload file training_args.bin:   0%|          | 1.00/3.50k [00:00<?, ?B/s]

Upload file runs/Apr24_16-40-54_0d1eec507afb/events.out.tfevents.1682354470.0d1eec507afb.419.0:   0%|         …

To https://huggingface.co/rchan26/ds-summer-school-seinfeld
   0f11fc5..1358ba1  main -> main

   0f11fc5..1358ba1  main -> main

To https://huggingface.co/rchan26/ds-summer-school-seinfeld
   1358ba1..4d2ec82  main -> main

   1358ba1..4d2ec82  main -> main



'https://huggingface.co/rchan26/ds-summer-school-seinfeld/commit/1358ba19bc453923978fe99a0001fee7b848fe32'

## Question answering

In [None]:
model_name = 'deepset/bert-base-cased-squad2'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]