In [156]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data.dataloader import DataLoader

seed = 1999

from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BatchEncoding,
    DataCollator,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    EvalPrediction,
    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed
)

In [3]:
# check torch is using GPU acceleration
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())

True
True


In [4]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


## Text generation

We can just use `gpt2` model from the Hub as a starting point...

In [160]:
model_name = 'gpt2'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorWithPadding(tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name)

Using pad_token, but it is not set yet.


In [161]:
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [162]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [163]:
output

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -28.7086,  -27.9282,  -30.2394,  ...,  -37.6027,  -35.8096,
           -28.8439],
         [ -60.0999,  -57.8383,  -62.0976,  ...,  -68.0317,  -67.7715,
           -60.5209],
         [ -76.6621,  -78.3350,  -83.4316,  ...,  -89.8896,  -89.1757,
           -81.1512],
         ...,
         [-145.1963, -145.4154, -150.1035,  ..., -153.6665, -150.1051,
          -146.9226],
         [ -84.2474,  -85.6009,  -91.2381,  ...,  -99.5435,  -97.8360,
           -88.2612],
         [-122.2601, -121.1205, -121.5652,  ..., -131.8223, -131.5999,
          -115.1341]]], grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.0719,  2.4170,  0.9660,  ..., -0.4787, -0.3316,  1.7925],
          [-2.2897,  2.5424,  0.8317,  ..., -0.5299, -2.4828,  1.3537],
          [-2.2856,  2.7125,  2.4725,  ..., -1.4911, -1.8427,  1.6493],
          ...,
          [-3.3203,  2.3325,  2.7061,  ..., -1.1569, -1.5586,  2.4076],
          [-2.9917,  

In [164]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
set_seed(42)
generator("The White man worked as a", max_length=10, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The White man worked as a clerk and was a'}]

## Fine-tuning on new data

In [165]:
seinfeld = pd.read_csv("the-seinfeld-chronicles/scripts.csv")
seinfeld["Character"] = seinfeld["Character"].astype(str)
seinfeld["Dialogue"] = seinfeld["Dialogue"].astype(str)

In [166]:
seinfeld.head()

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0
1,1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0
2,2,GEORGE,Are you through?,1.0,S01E01,1.0
3,3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0
4,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0


In [167]:
feature_name = "Character_Dialogue"
seinfeld[feature_name] = seinfeld["Character"] + ": " + seinfeld["Dialogue"]
seinfeld.head(60)

Unnamed: 0.1,Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,Character_Dialogue
0,0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0,JERRY: Do you know what this is all about? Do ...
1,1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0,"JERRY: (pointing at Georges shirt) See, to me,..."
2,2,GEORGE,Are you through?,1.0,S01E01,1.0,GEORGE: Are you through?
3,3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0,"JERRY: You do of course try on, when you buy?"
4,4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0,"GEORGE: Yes, it was purple, I liked it, I dont..."
5,5,JERRY,"Oh, you dont recall?",1.0,S01E01,1.0,"JERRY: Oh, you dont recall?"
6,6,GEORGE,"(on an imaginary microphone) Uh, no, not at th...",1.0,S01E01,1.0,"GEORGE: (on an imaginary microphone) Uh, no, n..."
7,7,JERRY,"Well, senator, Id just like to know, what you ...",1.0,S01E01,1.0,"JERRY: Well, senator, Id just like to know, wh..."
8,8,CLAIRE,Mr. Seinfeld. Mr. Costanza.,1.0,S01E01,1.0,CLAIRE: Mr. Seinfeld. Mr. Costanza.
9,9,GEORGE,"Are, are you sure this is decaf? Wheres the or...",1.0,S01E01,1.0,"GEORGE: Are, are you sure this is decaf? Where..."


In [168]:
seinfeld_dataset = Dataset.from_pandas(seinfeld[["Character", "Dialogue"]])
seinfeld_dataset

Dataset({
    features: ['Character', 'Dialogue'],
    num_rows: 54616
})

In [169]:
len(seinfeld_dataset["Character"])

54616

In [170]:
len(seinfeld_dataset["Dialogue"])

54616

In [171]:
# set padding to False for now - when we train, we'll use dynamic padding...
tokenizer_args = {"padding": False, "truncation": True}
def tokenize_function(dataset):
    return tokenizer(
        dataset["Character"],
        dataset["Dialogue"],
        **tokenizer_args,
    )

In [172]:
seinfeld_tokenized = seinfeld_dataset.map(tokenize_function,
                                          batched=True,
                                          batch_size=128)

Map:   0%|          | 0/54616 [00:00<?, ? examples/s]

In [173]:
seinfeld_tokenized

Dataset({
    features: ['Character', 'Dialogue', 'input_ids', 'attention_mask'],
    num_rows: 54616
})

In [174]:
def split_dataset(
    dataset,
    train_size: float = 0.8,
    valid_size: float | None = 0.5,
    seed: int = 42,
) -> DatasetDict:
    """
    Split up dataset into train, validation, test sets for training / fine-tuning.

    Parameters
    ----------
    train_size : float, optional
        How to split the initial dataset into train, test/validation, by default 0.8.
    valid_size : float | None, optional
        How to split the remaining dataset after the first split, by default 0.5.
        For example, if the size of the dataset is N=100, and we have `train_size=0.8`,
        `valid_size=0.5`, the training set will have 80 samples, the validation set
        will have 10 samples and the test set will have 10 samples.
    seed : int, optional
        Seed for splitting, by default 42.

    Returns
    -------
    DatasetDict
        A dictionary of Datasets with training (`train`), validation (`valid`)
        (if `valid_size` is not None), and test (`test`) Datasets.
    """
    if valid_size is None:
        print(
            "[INFO] Splitting up dataset into train / test sets"
        )
    else:
        print(
            "[INFO] Splitting up dataset into train / validation / test sets"
        )

    # first split data into train set, test/valid set
    train_testvalid = dataset.train_test_split(
        train_size=train_size, seed=seed
    )
    if valid_size is not None:
        # further split the test set into a test, valid set
        test_valid = train_testvalid["test"].train_test_split(
            train_size=valid_size, seed=seed
        )
        # gather everyone if you want to have a single DatasetDict
        dataset_split = DatasetDict(
            {
                "train": train_testvalid["train"],
                "test": test_valid["test"],
                "validation": test_valid["train"],
            }
        )
    else:
        dataset_split = DatasetDict(
            {
                "train": train_testvalid["train"],
                "test": train_testvalid["test"],
                "validation": None,
            }
        )
    return dataset_split

In [183]:
split_tokenized_dataset = split_dataset(seinfeld_tokenized, 0.1)

[INFO] Splitting up dataset into train / validation / test sets


In [184]:
split_tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Character', 'Dialogue', 'input_ids', 'attention_mask'],
        num_rows: 5461
    })
    test: Dataset({
        features: ['Character', 'Dialogue', 'input_ids', 'attention_mask'],
        num_rows: 24578
    })
    validation: Dataset({
        features: ['Character', 'Dialogue', 'input_ids', 'attention_mask'],
        num_rows: 24577
    })
})

In [185]:
fine_tuned_model_name = "ds-summer-school-seinfeld"

In [186]:
training_args = TrainingArguments(output_dir=fine_tuned_model_name,
                                  evaluation_strategy="epoch",
                                  num_train_epochs=600,
                                  per_device_train_batch_size=128,
                                  disable_tqdm=False,
                                  save_strategy="steps",
                                  save_steps=10000,
                                  seed=seed)

In [187]:
data_collator_for_LM = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                       mlm=False)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=split_tokenized_dataset["train"],
                  eval_dataset=split_tokenized_dataset["validation"],
                  data_collator=data_collator_for_LM,
                  tokenizer=tokenizer)

In [188]:
trainer.train()

KeyboardInterrupt: 

In [None]:
trainer.save_model(model_name)

## Question answering

In [38]:
model_name = 'deepset/bert-base-cased-squad2'
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]