In [384]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
import torch

tokenizer = GPT2Tokenizer.from_pretrained("dpv/finetuned-gpt2-tiny")
model = GPT2LMHeadModel.from_pretrained("dpv/finetuned-gpt2-tiny")

In [385]:
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [386]:
text = "Once upon a time, a little ant colony"
encoded_input = tokenizer(text, return_tensors="pt")
# output_base = model(**encoded_input)
output = model.generate(
    **encoded_input, max_length=50, pad_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [387]:
output[0]

tensor([ 7454,  2402,   257,   640,    11,   257,  1310,  1885, 18815,   373,
         1043,   287,   262,  8222,    13,   383, 18815,   373,  1043,   284,
          307,   257,  1402,    11,   475,   845,  1588,    11, 18815,   286,
        27842,    13,   383, 27842,   547,   845,  1588,    11,   290,   484,
          547,   845,  1327,   284,  1494,    13,   383, 27842,   547,   845])

In [388]:
print(output_text)

Once upon a time, a little ant colony was found in the forest. The colony was found to be a small, but very large, colony of ants. The ants were very large, and they were very hard to kill. The ants were very


In [175]:
dataset = load_dataset("roneneldan/TinyStories", split="train")

Downloading readme: 100%|██████████| 946/946 [00:00<00:00, 1.46MB/s]
Using custom data configuration roneneldan--TinyStories-6ac769f186d7da53


Downloading and preparing dataset parquet/roneneldan--TinyStories to /Users/fabio/.cache/huggingface/datasets/roneneldan___parquet/roneneldan--TinyStories-6ac769f186d7da53/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 9.99M/9.99M [00:00<00:00, 33.4MB/s]
Downloading data: 100%|██████████| 249M/249M [00:04<00:00, 60.1MB/s]]
Downloading data: 100%|██████████| 248M/248M [00:03<00:00, 66.0MB/s]
Downloading data: 100%|██████████| 246M/246M [00:04<00:00, 58.5MB/s]
Downloading data: 100%|██████████| 248M/248M [00:04<00:00, 54.8MB/s]
Downloading data files: 100%|██████████| 2/2 [00:21<00:00, 10.83s/it]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 235.24it/s]
                                  

Dataset parquet downloaded and prepared to /Users/fabio/.cache/huggingface/datasets/roneneldan___parquet/roneneldan--TinyStories-6ac769f186d7da53/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


In [184]:
# shuffle the dataset
dataset_shuffled = dataset.shuffle()

# select one sample
sample = dataset_shuffled.select(range(1))

In [444]:
text = sample["text"][0][:100]
encoded_input = tokenizer(text, return_tensors="pt")
# output_base = model(**encoded_input)
output = model.generate(
    **encoded_input, max_length=50, pad_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [447]:
output[0][26:]

tensor([  318,  5762,   257,  2042, 10147,   290,  2042, 12581,    13,   679,
          318,  5762,   257,  2042,  6877,   290,  2042, 12581,    13,   679,
          318,  5762,   257,  2042])

In [424]:
tokenizer.padding_side = "right"
encoded_input = tokenizer(text, return_tensors="pt")
tokenizer.decode(model(**encoded_input).logits.argmax(dim=2)[0])

". I are to play with the same,\n're the lot tree and the other and They's a and white, has is"

In [425]:
output = model(**encoded_input)

In [426]:
next_token_logits = output.logits[:, -1, :]

In [427]:
torch.argmax(next_token_logits, dim=-1)

tensor([318])

In [402]:
output

tensor([[14967,   290, 32189,   588,   284,   711,   287,   262,  3952,    13,
          1119,   766,   257,  1263,  3430,   319,   262,  2323,    13,   632,
           318,  7586,   290,   890,   290,   339,   318,  5762,   257,  2042,
         10147,   290,  2042, 12581,    13,   679,   318,  5762,   257,  2042,
          6877,   290,  2042, 12581,    13,   679,   318,  5762,   257,  2042]])

In [400]:
encoded_input["input_ids"]

tensor([[14967,   290, 32189,   588,   284,   711,   287,   262,  3952,    13,
          1119,   766,   257,  1263,  3430,   319,   262,  2323,    13,   632,
           318,  7586,   290,   890,   290,   339]])

In [382]:
tokenizer.decode(
    tokenizer(sample["text"], return_tensors="pt")["input_ids"][0],
    skip_special_tokens=True,
)

'Tim and Mia like to play in the park. They see a big club on the ground. It is brown and long and heavy.\n\n"Look, a club!" Tim says. "I can lift it!"\n\nHe tries to lift the club, but it is too tough. He falls down and drops the club.\n\n"Ouch!" he says. "That hurt!"\n\nMia laughs. She is not mean, she just thinks it is funny.\n\n"Let me try!" she says. "I can balance it!"\n\nShe picks up the club and puts it on her head. She walks slowly and carefully. She does not fall down.\n\n"Wow!" Tim says. "You are good at balancing!"\n\n"Thank you!" Mia says. "It is fun!"\n\nThey take turns balancing the club on their heads, arms, and legs. They have a lot of fun with the club. They are happy and proud. They are good friends.'

In [198]:
tokenizer(sample["text"], return_tensors="pt")

{'input_ids': tensor([[14967,   290, 32189,   588,   284,   711,   287,   262,  3952,    13,
          1119,   766,   257,  1263,  3430,   319,   262,  2323,    13,   632,
           318,  7586,   290,   890,   290,  4334,    13,   198,   198,     1,
          8567,    11,   257,  3430,  2474,  5045,  1139,    13,   366,    40,
           460, 10303,   340,  2474,   198,   198,  1544,  8404,   284, 10303,
           262,  3430,    11,   475,   340,   318,  1165,  5802,    13,   679,
          8953,   866,   290, 10532,   262,  3430,    13,   198,   198,     1,
            46,   794,  2474,   339,  1139,    13,   366,  2504,  5938,  2474,
           198,   198,    44,   544, 22051,    13,  1375,   318,   407,  1612,
            11,   673,   655,  6834,   340,   318,  8258,    13,   198,   198,
             1,  5756,   502,  1949,  2474,   673,  1139,    13,   366,    40,
           460,  5236,   340,  2474,   198,   198,  3347, 11103,   510,   262,
          3430,   290,  7584,   340,  

In [190]:
sample["text"][0][:100]

'Tim and Mia like to play in the park. They see a big club on the ground. It is brown and long and he'

In [191]:
tokenizer(sample["text"][0][:100], return_tensors="pt")

{'input_ids': tensor([[14967,   290, 32189,   588,   284,   711,   287,   262,  3952,    13,
          1119,   766,   257,  1263,  3430,   319,   262,  2323,    13,   632,
           318,  7586,   290,   890,   290,   339]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}

In [240]:
output = model(encoded_input["input_ids"], labels=encoded_input["input_ids"])

In [241]:
# output.loss.backward()
for param in model.named_parameters():
    print(param[0], param[1].grad)

transformer.wte.weight tensor([[ 7.0591e-06,  3.5879e-06, -6.6299e-05,  ..., -1.7504e-05,
         -2.0686e-05, -4.4902e-05],
        [ 1.3535e-05,  2.3487e-06, -7.2210e-05,  ..., -2.7416e-05,
         -1.3930e-05, -3.4621e-05],
        [ 1.3506e-07,  1.9443e-07, -1.4108e-06,  ..., -5.4301e-07,
         -4.0270e-07, -4.6412e-07],
        ...,
        [ 6.5042e-08,  1.4908e-08, -5.5142e-08,  ...,  1.7429e-08,
          5.3899e-09, -1.3369e-07],
        [ 3.8537e-07, -4.9245e-08, -1.4201e-06,  ...,  4.5042e-08,
         -3.3162e-07, -5.0026e-07],
        [ 5.3926e-06,  3.7411e-06, -2.3852e-05,  ..., -1.6649e-05,
         -2.3811e-06, -1.8391e-05]])
transformer.wpe.weight tensor([[-0.1352, -0.1679,  0.1112,  ..., -0.1684,  0.2953, -0.1492],
        [ 0.0695, -0.1063, -0.0924,  ...,  0.0700, -0.1334, -0.0577],
        [ 0.0036, -0.0218,  0.0102,  ...,  0.0029, -0.0102,  0.0450],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0

In [219]:
[param[0] for param in model.named_parameters() if "mlp" in param[0]]

['transformer.h.0.mlp.c_fc.weight',
 'transformer.h.0.mlp.c_fc.bias',
 'transformer.h.0.mlp.c_proj.weight',
 'transformer.h.0.mlp.c_proj.bias',
 'transformer.h.1.mlp.c_fc.weight',
 'transformer.h.1.mlp.c_fc.bias',
 'transformer.h.1.mlp.c_proj.weight',
 'transformer.h.1.mlp.c_proj.bias',
 'transformer.h.2.mlp.c_fc.weight',
 'transformer.h.2.mlp.c_fc.bias',
 'transformer.h.2.mlp.c_proj.weight',
 'transformer.h.2.mlp.c_proj.bias',
 'transformer.h.3.mlp.c_fc.weight',
 'transformer.h.3.mlp.c_fc.bias',
 'transformer.h.3.mlp.c_proj.weight',
 'transformer.h.3.mlp.c_proj.bias',
 'transformer.h.4.mlp.c_fc.weight',
 'transformer.h.4.mlp.c_fc.bias',
 'transformer.h.4.mlp.c_proj.weight',
 'transformer.h.4.mlp.c_proj.bias',
 'transformer.h.5.mlp.c_fc.weight',
 'transformer.h.5.mlp.c_fc.bias',
 'transformer.h.5.mlp.c_proj.weight',
 'transformer.h.5.mlp.c_proj.bias',
 'transformer.h.6.mlp.c_fc.weight',
 'transformer.h.6.mlp.c_fc.bias',
 'transformer.h.6.mlp.c_proj.weight',
 'transformer.h.6.mlp.c_proj

In [226]:
mlp_model_size = sum(t.numel() for name, t in model.named_parameters() if "mlp" in name)
print(f"Mlp parameters size: {mlp_model_size/1000**2:.1f}M parameters")

Mlp parameters size: 56.7M parameters


In [257]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=20,
        return_overflowing_tokens=True,
        return_length=True,
    )

    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == 20:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [258]:
tokenized_datasets = sample.map(tokenize, batched=True)
tokenized_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]


Dataset({
    features: ['text', 'input_ids'],
    num_rows: 1
})

In [259]:
tokenized_datasets["input_ids"]

[[14967,
  290,
  32189,
  588,
  284,
  711,
  287,
  262,
  3952,
  13,
  1119,
  766,
  257,
  1263,
  3430,
  319,
  262,
  2323,
  13,
  632]]