https://www.bdmi.org/Book-Reading/Charlie-and-the-Chocolate-Factory.pdf

## 1. Load the data

In [1]:
book_content = [
"1 Here Comes Charlie",
"2 Mr Willy Wonka’s Factory",
"3 Mr Wonka and the Indian Prince",
"4 The Secret Workers",
"5 The Golden Tickets",
"6 The First Two Finders",
"7 Charlie’s Birthday",
"8 Two More Golden Tickets Found",
"9 Grandpa Joe Takes a Gamble",
"10 The Family Begins to Starve",
"11 The Miracle",
"12 What It Said on the Golden Ticket",
"13 The Big Day Arrives",
"14 Mr Willy Wonka",
"15 The Chocolate Room",
"16 The Oompa-Loompas",
"17 Augustus Gloop Goes up the Pipe",
"18 Down the Chocolate River",
"19 The Inventing Room – Everlasting Gobstoppers and Hair Toffee",
"20 The Great Gum Machine",
"21 Good-bye Violet",
"22 Along the Corridor",
"23 Square Sweets That Look Round",
"24 Veruca in the Nut Room",
"25 The Great Glass Lift",
"26 The Television-Chocolate Room",
"27 Mike Teavee is Sent by Television",
"28 Only Charlie Left",
"29 The Other Children Go Home",
"30 Charlie’s Chocolate Factory",
]

https://stackoverflow.com/questions/3277503/how-to-read-a-file-line-by-line-into-a-list

In [2]:
book_chapters = [str(chapter) for chapter in range(31)]

In [3]:
with open("Charlie and the Chocolate Factory.txt", encoding="utf8") as file:
    book_lines = [line.rstrip('\n') for line in file if line.rstrip('\n') not in book_chapters]

In [4]:
book_lines = []
skip_line = False

with open("Charlie and the Chocolate Factory.txt", encoding="utf8") as file:
    for line in file:
        if skip_line:
            skip_line = False
            continue
        
        line = line.rstrip('\n')
        if line in book_chapters:
            skip_line = True
            continue
        
        book_lines.append(line)

In [5]:
book_lines[:15]

['These two very old people are the father and mother of Mr Bucket.',
 'Their names are Grandpa Joe and Grandma Josephine.',
 'And these two very old people are the father and mother of Mrs',
 'Bucket. Their names are Grandpa George and Grandma Georgina.',
 'This is Mr Bucket. This is Mrs Bucket.',
 'Mr and Mrs Bucket have a small boy whose name is Charlie Bucket.',
 'This is Charlie.',
 'How d’you do? And how d’you do? And how d’you do again? He is',
 'pleased to meet you.',
 'The whole of this family – the six grown-ups (count them) and little',
 'Charlie Bucket – live together in a small wooden house on the edge of a',
 'great town.',
 'The house wasn’t nearly large enough for so many people, and life',
 'was extremely uncomfortable for them all. There were only two rooms',
 'in the place altogether, and there was only one bed. The bed was given']

## 2. Preprocessing

### Combining some sentences

In [6]:
import random

In [7]:
df = []
lines = []
current = ""
combine_line_countdown = random.randint(1, 7) # Combine 'combine_line_countdown' number of sentences

for line in book_lines:
    current += " " + line
    combine_line_countdown -= 1
    
    if combine_line_countdown == 0:
        combine_line_countdown = random.randint(1, 3)
        df.append(current.strip())
        current = ""

In [8]:
df[:5]

['These two very old people are the father and mother of Mr Bucket. Their names are Grandpa Joe and Grandma Josephine. And these two very old people are the father and mother of Mrs Bucket. Their names are Grandpa George and Grandma Georgina. This is Mr Bucket. This is Mrs Bucket.',
 'Mr and Mrs Bucket have a small boy whose name is Charlie Bucket.',
 'This is Charlie. How d’you do? And how d’you do? And how d’you do again? He is pleased to meet you.',
 'The whole of this family – the six grown-ups (count them) and little',
 'Charlie Bucket – live together in a small wooden house on the edge of a']

In [9]:
df_indices = [i for i in range(len(df))]
random.shuffle(df_indices)

val_split_index = int(0.2 * len(df_indices))
train_idx       = df_indices[val_split_index:]
val_idx         = df_indices[:val_split_index]

def add_text_index(df, indices):
    text_list = []
    for idx in indices:
        text_list.append(df[idx])
    
    return text_list

# storing text in df_train for indices:
df_train = add_text_index(df, train_idx)

# storing text in df_val for indices:
df_val = add_text_index(df, val_idx)

In [10]:
assert len(df_train) + len(df_val) == len(df)

In [11]:
len(df_train), len(df_val)

(1225, 306)

### Convert 'df_train' and 'df_val' into Dataset type

In [12]:
from datasets import Dataset, DatasetDict

In [13]:
my_list = [{"text": text} for text in df_train]
df_train = Dataset.from_list(my_list)

my_list = [{"text": text} for text in df_val]
df_val  = Dataset.from_list(my_list)

new_df = DatasetDict({
    "train": df_train,
    "validation": df_val
})

In [14]:
new_df

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1225
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 306
    })
})

### Tokenization

In [15]:
from transformers import AutoTokenizer

checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [16]:
outputs = tokenizer(
    new_df["train"][:2]["text"],
    truncation=True,
    max_length=10,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 12
Input chunk lengths: [10, 10, 10, 10, 10, 5, 10, 10, 10, 10, 10, 2]
Chunk mapping: [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]


In [17]:
print("---------------------- Sentence ----------------------")
print(new_df["train"][0]["text"], end="\n\n")

print("--------------------- Tokenization --------------------")
print(tokenizer(new_df["train"][0]["text"]))

---------------------- Sentence ----------------------
‘But… but… but…’ shrieked Mrs Salt, ‘where does the great big pipe go to in the end?’ ‘Why, to the furnace, of course,’ Mr Wonka said calmly. ‘To the

--------------------- Tokenization --------------------
{'input_ids': [447, 246, 1537, 1399, 475, 1399, 475, 1399, 447, 247, 35064, 988, 276, 9074, 13754, 11, 564, 246, 3003, 857, 262, 1049, 1263, 12656, 467, 284, 287, 262, 886, 30, 447, 247, 564, 246, 5195, 11, 284, 262, 42227, 11, 286, 1781, 11, 447, 247, 1770, 23306, 4914, 531, 30180, 13, 564, 246, 2514, 262], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [18]:
context_length = 10

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = new_df.map(
    tokenize, batched=True, remove_columns=new_df["train"].column_names
)
tokenized_datasets

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3287
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 762
    })
})

## 3. Model

In [19]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [20]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [21]:
next(model.parameters()).is_cuda

False

### Keyword

In [22]:

keytoken_ids = []
for keyword in [
    "Charlie",
    "Wonka",
    "chocolate",
    "Chocolate",
    "Tickets",
    "factory",
    "Golden",
    "Joe",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

Keyword has not single token: Wonka
Keyword has not single token: chocolate
Keyword has not single token: Chocolate
Keyword has not single token: factory


### Loss

In [23]:
from torch.nn import CrossEntropyLoss
import torch

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False) #change to reduction=None
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

### Dataloaders

In [24]:
from torch.utils.data.dataloader import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader  = DataLoader(tokenized_datasets["validation"], batch_size=32)

### Optimizer

In [25]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [26]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
            outputs.loss = outputs.loss.reshape(1)
        losses.append(accelerator.gather(outputs.loss))        
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [27]:
model = GPT2LMHeadModel(config)

In [28]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

### Accelerator

In [29]:
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision='fp16')

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [30]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

### Repository

In [31]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
## Create Repository
# from huggingface_hub import create_repo

# create_repo("aal2015/Charlie-and-the-Chocolate_Factory-LM-mode")

In [33]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "Charlie-and-the-Chocolate_Factory-LM-mode"
repo_name = get_full_repo_name(model_name)
repo_name

'aal2015/Charlie-and-the-Chocolate_Factory-LM-mode'

In [34]:
import os

In [35]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

output_dir = "Charlie-and-the-Chocolate_Factory-LM-mode"
repo = Repository(output_dir, clone_from=repo_name)

C:\Users\abhin\OneDrive\Documents\AIT Thailand\NLP\04-Huggingface\Charlie-and-the-Chocolate_Factory-LM-mode is already a clone of https://huggingface.co/aal2015/Charlie-and-the-Chocolate_Factory-LM-mode. Make sure you pull the latest changes with `repo.git_pull()`.


## 5. Training

In [36]:
evaluate()

(11.048202514648438, 62830.9140625)

In [37]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 2

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    print("---------------------- Epoch:" + str(epoch) + "----------------------")
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 1 == 0:
            accelerator.print(
                {
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

---------------------- Epoch:0----------------------


  0%|          | 0/103 [00:00<?, ?it/s]

{'steps': 0, 'loss/train': 93.81758880615234}
{'steps': 0, 'loss/train': 88.20152282714844}




{'steps': 0, 'loss/train': 88.10628509521484}
{'steps': 0, 'loss/train': 88.19737243652344}
{'steps': 0, 'loss/train': 88.0042724609375}
{'steps': 0, 'loss/train': 88.34061431884766}
{'steps': 0, 'loss/train': 91.35808563232422}
{'steps': 0, 'loss/train': 88.39398193359375}
{'steps': 1, 'loss/train': 88.24712371826172}
{'steps': 1, 'loss/train': 90.87971496582031}
{'steps': 1, 'loss/train': 87.82565307617188}
{'steps': 1, 'loss/train': 88.11347961425781}
{'steps': 1, 'loss/train': 87.80493927001953}
{'steps': 1, 'loss/train': 88.02610778808594}
{'steps': 1, 'loss/train': 88.13383483886719}
{'steps': 1, 'loss/train': 91.17115783691406}
{'loss/eval': 11.013158798217773, 'perplexity': 60667.21875}


Several commits (10) will be pushed upstream.


{'steps': 2, 'loss/train': 87.94093322753906}
{'steps': 2, 'loss/train': 87.9168701171875}
{'steps': 2, 'loss/train': 90.57923889160156}
{'steps': 2, 'loss/train': 92.84661865234375}
{'steps': 2, 'loss/train': 90.68077087402344}
{'steps': 2, 'loss/train': 88.14566802978516}
{'steps': 2, 'loss/train': 88.23249053955078}
{'steps': 2, 'loss/train': 87.72026062011719}
{'steps': 3, 'loss/train': 87.86695861816406}
{'steps': 3, 'loss/train': 87.18161010742188}
{'steps': 3, 'loss/train': 87.67507934570312}
{'steps': 3, 'loss/train': 90.54136657714844}
{'steps': 3, 'loss/train': 87.14336395263672}
{'steps': 3, 'loss/train': 87.50393676757812}
{'steps': 3, 'loss/train': 87.45518493652344}
{'steps': 3, 'loss/train': 87.66889953613281}
{'loss/eval': 10.843338012695312, 'perplexity': 51191.97265625}


Several commits (11) will be pushed upstream.


{'steps': 4, 'loss/train': 86.79505920410156}
{'steps': 4, 'loss/train': 86.83560180664062}
{'steps': 4, 'loss/train': 86.23213195800781}
{'steps': 4, 'loss/train': 87.47259521484375}
{'steps': 4, 'loss/train': 86.74269104003906}
{'steps': 4, 'loss/train': 87.13511657714844}
{'steps': 4, 'loss/train': 86.96968841552734}
{'steps': 4, 'loss/train': 89.91542053222656}
{'steps': 5, 'loss/train': 88.60795593261719}
{'steps': 5, 'loss/train': 88.83068084716797}
{'steps': 5, 'loss/train': 88.45805358886719}
{'steps': 5, 'loss/train': 85.93077850341797}
{'steps': 5, 'loss/train': 88.49138641357422}
{'steps': 5, 'loss/train': 85.8282470703125}
{'steps': 5, 'loss/train': 87.94056701660156}
{'steps': 5, 'loss/train': 85.77117919921875}
{'loss/eval': 10.566911697387695, 'perplexity': 38828.57421875}


Several commits (12) will be pushed upstream.


{'steps': 6, 'loss/train': 87.84773254394531}
{'steps': 6, 'loss/train': 84.50074768066406}
{'steps': 6, 'loss/train': 85.05963897705078}
{'steps': 6, 'loss/train': 85.04985046386719}
{'steps': 6, 'loss/train': 84.66012573242188}
{'steps': 6, 'loss/train': 87.82927703857422}
{'steps': 6, 'loss/train': 85.00859069824219}
{'steps': 6, 'loss/train': 84.30796813964844}
{'steps': 7, 'loss/train': 86.50621032714844}
{'steps': 7, 'loss/train': 83.3243637084961}
{'steps': 7, 'loss/train': 83.60151672363281}
{'steps': 7, 'loss/train': 83.17100524902344}
{'steps': 7, 'loss/train': 83.08995056152344}
{'steps': 7, 'loss/train': 84.00460815429688}
{'steps': 7, 'loss/train': 86.28923797607422}
{'steps': 7, 'loss/train': 84.02320861816406}
{'loss/eval': 10.269451141357422, 'perplexity': 28838.0546875}


Several commits (13) will be pushed upstream.


{'steps': 8, 'loss/train': 83.6007080078125}
{'steps': 8, 'loss/train': 82.65158081054688}
{'steps': 8, 'loss/train': 82.36135864257812}
{'steps': 8, 'loss/train': 83.10652160644531}
{'steps': 8, 'loss/train': 82.63813018798828}
{'steps': 8, 'loss/train': 83.10719299316406}
{'steps': 8, 'loss/train': 82.41522979736328}
{'steps': 8, 'loss/train': 82.81314086914062}
{'steps': 9, 'loss/train': 80.84086608886719}
{'steps': 9, 'loss/train': 84.29546356201172}
{'steps': 9, 'loss/train': 80.80013275146484}
{'steps': 9, 'loss/train': 82.85252380371094}
{'steps': 9, 'loss/train': 81.59756469726562}
{'steps': 9, 'loss/train': 81.68305969238281}
{'steps': 9, 'loss/train': 81.97419738769531}
{'steps': 9, 'loss/train': 81.56845092773438}
{'loss/eval': 10.02204418182373, 'perplexity': 22517.412109375}


Several commits (14) will be pushed upstream.


{'steps': 10, 'loss/train': 80.42759704589844}
{'steps': 10, 'loss/train': 80.64562225341797}
{'steps': 10, 'loss/train': 80.36273193359375}
{'steps': 10, 'loss/train': 80.22191619873047}
{'steps': 10, 'loss/train': 80.57587432861328}
{'steps': 10, 'loss/train': 78.68746948242188}
{'steps': 10, 'loss/train': 80.6020278930664}
{'steps': 10, 'loss/train': 79.41481018066406}
{'steps': 11, 'loss/train': 81.92530822753906}
{'steps': 11, 'loss/train': 79.90909576416016}
{'steps': 11, 'loss/train': 78.67170715332031}
{'steps': 11, 'loss/train': 81.94192504882812}
{'steps': 11, 'loss/train': 80.15055847167969}
{'steps': 11, 'loss/train': 80.26918029785156}
{'steps': 11, 'loss/train': 79.3121337890625}
{'steps': 11, 'loss/train': 79.34199523925781}
{'loss/eval': 9.82398796081543, 'perplexity': 18471.568359375}


Several commits (15) will be pushed upstream.


{'steps': 12, 'loss/train': 77.95243072509766}
{'steps': 12, 'loss/train': 82.05929565429688}
{'steps': 12, 'loss/train': 78.21273040771484}
{'steps': 12, 'loss/train': 77.85730743408203}
{'steps': 12, 'loss/train': 78.75857543945312}
{'steps': 12, 'loss/train': 77.70838928222656}
{'steps': 12, 'loss/train': 80.16534423828125}


## 6. Inference

In [39]:
import torch
from transformers import pipeline

pipe = pipeline("text-generation", max_length=100, pad_token_id=0, eos_token_id=0, model="aal2015/Charlie-and-the-Chocolate_Factory-LM-mode")

ValueError: Could not load model aal2015/Charlie-and-the-Chocolate_Factory-LM-mode with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForCausalLM'>, <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>, <class 'transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel'>).