In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import transformers
print(transformers.__version__)

4.11.3


In [4]:
from transformers import AutoTokenizer

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from pathlib import Path

In [7]:
model_checkpoint = "distilgpt2"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [9]:
tokenizer.add_special_tokens({'pad_token': '[SEP]'})

1

In [10]:
DATA_DIR = "../../data/recipes-combined/individual/"

In [11]:
def read_recipes(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for text_file in (split_dir).iterdir():
        texts.append(text_file.read_text())

    return texts

recipes = read_recipes(DATA_DIR)

In [12]:
train_data, test_data = train_test_split(recipes, test_size=.1)

In [13]:
train_data, val_data = train_test_split(train_data, test_size=.1)

In [14]:
padding = False
max_length=1024
train_encodings = tokenizer(train_data, truncation=True, padding=padding, max_length=max_length)
val_encodings = tokenizer(val_data, truncation=True, padding=padding, max_length=max_length)
test_encodings = tokenizer(test_data, truncation=True, padding=padding, max_length=max_length)

In [15]:
list(val_encodings.keys())

['input_ids', 'attention_mask']

In [16]:
# block_size = tokenizer.model_max_length
block_size = 512

In [17]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [18]:
class RecipeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        self.labels = encodings['input_ids'].copy()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = RecipeDataset(group_texts(train_encodings))
val_dataset = RecipeDataset(group_texts(val_encodings))
test_dataset = RecipeDataset(group_texts(test_encodings))

In [19]:
len(train_dataset), len(val_dataset), len(test_dataset)

(1370, 151, 168)

In [20]:
print(tokenizer.decode(train_dataset[0]["input_ids"]))


Black Friday Bread
    
    Black Friday — the day after Thanksgiving — is equal parts shopping and leftovers. Turkey sandwiches are a must; but what do you do with all those leftover bits of stuffing, mashed potatoes, squash, creamed onions... Well, kill two birds with one stone: make a delicious sandwich loaf, AND use those other leftovers from the Turkey Day meal right in the bread itself. This moist, flavorful bread slices beautifully; and when you use stuffing as one of the ingredients, its mild herb flavor is perfect for a turkey sandwich.
    
       113g King Arthur White Whole Wheat Flour 298g King Arthur Unbleached Bread Flour 28g soft butter 1 to 1 1/4 teaspoons salt, to taste 14g sugar 2 1/2 teaspoons instant yeast 152g lukewarm milk 128g prepared stuffing 213g mashed potatoes, white or sweet 
    
     Directions   Place all of the ingredients in a bowl (or the bowl of your stand mixer; or a bread machine bucket); and mix and knead to make a smooth, elastic, and somewhat s

In [21]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [22]:
model_checkpoint

'distilgpt2'

In [23]:
from transformers import Trainer, TrainingArguments

model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-recipes",
    num_train_epochs=10.,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,    
    # push_to_hub=True,
    # fp16 didn't lower memory usage in a meaningful way, i guess because
    # batch size is already a small % of memory, and also slowed down training
    fp16=True, 
    gradient_accumulation_steps=16,
)

In [24]:
torch.cuda.empty_cache()

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Using amp fp16 backend


In [26]:
torch.cuda.empty_cache()

In [27]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [28]:
trainer.train()

***** Running training *****
  Num examples = 1370
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 850


Epoch,Training Loss,Validation Loss
0,No log,2.721223
1,No log,2.562458
2,No log,2.484214
3,No log,2.435942
4,No log,2.39994
5,2.727200,2.37581
6,2.727200,2.358845
7,2.727200,2.348522
8,2.727200,2.340244
9,2.727200,2.338228


  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
Saving model checkpoint to distilgpt2-finetuned-recipes/checkpoint-500
Configuration saved in distilgpt2-finetuned-recipes/checkpoint-500/config.json
Model weights saved in distilgpt2-finetuned-recipes/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1
***** Running Evaluation *****
  Num examples = 151
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=850, training_loss=2.6216669060202205, metrics={'train_runtime': 2290.2138, 'train_samples_per_second': 5.982, 'train_steps_per_second': 0.371, 'total_flos': 1788576257802240.0, 'train_loss': 2.6216669060202205, 'epoch': 9.99})

In [29]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 151
  Batch size = 1


Perplexity: 10.36


In [30]:
model_checkpoint_local = 'recipes_model'

In [31]:
model.save_pretrained(model_checkpoint_local)

Configuration saved in recipes_model/config.json
Model weights saved in recipes_model/pytorch_model.bin


# Try to load back the model to see if that works

In [32]:
model2 = AutoModelForCausalLM.from_pretrained(model_checkpoint_local)

loading configuration file recipes_model/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "use_cache": true,
  "vocab_size": 5

# Try to generate text 

In [39]:
prompt = "Strawberry Bread Machine Loaf"

In [None]:
inputs = tokenizer(prompt, add_special_tokens=True, return_tensors="pt")["input_ids"]

prompt_length = len(tokenizer.decode(inputs[0]))
outputs = model2.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60, temperature=1.)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(generated)