In [1]:
import math
import os
from torch.utils.data import Dataset
import h5py
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
    set_seed,
)

In [3]:
# GPU selection and CUDA debugging switches for this run.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "1",  # expose only the device with index 1
    "CUDA_LAUNCH_BLOCKING": "1",  # optional: kept on to debug the CUDA error
})

In [4]:
class H5Dataset(Dataset):
    """Map-style dataset backed by one named array inside an HDF5 cache file.

    The requested array is copied into memory in ``__init__`` (``[:]``), so the
    HDF5 file is already closed by the time items are fetched.
    """

    def __init__(self, tokenizer, file_path='train_temp', block_size=512,
                 cached_features_file="data_temp.h5"):
        """Load the array stored under ``file_path`` from the HDF5 cache.

        Args:
            tokenizer: Not used by this class; accepted so existing callers
                that pass it keep working.
            file_path: Key of the array inside the HDF5 file. Callers in this
                notebook use 'train_temp' and 'test_temp'; per the original
                author's note, 'test_temp' is a dev set (30% of a test set).
            block_size: Not used by this class; kept for interface
                compatibility.
            cached_features_file: Path of the HDF5 file to read. Defaults to
                the previously hard-coded "data_temp.h5".
        """
        # Format the message before printing; passing a tuple to print() used
        # to emit the raw tuple with an unsubstituted '%s'.
        print("Loading features from cached file %s" % cached_features_file)
        with h5py.File(cached_features_file, 'r') as f:
            # Both splits are read the same way, so no per-split branching.
            self.samples = f[file_path][:]

    def __len__(self):
        """Return the number of samples loaded from the cache."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return sample ``item`` converted to a ``torch.Tensor``."""
        return torch.tensor(self.samples[item])

def get_dataset(tokenizer, evaluate=False, local_rank=-1):
    """Build the ``H5Dataset`` for the requested split.

    Args:
        tokenizer: Forwarded unchanged to ``H5Dataset``.
        evaluate: When truthy, load the "test_temp" array; otherwise load
            "train_temp".
        local_rank: Accepted but not used by this function.

    Returns:
        The ``H5Dataset`` for the selected split.
    """
    split_key = "train_temp"
    if evaluate:
        split_key = "test_temp"
    return H5Dataset(tokenizer=tokenizer, file_path=split_key)

# Fix the random seed (via transformers' set_seed helper) so repeated runs of
# this notebook are comparable.
set_seed(20)

In [5]:
# Fetch the stock GPT-2 configuration; downloads are cached under ./cache.
config = AutoConfig.from_pretrained("gpt2", cache_dir="cache")

In [6]:
# Matching GPT-2 tokenizer, using the same local cache directory as the config.
tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir="cache")

In [7]:
# GPT-2 is a causal language model, so load it through AutoModelForCausalLM;
# the previously used AutoModelWithLMHead class is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    'gpt2',             # model name
    config=config,
    cache_dir='cache',  # cache directory (path to the cache directory)
)



In [8]:
# Markup tokens that delimit the sections of a serialized recipe. The order is
# kept exactly as before, since the tokenizer is given this list as-is.
special_tokens = {
    "additional_special_tokens": [
        # whole-recipe opener
        '<RECIPE_START>',
        # input section
        '<INPUT_START>',
        '<NEXT_INPUT>',
        '<INPUT_END>',
        # ingredient section
        '<INGR_START>',
        '<NEXT_INGR>',
        '<INGR_END>',
        # instruction section
        '<INSTR_START>',
        '<NEXT_INSTR>',
        '<INSTR_END>',
        # title section and whole-recipe closer
        '<TITLE_START>',
        '<TITLE_END>',
        '<RECIPE_END>',
    ],
}

In [9]:
# Register the recipe markup tokens with the tokenizer, then resize the model's
# token embeddings to the new vocabulary size. The second call is left as the
# cell's last expression so the resulting embedding layer is displayed.
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(50270, 768)

In [10]:
# Load both splits from the HDF5 cache: training data and the held-out
# evaluation data.
train_dataset = get_dataset(tokenizer=tokenizer)
eval_dataset = get_dataset(tokenizer=tokenizer, evaluate=True)

('Loading features from cached file %s', 'data_temp.h5')
('Loading features from cached file %s', 'data_temp.h5')


In [11]:
# Collator for causal language modelling: mlm=False turns off masked-LM
# masking (mlm_probability is presumably ignored in that mode, but is passed
# through unchanged).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15,
)

# Hyper-parameters for the single-epoch fine-tuning run.
training_args = TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=1,
    # 2 samples per device x 16 accumulation steps = 32 samples per optimizer
    # step on one device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    # Evaluate at a fixed step interval during training.
    evaluation_strategy="steps",
    # Mixed-precision training.
    fp16=True,
    fp16_opt_level='O1',
    # A step count, so pass an int (this used to be the float literal 1e2).
    warmup_steps=100,
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    # Keep only the most recent checkpoint on disk, and reload the best one
    # (by evaluation loss) once training finishes.
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [12]:
# Wire the model, hyper-parameters, both dataset splits and the collator into
# a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

Using amp half precision backend


In [13]:
# Persist the tokenizer (with the added special tokens) before training starts.
# Note that it is written to './outputs/tempt', which is a different directory
# from the './outputs' directory the model is saved to below.
tokenizer.save_pretrained('./outputs/tempt')
# Run the fine-tuning loop, then save the resulting model with the Trainer
# (no path is given, so the Trainer chooses the destination).
trainer.train()
trainer.save_model()

tokenizer config file saved in ./outputs/tempt/tokenizer_config.json
Special tokens file saved in ./outputs/tempt/special_tokens_map.json
***** Running training *****
  Num examples = 35762
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 1117


Step,Training Loss,Validation Loss
500,3.0908,1.551185
1000,1.5547,1.466045


***** Running Evaluation *****
  Num examples = 1513
  Batch size = 2
Saving model checkpoint to ./outputs/checkpoint-500
Configuration saved in ./outputs/checkpoint-500/config.json
Model weights saved in ./outputs/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1513
  Batch size = 2
Saving model checkpoint to ./outputs/checkpoint-1000
Configuration saved in ./outputs/checkpoint-1000/config.json
Model weights saved in ./outputs/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./outputs/checkpoint-1000 (score: 1.4660452604293823).
Saving model checkpoint to ./outputs
Configuration saved in ./outputs/config.json
Model weights saved in ./outputs/pytorch_model.bin
