**Downloading, Installing & Importing Required Libraries**

In [1]:
import os
import h5py
import math
import torch
from torch.utils.data import Dataset

In [2]:
!pip install transformers
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [3]:
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
    TrainerCallback
)
import accelerate

**Mounting Google Drive for importing the Data Files which will be used in the Tokenization**

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used later live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# NOTE(review): Drive is already mounted by the previous cell; this forced remount looks
# redundant unless the Drive contents changed in between — confirm it is still needed.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


**Selecting the GPU to Train the Model**

In [6]:
# Expose only GPU 0 to CUDA and leave kernel launches non-blocking ("0").
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "CUDA_LAUNCH_BLOCKING": "0",
})

**Defining the PyTorch-Compatible Dataset Class**

In [7]:
class H5Dataset(Dataset):
    """Map-style dataset backed by one array stored in an HDF5 cache file.

    The selected array is read fully into memory when the dataset is created,
    and every item is handed out as a ``torch`` tensor.
    """

    def __init__(self, tokenizer, file_path='/content/drive/MyDrive/BTP_Dev/Dataset/train_temp', block_size=512,
                 cached_features_file="/content/drive/MyDrive/BTP_Dev/Dataset/data_temp.h5"):
        """Load the array stored under ``file_path`` from the HDF5 cache.

        Args:
            tokenizer: Accepted for interface compatibility; not used here.
            file_path: Key of the array inside the HDF5 file. The original
                author's note says the ``.../test_temp`` key holds a dev set
                (30% of a test set) — not verifiable from this notebook.
            block_size: Accepted for interface compatibility; not used here.
            cached_features_file: Path of the HDF5 file to read. Defaults to
                the location that was previously hard-coded.

        Raises:
            OSError: If the HDF5 file cannot be opened.
            KeyError: If ``file_path`` is not a key in the HDF5 file.
        """
        # Format the message properly; passing a tuple to print() used to emit
        # "('Loading features from cached file %s', '<path>')".
        print(f"Loading features from cached file {cached_features_file}")
        with h5py.File(cached_features_file, 'r') as f:
            # [:] copies the data into memory so it stays usable after the file is closed.
            self.samples = f[file_path][:]

    def __len__(self):
        """Return the number of samples (length of the first axis of the loaded array)."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return the sample at index ``item`` as a ``torch`` tensor."""
        return torch.tensor(self.samples[item])

In [8]:
def get_dataset( tokenizer, evaluate=False, local_rank=-1):
  """Build the H5Dataset for either the evaluation or the training split.

  ``evaluate`` selects the ``test_temp`` key when truthy and the ``train_temp``
  key otherwise. ``local_rank`` is accepted but not used.
  """
  if evaluate:
    split_key = "/content/drive/MyDrive/BTP_Dev/Dataset/test_temp"
  else:
    split_key = "/content/drive/MyDrive/BTP_Dev/Dataset/train_temp"
  return H5Dataset(tokenizer=tokenizer, file_path=split_key)

**Performing Transformer Configuration**

In [9]:
import os
# Switch CUDA kernel launches to blocking mode ("1"); this overrides the "0" set in the GPU-selection cell.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")


In [10]:
# Runtime sanity check, one value per line:
#   1st line: whether PyTorch can use CUDA (should be True on a GPU runtime)
#   2nd line: the CUDA version reported by torch.version.cuda
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")


True
11.8


In [11]:
# Fetch the stock GPT-2 configuration (downloads are cached under ./cache).
config = AutoConfig.from_pretrained(
    'gpt2',
    cache_dir='cache',
)
# Fix the random seed so runs are repeatable.
set_seed(seed=20)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

**Defining the Tokenizer for the Model Training**

In [12]:
# Load the pretrained GPT-2 tokenizer, sharing the same local download cache as the config.
tokenizer = AutoTokenizer.from_pretrained(
    'gpt2',
    cache_dir='cache',
)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**Initialising the GPT2 Model**

In [13]:
# GPT-2 is a causal language model, so load it through AutoModelForCausalLM.
# AutoModelWithLMHead is deprecated in transformers in favour of the task-specific auto classes.
model = AutoModelForCausalLM.from_pretrained('gpt2', config=config, cache_dir='cache')



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**Adding the Special Recipe Tokens to the Tokenizer**

In [14]:
# Cuisine tags that are registered as special tokens alongside the structural markers.
top_cuisine_tokens = [
    '<CUISINE_ITALIAN>',
    '<CUISINE_MEXICAN>',
    '<CUISINE_SOUTH AMERICAN>',
    '<CUISINE_CANADIAN>',
    '<CUISINE_INDIAN SUBCONTINENT>',
]

# Special-token mapping handed to the tokenizer: the recipe structure markers
# first, followed by the cuisine tags defined above.
special_tokens = dict(
    additional_special_tokens=[
        '<RECIPE_START>',
        '<INPUT_START>', '<NEXT_INPUT>', '<INPUT_END>',
        '<INGR_START>', '<NEXT_INGR>', '<INGR_END>',
        '<INSTR_START>', '<NEXT_INSTR>', '<INSTR_END>',
        '<TITLE_START>', '<TITLE_END>',
        '<RECIPE_END>',
        *top_cuisine_tokens,
    ]
)

**Resizing the Model's Token Embeddings to Match the Tokenizer with the Special Tokens**

In [15]:
# Register the special tokens with the tokenizer first ...
tokenizer.add_special_tokens(special_tokens_dict=special_tokens)
# ... then resize the model's token embeddings to the new vocabulary size.
# Kept as the last expression so the resulting embedding module is displayed.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding(50275, 768)

**Converting the Train and Validation Datasets to PyTorch Datasets so that they can be given to the Model as Input for Training**

In [16]:
# Training split (default key) and evaluation split, both loaded from the HDF5 cache.
train_dataset = get_dataset(tokenizer=tokenizer, evaluate=False)
eval_dataset = get_dataset(tokenizer=tokenizer, evaluate=True)

('Loading features from cached file %s', '/content/drive/MyDrive/BTP_Dev/Dataset/data_temp.h5')
('Loading features from cached file %s', '/content/drive/MyDrive/BTP_Dev/Dataset/data_temp.h5')


**To build batches, data collators may apply some processing (such as padding). Some of them (like DataCollatorForLanguageModeling) also apply random data augmentation (such as random masking) to the formed batch.
Data collators are objects that form a batch from a list of dataset elements. These elements are of the same type as the elements of train_dataset or eval_dataset. Here we create the collator that forms the batches for training.
Source: huggingface.co**

In [17]:
# Collator for language modelling with masked-LM disabled (mlm=False);
# mlm_probability is still passed through unchanged.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15,
)

In [18]:
training_args = TrainingArguments(
    # Directory that receives checkpoints, logs and the final model.
    output_dir="/content/model_output",

    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 samples per step x 8 accumulated steps = 32 samples per optimizer update (per device).
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    fp16=True,
    fp16_opt_level='O1',
    # warmup_steps is a number of steps: pass the integer 100 instead of the float literal 1e2.
    warmup_steps=100,
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    # Keep the number of checkpoints on disk bounded.
    save_total_limit=1,
    load_best_model_at_end=True,
)

**Initializing PyTorch Trainer**

In [19]:

# Wire the model, its training configuration, both dataset splits and the
# batch collator into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)


**Saving the Tokenizer Object & Starting Training and Saving the model after Finishing the training**

In [20]:
# Request blocking CUDA kernel launches for the training run.
# NOTE(review): the model and Trainer were already created above, so CUDA may already be
# initialised by now — confirm this assignment still has an effect at this point.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

In [21]:
# Persist the tokenizer (including the added special tokens) before training starts.
tokenizer.save_pretrained('/content/model_output')
# Run the fine-tuning loop.
trainer.train()
# Save the trained model; no directory is passed, so the Trainer's configured output location is used.
trainer.save_model()

Step,Training Loss,Validation Loss
500,2.6933,1.202413
1000,1.182,1.13658


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


**Saving the Tokenizer**

In [22]:
# Saves the tokenizer to the same directory as the call made before training;
# as the last expression of the cell, the tuple of written file paths is displayed.
# NOTE(review): this repeats the earlier save — confirm whether both are needed.
tokenizer.save_pretrained('/content/model_output')

('/content/model_output/tokenizer_config.json',
 '/content/model_output/special_tokens_map.json',
 '/content/model_output/vocab.json',
 '/content/model_output/merges.txt',
 '/content/model_output/added_tokens.json',
 '/content/model_output/tokenizer.json')

In [23]:
from google.colab import files
import os

model_directory = '/content/model_output'
# os.listdir also returns sub-directories (the zip listing for this folder shows
# checkpoint and runs folders), and a directory cannot be sent as one browser
# download. Only regular files are downloaded here, in a stable sorted order.
for filename in sorted(os.listdir(model_directory)):
    candidate_path = os.path.join(model_directory, filename)
    if os.path.isfile(candidate_path):
        files.download(candidate_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
# Recursively archive the whole output directory (sub-folders included) into model_output.zip ...
!zip -r model_output.zip /content/model_output
# ... and send the archive to the browser as a single download.
files.download('model_output.zip')


  adding: content/model_output/ (stored 0%)
  adding: content/model_output/config.json (deflated 51%)
  adding: content/model_output/checkpoint-1000/ (stored 0%)
  adding: content/model_output/checkpoint-1000/config.json (deflated 51%)
  adding: content/model_output/checkpoint-1000/optimizer.pt (deflated 7%)
  adding: content/model_output/checkpoint-1000/rng_state.pth (deflated 25%)
  adding: content/model_output/checkpoint-1000/scheduler.pt (deflated 55%)
  adding: content/model_output/checkpoint-1000/generation_config.json (deflated 24%)
  adding: content/model_output/checkpoint-1000/training_args.bin (deflated 51%)
  adding: content/model_output/checkpoint-1000/model.safetensors (deflated 7%)
  adding: content/model_output/checkpoint-1000/trainer_state.json (deflated 62%)
  adding: content/model_output/runs/ (stored 0%)
  adding: content/model_output/runs/Nov27_09-56-30_457b6e264349/ (stored 0%)
  adding: content/model_output/runs/Nov27_09-56-30_457b6e264349/events.out.tfevents.1701

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>