In [2]:
import torch
import torch.nn as nn
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

from transformers import Trainer, TrainingArguments

from tqdm.auto import tqdm

import pandas as pd
import numpy as np


In [3]:
model_name = 'gpt2'

In [4]:
model_save_path = '../model'

In [5]:
tokenizer = GPT2TokenizerFast.from_pretrained(model_name,
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              unk_token='<|unknown|>',
                                              pad_token='<|pad|>'
                                             )
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 6.16MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.52MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 13.1MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 1.71MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading model.safetensors: 100%|██████████| 548M/548M [00:05<00:00, 99.7MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 311kB/s]


Embedding(50260, 768)

In [6]:
tokenizer.save_pretrained(model_save_path)

('../model/tokenizer_config.json',
 '../model/special_tokens_map.json',
 '../model/vocab.json',
 '../model/merges.txt',
 '../model/added_tokens.json',
 '../model/tokenizer.json')

In [7]:
tokenizer.convert_tokens_to_ids(['<|pad|>'])

[50259]

In [8]:
def generate(prompt):
    inputs = tokenizer.encode_plus(prompt, return_tensors='pt')
    output = model.generate(**inputs,max_length=256,do_sample=True,pad_token_id=50259)
    print(tokenizer.decode(output[0]))

In [9]:
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|unknown|>',
 'pad_token': '<|pad|>'}

In [10]:
tokenizer.convert_tokens_to_ids(['<|startoftext|>'],)

[50257]

In [11]:
clean = pd.read_csv('../data/.IndianFoodDatasetCSV.csv.icloudIndianFoodDatasetCSV.csv')
clean = clean.sample(frac=1)
clean.reset_index(drop=True,inplace=True)

FileNotFoundError: [Errno 2] No such file or directory: '../data/IndianFoodDatasetCSV.csv'

In [16]:
def print_recipe(idx):
    print(f"{clean['ingredients'][idx]}\n\n{clean['instructions'][idx]}")

In [17]:
def form_string(ingredient,instruction):
    s = f"<|startoftext|>Ingredients:\n{ingredient.strip()}\n\nInstructions:\n{instruction.strip()}<|endoftext|>"
    return s

In [18]:
data = clean.apply(lambda x:form_string(x['ingredients'],x['instructions']),axis=1).to_list()

KeyError: 'ingredients'

https://towardsdatascience.com/guide-to-fine-tuning-text-generation-models-gpt-2-gpt-neo-and-t5-dc5de6b3bc5e

In [14]:
train_size = 0.85
train_len = int(train_size * len(data))
train_data = data[:train_len]
val_data = data[train_len:]

In [15]:
class RecipeDataset:
    def __init__(self,data):
        self.data = data
        self.input_ids = []
        self.attn_masks = []
        
        for item in tqdm(data):
            encodings = tokenizer.encode_plus(item,
                                              truncation=True,
                                              padding='max_length',
                                              max_length=1024,
                                              return_tensors='pt'
                                             )
            self.input_ids.append(torch.squeeze(encodings['input_ids'],0))
            self.attn_masks.append(torch.squeeze(encodings['attention_mask'],0))
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [16]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([item[0] for item in batch]),
        'attention_mask': torch.stack([item[1] for item in batch]),
        'labels': torch.stack([item[0] for item in batch])
    }

In [17]:
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

  0%|          | 0/5047 [00:00<?, ?it/s]

  0%|          | 0/891 [00:00<?, ?it/s]

In [18]:
args = TrainingArguments(output_dir=model_save_path,
                         per_device_train_batch_size=2,
                         per_device_eval_batch_size=2,
                         gradient_accumulation_steps=2,
                         report_to='none',
                         num_train_epochs=3,
                         save_strategy='no'
                        )

In [19]:
optim = torch.optim.AdamW(model.parameters(),lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optim,20,eta_min=1e-7)

In [20]:
trainer = Trainer(model,
                  args,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  data_collator=collate_fn,
                  optimizers=(optim,scheduler)
                 )

In [21]:
trainer.train()

***** Running training *****
  Num examples = 5047
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 1893


Step,Training Loss
500,0.9125
1000,0.6522
1500,0.6175




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1893, training_loss=0.7011296936776735, metrics={'train_runtime': 3148.955, 'train_samples_per_second': 4.808, 'train_steps_per_second': 0.601, 'total_flos': 7912445313024000.0, 'train_loss': 0.7011296936776735, 'epoch': 3.0})

In [22]:
trainer.save_model()

Saving model checkpoint to ./khaanaGPT
Configuration saved in ./khaanaGPT/config.json
Model weights saved in ./khaanaGPT/pytorch_model.bin


In [23]:
from transformers import pipeline

In [24]:
pl = pipeline(task='text-generation',model='/kaggle/working/khaanaGPT')

loading configuration file /kaggle/working/khaanaGPT/config.json
Model config GPT2Config {
  "_name_or_path": "/kaggle/working/khaanaGPT",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "voca

In [25]:
def create_prompt(ingredients):
    ingredients = ','.join([x.strip().lower() for x in ingredients.split(',')])
    ingredients = ingredients.strip().replace(',','\n')
    s = f"<|startoftext|>Ingredients:\n{ingredients}\n"
    return s

In [26]:
ingredients = ['Rice,Potatoes,Tomatoes,Spinach,red bell peppers','chicken,tomatoes,aloo,jeera,curry powder']

In [27]:

for ing in ingredients:
    prompt = create_prompt(ing)
    print(pl(prompt,
         max_new_tokens=512,
         penalty_alpha=0.6,
         top_k=4,
         pad_token_id=50259
        )[0]['generated_text'])

<|startoftext|>Ingredients:
rice
potatoes
tomatoes
spinach
red bell peppers
black pepper powder
coriander (dhania) leaves
onion
cumin seeds (jeera)
potato (aloo)
turmeric powder
green chillies
green peas (matar)
garam masala powder
sunflower oil

Instructions:
first wash and chop potatoes.
keep them aside.heat oil in a kadai.
add cumin seeds, black pepper powder, turmeric, garam masala powder, red chilli powder and cook until the spices have softened and the potatoes are soft.once the spices have softened, add chopped onions and cook until they turn translucent and brown in colour.
once done add in chopped pota green chillies, coriander seeds and mix well to combine.
add in cooked rice, red bell peppers and cook until it becomes a little thick and cooked.
once done, add in cooked pota spinach,  red chilli powder and garam masala and stir well to combine.
check the seasoning and adjust according to your taste.serve the spinach and red chili curry recipe as a side dish along with a hot c