In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("Indian Recipe Dataset.csv")

In [None]:
df

In [None]:
def parse_text(row):
    text = row
    sections = [part.strip() for part in text.split("###") if part.strip()]
    parsed_data = {}
    for section in sections:
        if ": " in section:
            key, value = section.split(": ", 1)
            parsed_data[key] = value
    return parsed_data

# Apply the function to each row
parsed_df = df["text"].apply(parse_text)

# Convert parsed data into a new DataFrame
result_df = pd.DataFrame(parsed_df.tolist())


df = result_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [None]:
df

In [6]:


# Create empty lists to store the input-output pairs
input_texts = []
output_texts = []

# Iterate through the rows to generate input-output pairs
for _, row in df.iterrows():
    # Create the input text
    input_text = f"Ingredients: {row['TranslatedIngredients']}; Time: {row['PrepTimeInMins'] + row['CookTimeInMins']} mins; Cuisine: {row['Cuisine']}; Diet: {row['Diet']}"
    
    # Create the output text
    output_text = f"Recipe Name: {row['Course']} Recipe; Instructions: {row['TranslatedInstructions']}; Servings: {row['Servings']}"
    
    # Append to the lists
    input_texts.append(input_text)
    output_texts.append(output_text)

# Add the input-output pairs to the DataFrame (optional)
df['Input'] = input_texts
df['Output'] = output_texts

In [7]:
df.to_csv("recipe.csv")

In [None]:
df

In [9]:
recipe_df=df[['Input','Output']]

In [None]:
recipe_df

# DATA PREPROCESSING

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from torch.optim import AdamW

In [12]:
# Load tokenizer and model from the checkpoint (not from "t5-small")
checkpoint_path = './results/checkpoint-1374'

In [13]:
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)


In [14]:
# Create Dataset for Hugging Face
data = {'Input': input_texts, 'Output': output_texts}
dataset = Dataset.from_dict(data)

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    input_encodings = tokenizer(examples['Input'], padding='max_length', truncation=True, max_length=512)
    output_encodings = tokenizer(examples['Output'], padding='max_length', truncation=True, max_length=512)
    
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': output_encodings['input_ids'],
    }

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [16]:
# Split the dataset into train and eval datasets (80% train, 20% eval)
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


In [17]:
# Subclass Trainer to customize the optimizer
class CustomTrainer(Trainer):
    def create_optimizer(self):
        """
        Override the default optimizer creation method to use AdamW.
        """
        optimizer = AdamW(
            self.model.parameters(),
            lr=self.args.learning_rate,
            weight_decay=self.args.weight_decay,
        )
        self.optimizer = optimizer
        return self.optimizer


In [None]:
model = model.to('cuda')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
)

In [None]:
# Use the custom trainer with AdamW
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Resume training from checkpoint-687
trainer.train(resume_from_checkpoint='./results/checkpoint-1374')

**Adding output as markdown because of large output**
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
c:\Users\abhin\Desktop\AI 30 DAYS\Recipe Generator\recipeenv\lib\site-packages\transformers\trainer.py:3420: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  0%|          | 0/2061 [00:00<?, ?it/s]c:\Users\abhin\Desktop\AI 30 DAYS\Recipe Generator\recipeenv\lib\site-packages\transformers\trainer.py:3083: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  checkpoint_rng_state = torch.load(rng_file)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.

 68%|██████▊   | 1400/2061 [01:35<05:11,  2.12it/s] 
{'loss': 1.6009, 'grad_norm': 0.6635178923606873, 'learning_rate': 1.603590490053372e-05, 'epoch': 2.04}

 73%|███████▎  | 1500/2061 [07:40<33:57,  3.63s/it]
{'loss': 1.5683, 'grad_norm': 0.5967133045196533, 'learning_rate': 1.3609898107714703e-05, 'epoch': 2.18}

 78%|███████▊  | 1600/2061 [13:47<27:46,  3.61s/it]
{'loss': 1.5441, 'grad_norm': 1.2870875597000122, 'learning_rate': 1.1183891314895683e-05, 'epoch': 2.33}

 82%|████████▏ | 1700/2061 [19:54<21:47,  3.62s/it]
{'loss': 1.5891, 'grad_norm': 0.5602926015853882, 'learning_rate': 8.757884522076662e-06, 'epoch': 2.47}

 87%|████████▋ | 1800/2061 [25:55<15:47,  3.63s/it]
{'loss': 1.5901, 'grad_norm': 0.5525727272033691, 'learning_rate': 6.3318777292576415e-06, 'epoch': 2.62}

 92%|█████████▏| 1900/2061 [31:57<09:40,  3.61s/it]
{'loss': 1.6074, 'grad_norm': 0.43832340836524963, 'learning_rate': 3.905870936438622e-06, 'epoch': 2.77}

 97%|█████████▋| 2000/2061 [37:57<04:02,  3.98s/it]
{'loss': 1.611, 'grad_norm': 0.4432830214500427, 'learning_rate': 1.4798641436196021e-06, 'epoch': 2.91}

                                                   
100%|██████████| 2061/2061 [43:33<00:00,  3.65s/it]
{'eval_loss': 1.4431242942810059, 'eval_runtime': 109.8543, 'eval_samples_per_second': 12.517, 'eval_steps_per_second': 1.566, 'epoch': 3.0}

100%|██████████| 2061/2061 [43:35<00:00,  1.27s/it]
{'train_runtime': 2615.314, 'train_samples_per_second': 6.304, 'train_steps_per_second': 0.788, 'train_loss': 0.5302758270071178, 'epoch': 3.0}