In [None]:
from copy import deepcopy
from typing import Optional, Dict, List, Tuple

from torch import nn


In [None]:
from transformers import AutoModel, AutoTokenizer
# Single source of truth for the checkpoint used by both the model and the tokenizer.
MODEL_NAME = "google/mobilebert-uncased"

# Load the full model only to take an independent copy of its embedding layer.
model = AutoModel.from_pretrained(MODEL_NAME)
embedding = deepcopy(model.embeddings)
# https://huggingface.co/docs/transformers/main_classes/tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
VOCAB_SIZE = tokenizer.vocab_size
print(f'Using tokenizer of {MODEL_NAME}')
print(f'Vocabulary size: {VOCAB_SIZE}')
# The rest of the model is not needed once the embedding has been copied.
# (The original `del models` raised NameError: the variable is `model`.)
del model

In [None]:
import re
import tqdm

from torch.utils.data import Dataset, Subset, DataLoader
import pandas as pd
from typing import List, Dict, Tuple

class RecipeDataset(Dataset):
    """Dataset of (source text, target text) pairs built from a recipe CSV file.

    Every CSV row is rendered twice with ``process_row``: once over
    ``source_columns`` and once over ``target_columns``. Items are returned as
    raw strings; tokenization happens per batch in ``collate_fn``.
    """

    def __init__(
            self, path: str,
            source_columns: List[str],
            target_columns: List[str],
            # String annotation: the class can be defined without the
            # transformers import being in scope at definition time.
            tokenizer: "AutoTokenizer",
            nrows: Optional[int] = None,
            device: str = 'cpu',
            max_len: int = 256,
            padding_type: str = "max_length",
            ):
        """Read the CSV and pre-render every row into source/target strings.

        Args:
            path: CSV file passed to ``pd.read_csv``.
            source_columns: Columns rendered into the source text.
            target_columns: Columns rendered into the target text.
            tokenizer: Tokenizer called in ``collate_fn``; its ``pad_token_id``
                is stored as ``PAD_idx``.
            nrows: Forwarded to ``pd.read_csv`` to limit the rows read.
            device: Stored on the instance; not used by the methods of this class.
            max_len: ``max_length`` passed to the tokenizer.
            padding_type: ``padding`` strategy passed to the tokenizer.
        """
        columns = source_columns + target_columns
        self.source_columns = source_columns
        self.target_columns = target_columns

        data = pd.read_csv(path, usecols=columns, nrows=nrows)

        self.src_ = []
        self.tgt_ = []
        self.tokenizer = tokenizer
        self.device = device
        self.PAD_idx = tokenizer.pad_token_id
        self.max_len = max_len
        self.padding_type = padding_type

        for i in tqdm.trange(len(data)):
            row = data.iloc[i]
            self.src_.append(self.process_row(row, source_columns))
            self.tgt_.append(self.process_row(row, target_columns))

        self.size = len(self.src_)

    def process_row(self, row: pd.Series, columns: List[str]) -> str:
        """Render the non-null ``columns`` of ``row`` as "name: value" lines.

        Underscores in column names are replaced by spaces, columns whose value
        is NA are skipped, and the lines are joined with a newline. Returns an
        empty string when every column is NA.
        """
        entry_parts = []
        for col in columns:
            if pd.notna(row[col]):
                content = str(row[col])
                if content.startswith('[') and content.endswith(']'):
                    # NOTE(review): this pattern removes double quotes,
                    # backslashes and dollar signs, but not the surrounding
                    # brackets the guard above checks for. It looks garbled --
                    # confirm the intended character class before relying on it.
                    content = re.sub(r'["\\$$\\\\$$]', '', content)
                entry_parts.append(f'{col.replace("_", " ")}: {content}')
        return '\n'.join(entry_parts)

    def __len__(self) -> int:
        """Number of (source, target) pairs."""
        return self.size

    def __getitem__(self, idx) -> Tuple[str, str]:
        """Return the raw (source, target) strings at ``idx``."""
        return self.src_[idx], self.tgt_[idx]

    def single(self, idx):
        """Return the collated (tokenized) batch containing only item ``idx``."""
        return self.collate_fn([(self.src_[idx], self.tgt_[idx])])

    def collate_fn(self, batch: List[Tuple[str, str]]) -> Dict[str, List]:
        """Tokenize a list of (source, target) string pairs into one batch.

        Sources and targets are tokenized separately with truncation to
        ``self.max_len`` and ``self.padding_type`` padding. The tokenizer is
        asked for PyTorch tensors (``return_tensors="pt"``), so the dict values
        are whatever the tokenizer returns for that setting.

        Returns:
            Dict with keys ``input_ids``, ``attention_mask``, ``input_lengths``
            (from the sources) and ``labels``, ``labels_lengths`` (from the
            targets).
        """
        sources = [pair[0] for pair in batch]
        targets = [pair[1] for pair in batch]

        tokenizer_kwargs = dict(
            max_length=self.max_len, padding=self.padding_type,
            truncation=True, return_tensors="pt", return_length=True,
        )
        source_enc = self.tokenizer(sources, **tokenizer_kwargs)
        target_enc = self.tokenizer(targets, **tokenizer_kwargs)

        return {
            'input_ids': source_enc['input_ids'],
            'attention_mask': source_enc['attention_mask'],
            'input_lengths': source_enc['length'],
            'labels': target_enc['input_ids'],
            'labels_lengths': target_enc['length'],
        }

    def get_dataloaders(
            self,
            names: Optional[List[str]] = None,
            ratios: Optional[List[float]] = None,
            shuffle: Optional[List[bool]] = None,
            batch_size: int = 8,
            load_ratio: float = 1.0,
            **kwargs,
        ) -> Dict[str, DataLoader]:
        """Split the dataset into consecutive slices and wrap each in a DataLoader.

        Args:
            names: Key of each loader in the result; defaults to ``['train', 'val']``.
            ratios: Fraction of the dataset per split; defaults to ``[0.9, 0.1]``.
            shuffle: Per-split shuffle flag; defaults to ``[True, False]``.
            batch_size: Batch size for every loader.
            load_ratio: Extra factor applied to every split length.
            **kwargs: Forwarded to ``DataLoader``.

        Returns:
            Mapping from split name to its ``DataLoader``. The splits are
            contiguous, non-overlapping index ranges taken in the order given.

        Raises:
            ValueError: If ``names``, ``ratios`` and ``shuffle`` differ in length.
        """
        # Defaults are created per call instead of sharing mutable default lists.
        if names is None:
            names = ['train', 'val']
        if ratios is None:
            ratios = [0.9, 0.1]
        if shuffle is None:
            shuffle = [True, False]
        if not (len(names) == len(ratios) == len(shuffle)):
            # zip() would otherwise silently drop the unmatched splits.
            raise ValueError(
                "names, ratios and shuffle must have the same length, got "
                f"{len(names)}, {len(ratios)} and {len(shuffle)}"
            )

        indices = list(range(len(self)))
        start = 0
        dataloaders: Dict[str, DataLoader] = {}

        for name, part, shuff in zip(names, ratios, shuffle):
            part_len = int(len(indices) * part * load_ratio)
            subset = Subset(self, indices[start: start + part_len])
            # Use this instance's collate function; the original referenced an
            # undefined name `data` here and raised NameError.
            dataloaders[name] = DataLoader(
                subset, batch_size, shuff, collate_fn=self.collate_fn, **kwargs)
            start += part_len

        return dataloaders

# Location of the recipe CSV and the columns used for each side of the pair.
DATA_PATH = '../data/1_Recipe_csv.csv'
INPUT_COLS = ['recipe_title', 'category']
TARGET_COLS = ['description', 'ingredients', 'directions']
NSAMPLES = 1024  # number of CSV rows to read

dataset = RecipeDataset(
    DATA_PATH, INPUT_COLS, TARGET_COLS,
    tokenizer, nrows=NSAMPLES, padding_type="longest",
)

# 80/10/10 split. The loaders are looked up by name instead of unpacking
# `.values()` positionally: the original requested ['train', 'test', 'val'] but
# unpacked into (train, val, test), so the val/test variables held loaders
# registered under the other name. Each variable still gets the same slice of
# the data as before (val: second slice, test: third slice).
loaders = dataset.get_dataloaders(
    ['train', 'val', 'test'], [0.8, 0.1, 0.1], [True, False, False], batch_size=8)
train_loader = loaders['train']
val_loader = loaders['val']
test_loader = loaders['test']

## PyTorch Lightning



## Optuna



## TensorBoard integration with PyTorch

In [None]:
class RecipeTransformer(nn.Module):
    """Placeholder for the recipe transformer model; nothing is implemented yet."""
    # TODO: define the layers in __init__ and the computation in forward().

In [None]:
import lightning as L
import torch
# `tv` is used for both the dataset and the model below, so this import must be
# active (it was commented out, which made the cell fail with NameError).
import torchvision as tv

# Lightning Fabric example: train a ResNet-18 on the CIFAR10 training split.
# NOTE: this cell rebinds the notebook-level names `dataset` and `model`;
# `dataset` no longer refers to the RecipeDataset created earlier once it runs.
dataset = tv.datasets.CIFAR10("data", download=True,
                              train=True,
                              transform=tv.transforms.ToTensor())

fabric = L.Fabric()
fabric.launch()

model = tv.models.resnet18()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
# Let Fabric wrap the model/optimizer and the dataloader.
model, optimizer = fabric.setup(model, optimizer)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
dataloader = fabric.setup_dataloaders(dataloader)

model.train()
num_epochs = 10
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        # Backward pass goes through Fabric rather than loss.backward().
        fabric.backward(loss)
        optimizer.step()
        # .item() yields the Python float; `.data` is a legacy accessor.
        print(loss.item())