In [2]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data preprocessing
df = pd.read_csv('Food_Recipe.csv')
df = df.dropna(subset=['instructions', 'prep_time (in mins)', 'cook_time (in mins)'])
df['total_time'] = df['prep_time (in mins)'] + df['cook_time (in mins)']
df['difficulty'] = pd.cut(df['total_time'],
                         bins=[0, 30, 60, float('inf')],
                         labels=['easy', 'medium', 'hard'])
df = df.dropna(subset=['difficulty'])

# Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['difficulty'])

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['instructions'], df['label'],
    test_size=0.2, random_state=42,
    stratify=df['label']
)

# BERT initialization
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Dataset class
class RecipeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create dataloaders
def create_data_loader(texts, labels, tokenizer, batch_size=16):
    dataset = RecipeDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer
    )
    return DataLoader(dataset, batch_size=batch_size)

BATCH_SIZE = 16
train_loader = create_data_loader(train_texts, train_labels, tokenizer, BATCH_SIZE)
test_loader = create_data_loader(test_texts, test_labels, tokenizer, BATCH_SIZE)

# Model definition
class DifficultyClassifier(torch.nn.Module):
    def __init__(self, n_classes=3):
        super(DifficultyClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = torch.nn.Dropout(p=0.3)
        self.out = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.last_hidden_state[:, 0, :]
        output = self.drop(pooled_output)
        return self.out(output)

model = DifficultyClassifier(len(le.classes_)).to(device)

# Training setup
EPOCHS = 15
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

# Training loop
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    losses = []

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    return np.mean(losses)

# Evaluation function
def eval_model(model, data_loader, device):
    model.eval()
    correct_predictions = 0
    losses = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return np.mean(losses), accuracy

# Training execution
for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    val_loss, val_acc = eval_model(model, test_loader, device)

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss: {train_loss:.4f}')
    print(f'Val loss: {val_loss:.4f}')
    print(f'Val accuracy: {val_acc:.4f}\n')

# Generate predictions
def get_predictions(model, data_loader):
    model.eval()
    predictions = []
    real_values = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            real_values.extend(labels.cpu().numpy())

    return predictions, real_values

y_pred, y_test = get_predictions(model, test_loader)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Prediction function
def predict_difficulty(text):
    encoded_text = tokenizer.encode_plus(
        text,
        max_length=128,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output, dim=1)

    return le.inverse_transform(prediction.cpu().numpy())[0]

# Example usage
sample_recipe = "Chop vegetables and sauté in pan. Add spices and cook for 10 minutes. Mix with cooked rice and serve hot."
print(f"\nPredicted difficulty: {predict_difficulty(sample_recipe)}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/15
Train loss: 1.0311
Val loss: 0.9961
Val accuracy: 0.5486

Epoch 2/15
Train loss: 0.9429
Val loss: 0.9729
Val accuracy: 0.5833

Epoch 3/15
Train loss: 0.8367
Val loss: 0.9509
Val accuracy: 0.5833

Epoch 4/15
Train loss: 0.6580
Val loss: 1.0737
Val accuracy: 0.5903

Epoch 5/15
Train loss: 0.4070
Val loss: 1.3236
Val accuracy: 0.5347

Epoch 6/15
Train loss: 0.2250
Val loss: 1.3908
Val accuracy: 0.5486

Epoch 7/15
Train loss: 0.1454
Val loss: 1.6727
Val accuracy: 0.5278

Epoch 8/15
Train loss: 0.2302
Val loss: 1.6797
Val accuracy: 0.5556

Epoch 9/15
Train loss: 0.1393
Val loss: 1.6097
Val accuracy: 0.5625

Epoch 10/15
Train loss: 0.1408
Val loss: 1.6372
Val accuracy: 0.5278

Epoch 11/15
Train loss: 0.0664
Val loss: 1.7369
Val accuracy: 0.5417

Epoch 12/15
Train loss: 0.0157
Val loss: 1.8817
Val accuracy: 0.5694

Epoch 13/15
Train loss: 0.0088
Val loss: 1.9827
Val accuracy: 0.5764

Epoch 14/15
Train loss: 0.0065
Val loss: 2.0736
Val accuracy: 0.5903

Epoch 15/15
Train loss: 0.005

In [3]:
import torch
from pathlib import Path
import joblib

# Create save directory
save_dir = Path("recipe_classifier")
save_dir.mkdir(exist_ok=True)

# Save model weights
torch.save(model.state_dict(), save_dir / "bert_classifier.pt")

# Save tokenizer
tokenizer.save_pretrained(save_dir)

# Save label encoder
joblib.dump(le, save_dir / "label_encoder.joblib")

# Save config (MAX_LEN and class names)
config = {
    "max_len": 128,
    "classes": le.classes_.tolist()
}
joblib.dump(config, save_dir / "config.joblib")

print(f"Saved model and assets to {save_dir}")

Saved model and assets to recipe_classifier


In [4]:
import shutil
from google.colab import files
# Path to your folder
folder_path = 'recipe_classifier'

# Output zip file
output_path = 'recipe_classifier.zip'

# Compress the folder
shutil.make_archive(output_path.replace('.zip', ''), 'zip', folder_path)

files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import joblib
from pathlib import Path

class RecipeClassifier(torch.nn.Module):
    def __init__(self, n_classes=3):
        super(RecipeClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = torch.nn.Dropout(p=0.3)
        self.out = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        output = self.drop(pooled_output)
        return self.out(output)

class PredictionPipeline:
    def __init__(self, model_path="recipe_classifier"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_path = Path(model_path)

        # Load assets
        self.tokenizer = BertTokenizer.from_pretrained(self.model_path)
        self.label_encoder = joblib.load(self.model_path / "label_encoder.joblib")
        self.config = joblib.load(self.model_path / "config.joblib")

        # Initialize model
        self.model = RecipeClassifier(len(self.config["classes"]))
        self.model.load_state_dict(torch.load(self.model_path / "bert_classifier.pt",
                                            map_location=self.device))
        self.model.to(self.device)
        self.model.eval()

    def preprocess(self, text):
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.config["max_len"],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        return encoding["input_ids"], encoding["attention_mask"]

    def predict(self, text):
        input_ids, attention_mask = self.preprocess(text)

        with torch.no_grad():
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            outputs = self.model(input_ids, attention_mask)
            _, prediction = torch.max(outputs, dim=1)

        return self.label_encoder.inverse_transform(prediction.cpu().numpy())[0]

# Example usage
if __name__ == "__main__":
    pipeline = PredictionPipeline()

    sample_text = "Chop vegetables and sauté in pan. Add spices and cook for 10 minutes."
    prediction = pipeline.predict(sample_text)
    print(f"Predicted difficulty: {prediction}")

Predicted difficulty: medium
