In [8]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image 
import os
from mlx_lm import load, generate
import cv2

In [9]:
# Location of the trained classifier weights and of the dataset directory.
model_path = "Ingredient-classifier/model/ingredient_classifier2.pth"
data_dir = "dataset"

# Pick the best available backend: Apple MPS first, then CUDA, then CPU.
# PyTorch's device type for NVIDIA GPUs is "cuda"; "gpu" is not a valid device
# string and makes torch.device() raise a RuntimeError.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [10]:
# Ingredient labels. predict_image indexes this list with the model's predicted
# class index, so the order matters.
# NOTE(review): presumably this matches the label order used in training - confirm.
class_names = [
    "carrot",
    "chicken",
    "corn",
    "egg",
    "garlic",
    "lettuce",
    "mushroom",
    "onion",
    "potato",
    "salmon",
    "shrimp",
    "tomato",
]

# Inference-time preprocessing applied to each frame before it reaches the model:
# convert the input to a PIL image, resize to 224x224, convert to a tensor and
# normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics -
# confirm they match what was used in training.
test_transforms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [11]:
# Build a ResNet-18 without downloading pretrained weights; the trained
# parameters come from the checkpoint loaded below. `weights=None` replaces the
# deprecated `pretrained=False` argument.
model = models.resnet18(weights=None)

# Replace the final fully connected layer so it has one output per ingredient class.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(class_names))

# map_location remaps the stored tensors onto the selected device.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint cannot execute arbitrary code while being loaded.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model = model.to(device)

# Switch to inference mode (affects layers such as BatchNorm).
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [12]:
def predict_image(frame):
    """Classify one camera frame and return the predicted ingredient name.

    Args:
        frame: Image in BGR channel order, as accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2RGB`` (the capture loop passes frames read from
            ``cv2.VideoCapture``).

    Returns:
        The entry of ``class_names`` at the index of the largest model output.
    """
    # The frame is converted from BGR to RGB before the preprocessing pipeline.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Preprocess, add a leading batch dimension, and move to the model's device.
    batch = test_transforms(rgb_frame)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        scores = model(batch)
        best_index = torch.max(scores, 1).indices

    label = class_names[best_index.item()]
    return label

In [13]:
# Location of the language model handed to mlx_lm's load().
# NOTE(review): the name suggests a fine-tuned, fused Phi-3 checkpoint - confirm.
lm_path = "output/phi3_finetuned_fused"
# load() returns two values, unpacked here as the model and its tokenizer;
# both are passed to generate() inside generate_recipe.
lm, tokenizer = load(lm_path)

def generate_recipe(ingredient_list, requirement, max_tokens=5000):
    """Ask the language model for two dish suggestions based on the ingredients.

    Args:
        ingredient_list: Ingredient names, either an iterable of names or a
            single pre-formatted string. ``None`` is treated as empty.
        requirement: Extra free-text instruction appended to the prompt; may be
            empty or ``None``.
        max_tokens: Generation limit forwarded to ``generate`` (default 5000,
            the value previously hard-coded).

    Returns:
        The text returned by ``generate``. If no ingredients were supplied, a
        short notice string is returned instead and the model is not called.
    """
    # Render the ingredients as "carrot, egg" rather than the Python list repr
    # "['carrot', 'egg']", which would put brackets and quotes into the prompt.
    if ingredient_list is None:
        ingredients = ""
    elif isinstance(ingredient_list, str):
        ingredients = ingredient_list.strip()
    else:
        ingredients = ", ".join(str(item) for item in ingredient_list)

    # Nothing to cook with: skip the (potentially long) generation entirely.
    if not ingredients:
        return "No ingredients provided. Capture at least one ingredient first."

    prompt = (
        f"I have {ingredients}. Suggest 2 dishes, with ingredient needed, "
        "instructions and their estimated time. keep it easy to read."
    )

    # Separate the optional requirement from the previous sentence with a space
    # so the two do not run together.
    extra = (requirement or "").strip()
    if extra:
        prompt = f"{prompt} {extra}"

    recipe = generate(
        model=lm,
        tokenizer=tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
    )

    return recipe

In [14]:
# Interactive capture loop: shows the live camera feed and reacts to key presses.
#   p - classify the current frame and add the result to `stored`
#   r - clear the stored ingredients
#   g - ask for an optional requirement and generate recipes from `stored`
#   q - quit
cap = cv2.VideoCapture(0)
stored = []

if not cap.isOpened():
    print("Error: Could not open camera.")
else:
    print("Camera opened successfully.")

    # try/finally makes sure the camera and the preview window are released
    # even if the cell is interrupted (e.g. KeyboardInterrupt during input()).
    try:
        while True:

            ret, frame = cap.read()

            if not ret:
                print("Error: Could not read frame.")
                break

            cv2.imshow('Ingredient Scanner', frame)

            # Poll the keyboard exactly once per frame. A second waitKey() call
            # for the quit key would only see presses that land inside its own
            # short wait window, so 'q' was usually swallowed by this call.
            key = cv2.waitKey(10) & 0xFF

            if key == ord('q') or key == ord('Q'):
                break

            elif key == ord('p') or key == ord('P'):
                prediction = predict_image(frame)
                stored.append(prediction)
                print("Captured")
                print(f"Stored Ingredient: {stored}")

            elif key == ord('r') or key == ord('R'):
                stored = []
                print("Reset!")

            elif key == ord('g') or key == ord('G'):
                # Do not run a long generation when nothing has been captured.
                if not stored:
                    print("No ingredients captured yet. Press 'p' to capture one first.")
                    continue

                # input() blocks the loop, so the preview is frozen while typing.
                requirement = input("Please enter any special requirement:")
                print("Generating...")
                print(generate_recipe(stored, requirement))
                stored = []
    finally:
        cap.release()
        cv2.destroyAllWindows()
        # Extra waitKey after destroying the windows, kept from the original;
        # presumably lets the GUI process the close event - confirm.
        cv2.waitKey(1)


Camera opened successfully.


KeyboardInterrupt: 