#### Set up the environment and various imports

In [2]:
%pip install numpy pandas torch tensorflow_hub
# !pip install  nltk pandas matplotlib==3.8.2 sklearn
# !pip install torchinfo torchvision==0.16.2 torchtext==0.7.0
# !pip install python-dotenv psycopg2-binary wandb pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import numpy as np
import pandas as pd
import torch
from torch import Tensor, nn
import tensorflow_hub as hub
import tensorflow as tf
from typing import List, Tuple, Dict, Any


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-02-01 12:55:19.951222: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-01 12:55:19.951245: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-01 12:55:19.952084: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory 

#### Load and Pre-Process the data

In [4]:
JSON_FILE = "./dataset/full_format_recipes.json"
dataset = pd.read_json(JSON_FILE)
dataset.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0
2,[In a large heavy saucepan cook diced fennel a...,7.0,2004-08-20 04:00:00+00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0
3,[Heat oil in heavy large skillet over medium-h...,,2009-03-27 04:00:00+00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",
4,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,2004-08-20 04:00:00+00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0


In [5]:
## Drop the columns that are not needed
dataset = dataset.drop(columns=['date', 'categories', 'calories', 'protein', 'fat', 'sodium'])
## Resequence the columns
dataset = dataset.reindex(columns=['title', 'desc', 'ingredients', 'directions', 'rating'])
## Rename desc to description
dataset = dataset.rename(columns={'desc': 'description'})
dataset.head()

Unnamed: 0,title,description,ingredients,directions,rating
0,"Lentil, Apple, and Turkey Wrap",,"[4 cups low-sodium vegetable or chicken stock,...","[1. Place the stock, lentils, celery, carrot, ...",2.5
1,Boudin Blanc Terrine with Red Onion Confit,This uses the same ingredients found in boudin...,"[1 1/2 cups whipping cream, 2 medium onions, c...",[Combine first 9 ingredients in heavy medium s...,4.375
2,Potato and Fennel Soup Hodge,,"[1 fennel bulb (sometimes called anise), stalk...",[In a large heavy saucepan cook diced fennel a...,3.75
3,Mahi-Mahi in Tomato Olive Sauce,The Sicilian-style tomato sauce has tons of Me...,"[2 tablespoons extra-virgin olive oil, 1 cup c...",[Heat oil in heavy large skillet over medium-h...,5.0
4,Spinach Noodle Casserole,,"[1 12-ounce package frozen spinach soufflé, th...",[Preheat oven to 350°F. Lightly grease 8x8x2-i...,3.125


In [6]:
## Remove rows with missing values
print("missing directions", dataset['directions'].isnull().sum())
dataset = dataset[dataset['directions'].notna()]
print("missing ingredients", dataset['ingredients'].isnull().sum())
dataset = dataset[dataset['ingredients'].notna()]
print("missing ratings", dataset['rating'].isna().sum())
dataset = dataset[dataset['rating'].notna()]

missing directions 19
missing ingredients 0
missing ratings 11


In [7]:
print("Original Rating Average", dataset['rating'].mean())
print("Original deviation", dataset['rating'].std())
## Normalize the ratings between 0 and 1
if dataset['rating'].mean() > 1.0:
       dataset['rating'] = dataset['rating'] / 5.0
print("\nAverage rating", dataset['rating'].mean())
print("Standard deviation", dataset['rating'].std())

Original Rating Average 3.7130597014925373
Original deviation 1.3431435358354373

Average rating 0.7426119402985074
Standard deviation 0.2686287071670875


In [8]:
## Prepare dataset for embedding

## Where description is None, replace with the text "No description"
dataset['description'] = dataset['description'].fillna('No description')

## Join elements in the ingredients and directions columns with newlines between each element
dataset['ingredients'] = dataset['ingredients'].apply(lambda x: ',\n'.join(x))
dataset['directions'] = dataset['directions'].apply(lambda x: '\n'.join(x))


In [9]:
## Preview the data before running embedder
print(f"Title: {dataset['title'][0]} \nDescription: {dataset['description'][0]} \nIngredients: {dataset['ingredients'][0]} \nInstructions: {dataset['directions'][0]}")

Title: Lentil, Apple, and Turkey Wrap  
Description: No description 
Ingredients: 4 cups low-sodium vegetable or chicken stock,
1 cup dried brown lentils,
1/2 cup dried French green lentils,
2 stalks celery, chopped,
1 large carrot, peeled and chopped,
1 sprig fresh thyme,
1 teaspoon kosher salt,
1 medium tomato, cored, seeded, and diced,
1 small Fuji apple, cored and diced,
1 tablespoon freshly squeezed lemon juice,
2 teaspoons extra-virgin olive oil,
Freshly ground black pepper to taste,
3 sheets whole-wheat lavash, cut in half crosswise, or 6 (12-inch) flour tortillas,
3/4 pound turkey breast, thinly sliced,
1/2 head Bibb lettuce 
Instructions: 1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool.
2. Fol

In [10]:
# 76% of processed_dataset is used for training
train_size = int(0.76 * len(dataset))
# 12% of clean_data is used for validation
val_size = int(0.12 * len(dataset))
# The remaining 12% is used for testing
test_size = len(dataset) - train_size - val_size


train_data, val_data, test_data = torch.utils.data.random_split(
   dataset, [train_size, val_size, test_size]
)

print(f"Total size: {len(dataset)}")
print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")
print(f"Val size: {len(val_data)}")



Total size: 20100
Train size: 15276
Test size: 2412
Val size: 2412


#### Arthur abandoned attempt to get Nvidia GPU working
For future reference and troubleshooting... After I ran the following command and rebooted. I lost all access to KDE/Plasma GUI for Kubuntu and could only use the text only terminals via fn-ctrl-alt-f2

```
> sudo prime-select nvidia
```

In [11]:
# Set the device to run on: GPU, MPS, or CPU
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [12]:
## Create embeddings for the title, description, ingredients and directions
## Use the Universal Sentence Encoder from Tensorflow Hub 
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

title_embeddings = embed(dataset['title'])
description_embeddings = embed(dataset['description'])
ingredients_embeddings = embed(dataset['ingredients'])
directions_embeddings = embed(dataset['directions'])

title_embeddings.shape


2024-02-01 12:56:08.087068: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-02-01 12:56:23.781096: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2566607360 exceeds 10% of free system memory.
2024-02-01 12:56:28.613082: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1502193600 exceeds 10% of free system memory.
2024-02-01 12:56:42.988838: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1502193600 exceeds 10% of free system memory.
2024-02-01 12:56:46.044250: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 7539031040 exceeds 10% of free system memory.


TensorShape([20100, 512])

In [None]:
## Create a PyTorch model to predict the ratings using the four embeddings as inputs
class RecipeRatingsPredictor(nn.Module):
    def __init__(self):
        super(RecipeRatingsPredictor, self).__init__()
        self.fc1 = nn.Linear(512 * 4, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x: Tensor) -> Tensor:
        x = x.view(x.size(0), -1)
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.dropout(torch.relu(self.fc2(x)))
        x = self.dropout(torch.relu(self.fc3(x)))
        x = torch.sigmoid(self.fc4(x))
        return x

In [None]:
## Run the model on the embeddings
model = RecipeRatingsPredictor().to(device)
title_embeddings = title_embeddings.numpy()
description_embeddings = description_embeddings.numpy()
ingredients_embeddings = ingredients_embeddings.numpy()
directions_embeddings = directions_embeddings.numpy()
ratings = dataset['rating'].values

)

