# Alejandro Simon
## CPSC 482 Coding Project
### March 3rd, 2023

# A Neural Network Approach to Predicting Ingredient Synergy

Predicts user ratings based on recipe information

## CODE USED BELOW
### Slightly different variations of the code below were used for the other models mentioned in the methods section

In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

# Details on the dataset can be found at https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions
# For nutrition information:
# [calories (#), total fat (PDV), 
# sugar (PDV) , sodium (PDV) , protein (PDV) 
# , saturated fat (PDV) , and carbohydrates (PDV)]

# Load the dataset
path = r"/Users/aleja/Documents/School/CPSC 482"
data_path = path + r"/archive"

users = pd.read_csv(data_path+"/RAW_interactions.csv")
recipes = pd.read_csv(data_path+"/RAW_recipes.csv")

In [2]:
# Get rid of unnecessary columns of data
to_drop = ['name', 'contributor_id', 'submitted', 'steps']
for header in to_drop:
    recipes = recipes.drop(header,axis=1)
recipes.head(1)

Unnamed: 0,id,minutes,tags,nutrition,n_steps,description,ingredients,n_ingredients
0,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [3]:
# Merge the users and recipes dataframes
users = users.rename(columns={'recipe_id': 'id'})
users = users.groupby('id').mean()
print(len(users))
df = pd.merge(users, recipes, on="id")

231637


  users = users.groupby('id').mean()


In [4]:
# Drop more of the unnecessary columns of data

for header in users.head():
    if header != 'id' and header != 'rating':
        users = users.drop(header,axis=1)

for header in recipes.head():
    if header != 'id' and header != 'nutrition' and header != 'ingredients' and header != 'tags':
        recipes = recipes.drop(header,axis=1)

In [5]:
# Average and round the user ratings
users = users.round(0)
users['rating'] = users['rating'].astype(int)

In [7]:
train = pd.merge(users, recipes, on="id")

# Change the ratings into binary (1 for successful with an average score of five out of five 
# or 0 for unsuccesful with less ideal reviews)
train['rating'][train['rating'] <= 4] = 0
train['rating'][train['rating'] > 4] = 1

def vectorize(inp):
    """
    Vectorizes input
    """
    pre_vect = [] # Store all of the possible inputs (get size and name)
    for vecs in inp:
        for vec in vecs:
            if vec not in pre_vect:
                pre_vect.append(vec)
                
    n = len(pre_vect)
    vect = np.zeros((train.shape[0],n)).tolist() # This will contain 1 or 0 if the element is contained for a given recipe
    for i,vecs in enumerate(inp):
        for vec in vecs:
            vect[i][pre_vect.index(vec)] = 1
        vect[i] = normalize([vect[i]])[0]
    return vect

# Vectorize all of the possible inputs
vect_ingr = vectorize(train['ingredients'])
vect_tags = vectorize(train['tags'])

# We don't actually use these because they don't help the model much
# but I kept the code here for completeness
vect_min = df['minutes'].to_numpy().tolist()
vect_nsteps = df['n_steps'].to_numpy().tolist()
vect_ningr = df['n_ingredients'].to_numpy().tolist()
# vect_desc = vectorize(recipes['description'])

# Drop now unnecessary columns of data
train = train.drop('id',axis=1)
train = train.drop('ingredients',axis=1)
train = train.drop('tags',axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['rating'][train['rating'] <= 4] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['rating'][train['rating'] > 4] = 1


In [8]:
# Find the appropriate dimensions for the tensor
n = max(len(vect_ingr[0]), len(vect_tags[0]))

In [9]:
# Get x and y data
x, y = train.drop('rating',axis=1), train['rating'].to_numpy().tolist()

# Convert the string that holds the list of ingredients to a list
for i in range(np.shape(x)[0]):
    x['nutrition'][i] = normalize([eval(x['nutrition'][i])])[0]

In [10]:
x['nutrition'][0] # Double check output

array([0.81628611, 0.01432919, 0.57316754, 0.0047764 , 0.02865838,
       0.02865838, 0.05731675])

In [11]:
# Actually define the tensor for training and testing
# Depending on what data we want we comment out rows of x_t and change the size of x_t 
# and the input shape to the model
x_t = np.zeros((len(x),3,n)).tolist()
for i,ing in enumerate(vect_ingr):
    x_t[i][0] = np.pad(x['nutrition'][i],(0,n-len(x['nutrition'][i])),'constant',constant_values=(0,0)).tolist()
    x_t[i][1] = ing
    x_t[i][2] = np.pad(vect_tags[i],(0,n-len(vect_tags[i])),'constant',constant_values=(0,0)).tolist()
    # x_t[i][3] = np.pad([vect_min[i]],(0,n-1),'constant',constant_values=(0,0)).tolist()
    # x_t[i][4] = np.pad([vect_nsteps[i]],(0,n-1),'constant',constant_values=(0,0)).tolist()
    # x_t[i][5] = np.pad([vect_ningr[i]],(0,n-1),'constant',constant_values=(0,0)).tolist()

In [12]:
# Split the data between a train and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_t, y, test_size=0.4)

In [13]:
# Define the model as outlined in the methods section
from keras import backend as K

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(3,n)),
    # keras.layers.Dense(1024, activation='relu'),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Here we benefit from a lower learning rate
K.set_value(model.optimizer.learning_rate, 1e-6)

2023-03-01 09:09:57.821202: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# Perform training as outlined in the method and results sections
model.fit(np.asarray(x_train), np.asarray(y_train), epochs=1000, batch_size=64, shuffle=True, validation_split=0.4)
test_loss, test_acc = model.evaluate(np.asarray(x_test), np.asarray(y_test), verbose=2)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [15]:
# Test some predictions
predictions_single = model.predict(np.asarray(x_train))



In [16]:
# Convert probabilities to actual predictions
pred = [predictions_single[i].argmax() for i in range(len(predictions_single))]
np.sum(np.array(pred) - np.array(y_train)) / len(y_train)

0.2241657193017801