In [1]:
import sys
sys.path.insert(0, "D:\\Documents\\food_recipe_gen\\recipe_1m_analysis")
import os
import pandas as pd
import torch
import utils

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Data preprocessing

In [2]:
# Folder holding the pickled recipe dataset. The default is the original local
# path; set the RECIPE_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
FOLDER_PATH = os.environ.get(
    "RECIPE_DATA_DIR",
    "D:\\Google Drive\\Catherning Folder\\THU\\Thesis\\Recipe datasets\\scirep-cuisines-detail",
)
# Name of the pickle file inside FOLDER_PATH.
FILE = "cleaned_data.pkl"

In [109]:
# Load the pickled recipes and move the stored index into a regular column.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df = pd.read_pickle(os.path.join(FOLDER_PATH, FILE)).reset_index()
# Number of ingredients in each recipe.
df["nb_ingr"] = df["ingredients"].apply(len)

32

In [4]:
# Register every ingredient of every recipe in the ingredient vocabulary.
vocab_ingrs = utils.Vocabulary()
for ingredients in df["ingredients"]:
    for ingr in ingredients:
        vocab_ingrs.add_word(ingr)
# Show the resulting word -> index mapping.
vocab_ingrs.word2idx

{'egg': 0,
 'yeast': 1,
 'wheat': 2,
 'milk': 3,
 'lard': 4,
 'pork': 5,
 'carrot': 6,
 'pea': 7,
 'onion': 8,
 'potato': 9,
 'maple_syrup': 10,
 'almond': 11,
 'honey': 12,
 'oat': 13,
 'date': 14,
 'vegetable_oil': 15,
 'whole_grain_wheat_flour': 16,
 'butter': 17,
 'lovage': 18,
 'clam': 19,
 'thyme': 20,
 'black_pepper': 21,
 'parsley': 22,
 'ginger': 23,
 'bay': 24,
 'celery': 25,
 'cinnamon': 26,
 'mustard': 27,
 'cane_molasses': 28,
 'raisin': 29,
 'cream': 30,
 'asparagus': 31,
 'olive_oil': 32,
 'pepper': 33,
 'garlic': 34,
 'tomato': 35,
 'cilantro': 36,
 'tea': 37,
 'jasmine': 38,
 'vegetable': 39,
 'brown_rice': 40,
 'lemon_juice': 41,
 'soy_sauce': 42,
 'white_wine': 43,
 'chicken': 44,
 'vanilla': 45,
 'rice': 46,
 'mushroom': 47,
 'chicken_broth': 48,
 'basil': 49,
 'porcini': 50,
 'mozzarella_cheese': 51,
 'tuna': 52,
 'lemon': 53,
 'beef': 54,
 'fish': 55,
 'cocoa': 56,
 'green_bell_pepper': 57,
 'oregano': 58,
 'rosemary': 59,
 'coffee': 60,
 'banana': 61,
 'squash': 

In [5]:
# Register each recipe's cuisine label in a separate vocabulary.
vocab_cuisine = utils.Vocabulary()
for cuisine in df["cuisine"]:
    vocab_cuisine.add_word(cuisine)

## Dataset encoding and loaders

In [6]:
# Largest number of ingredients found in a single recipe.
MAX_INGR = int(df["nb_ingr"].max())
MAX_INGR

In [138]:
def ingr2idx(ingr_list, word2idx=None, num_classes=None):
    """Encode a list of ingredient names as a fixed-length count vector.

    Each ingredient is mapped to its vocabulary index, one-hot encoded, and the
    one-hot rows are summed, so position ``k`` of the result holds how many
    times the ingredient with index ``k`` occurs in ``ingr_list``.

    Args:
        ingr_list: iterable of ingredient names.
        word2idx: mapping from ingredient name to integer index. Defaults to
            the module-level ``vocab_ingrs.word2idx``.
        num_classes: length of the returned vector. Defaults to the
            module-level ``INPUT_SIZE``; pass it explicitly when supplying a
            custom ``word2idx``.

    Returns:
        A 1-D ``torch.int64`` tensor of length ``num_classes``.

    Raises:
        KeyError: if an ingredient is missing from ``word2idx``.
    """
    # Globals are looked up at call time, so the defaults may be defined in a
    # later cell than this function.
    if word2idx is None:
        word2idx = vocab_ingrs.word2idx
    if num_classes is None:
        num_classes = INPUT_SIZE

    indices = torch.tensor([word2idx[ingr] for ingr in ingr_list], dtype=torch.long)
    # Summing over the ingredient axis collapses (n_ingredients, num_classes)
    # into a single vector of per-ingredient counts.
    return F.one_hot(indices, num_classes).sum(0)

class RecipesDataset(Dataset):
    """Recipes dataset for cuisine classification (from ingredients only, for now).

    Every sample is a dict ``{'label': <cuisine index>, 'ingredients': <encoded
    ingredient list>}``. Indexing is positional (row ``0`` is the first row of
    the frame regardless of its index labels), which is what PyTorch samplers
    supply.
    """

    def __init__(self, dataframe, label2idx=None, encode_ingredients=None):
        """
        Args:
            dataframe: pandas DataFrame with a "cuisine" column and an
                "ingredients" column.
            label2idx: optional mapping from cuisine name to class index.
                Defaults to the module-level ``vocab_cuisine.word2idx``.
            encode_ingredients: optional callable turning one "ingredients"
                cell into the model input. Defaults to the module-level
                ``ingr2idx``.
        """
        self.data = dataframe
        self._label2idx = label2idx
        self._encode_ingredients = encode_ingredients

    def __len__(self):
        return len(self.data)

    def _build_sample(self, cuisine, ingredients):
        """Build the sample dict for one (cuisine, ingredients) pair."""
        # Module-level defaults are resolved lazily, at access time.
        label2idx = vocab_cuisine.word2idx if self._label2idx is None else self._label2idx
        encode = ingr2idx if self._encode_ingredients is None else self._encode_ingredients
        return {'label': label2idx[cuisine], 'ingredients': encode(ingredients)}

    def __getitem__(self, idx):
        """Return one sample dict for an integer index, or a list of sample
        dicts for a sequence, slice or 1-D tensor of indices."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        cuisines = self.data["cuisine"]
        ingredients = self.data["ingredients"]

        # Any integer-like scalar (int, numpy integer, ...) selects one sample.
        if hasattr(idx, "__index__"):
            position = int(idx)
            return self._build_sample(cuisines.iloc[position], ingredients.iloc[position])

        # Everything else is a collection of positions, including a
        # one-element list, which yields a one-element list of samples.
        positions = idx if isinstance(idx, slice) else list(idx)
        return [
            self._build_sample(cuisine, ingr_list)
            for cuisine, ingr_list in zip(cuisines.iloc[positions], ingredients.iloc[positions])
        ]

In [148]:
# Hold out a fixed-size test set. The split is seeded so that the reported test
# accuracy is measured on the same recipes on every run.
TEST_SIZE = 1000
BATCH_SIZE = 2
SPLIT_SEED = 0

dataset = RecipesDataset(df)
train_d, test_d = torch.utils.data.random_split(
    dataset,
    [len(dataset) - TEST_SIZE, TEST_SIZE],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# Only the training batches are shuffled; evaluation order does not matter.
train_loader = DataLoader(train_d, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_d, batch_size=BATCH_SIZE)

## Model

In [141]:
# Network dimensions.
INPUT_SIZE = len(vocab_ingrs)      # size of the ingredient vocabulary = input features
NUM_CLASSES = len(vocab_cuisine)   # size of the cuisine vocabulary = output classes
HIDDEN_SIZE = 100                  # hidden-layer width

In [142]:
class Net(nn.Module):
    """Fully connected classifier: two ReLU hidden layers followed by a linear output.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        num_classes: number of output scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # All layers keep nn.Linear's default bias term.
        # Idea noted by the author: an nn.Embedding(vocab_size, embedding_dim)
        # input layer, with vocab size = input size and embed dim = hidden size.
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for the batch ``x``."""
        hidden = F.relu(self.layer_1(x))
        hidden = F.relu(self.layer_2(hidden))
        return self.output_layer(hidden)

# Instantiate the classifier with the configured dimensions.
net = Net(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, num_classes=NUM_CLASSES)

## Training

In [143]:
# Cross-entropy loss on the network's raw scores, optimised with SGD + momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), momentum=0.9, lr=0.001)

In [156]:
for epoch in range(10):  # ten full passes over the training set

    running_loss = 0.0  # loss accumulated since the last progress print
    for i, data in enumerate(train_loader):
        # each batch is a dict holding the encoded ingredients and the labels
        inputs, labels = data["ingredients"], data["label"]

        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass (inputs are cast to float for the linear layers),
        # then loss, backward pass and parameter update
        outputs = net(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # report the mean loss over each block of 2000 mini-batches
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

[1,  2000] loss: 1.284
[1,  4000] loss: 1.342
[1,  6000] loss: 1.324
[1,  8000] loss: 1.354
[2,  2000] loss: 1.288
[2,  4000] loss: 1.273
[2,  6000] loss: 1.291
[2,  8000] loss: 1.297
[3,  2000] loss: 1.209
[3,  4000] loss: 1.252
[3,  6000] loss: 1.287
[3,  8000] loss: 1.273
[4,  2000] loss: 1.177
[4,  4000] loss: 1.251
[4,  6000] loss: 1.235
[4,  8000] loss: 1.231
[5,  2000] loss: 1.127
[5,  4000] loss: 1.213
[5,  6000] loss: 1.225
[5,  8000] loss: 1.209
[6,  2000] loss: 1.090
[6,  4000] loss: 1.178
[6,  6000] loss: 1.197
[6,  8000] loss: 1.191
[7,  2000] loss: 1.088
[7,  4000] loss: 1.129
[7,  6000] loss: 1.167
[7,  8000] loss: 1.171
[8,  2000] loss: 1.061
[8,  4000] loss: 1.070
[8,  6000] loss: 1.147
[8,  8000] loss: 1.156
[9,  2000] loss: 1.014
[9,  4000] loss: 1.062
[9,  6000] loss: 1.121
[9,  8000] loss: 1.119
[10,  2000] loss: 0.994
[10,  4000] loss: 1.053
[10,  6000] loss: 1.085
[10,  8000] loss: 1.077
Finished Training


## Testing

In [157]:
correct = 0  # number of test samples classified correctly
total = 0    # number of test samples seen
# Gradients are not needed for evaluation.
with torch.no_grad():
    for data in test_loader:
        inputs = data["ingredients"]
        labels = data["label"]
        outputs = net(inputs.float())
        # Predicted class = index of the largest score in each row. argmax
        # avoids the legacy `.data` attribute and does not assign to `_`, which
        # IPython uses for the last cell output.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# %d truncates the percentage to a whole number.
print('Accuracy of the network on the test dataset: %d %%' % (
    100 * correct / total))


Accuracy of the network on the test dataset: 47 %
