In [161]:
import sys
sys.path.insert(0, "D:\\Documents\\food_recipe_gen\\recipe_1m_analysis")
import os
import pandas as pd
import torch
import utils
import numpy as np

from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader, Sampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Data preprocessing

In [106]:
FOLDER_PATH = "D:\\Google Drive\\Catherning Folder\\THU\\Thesis\\Recipe datasets\\"
DATASET = ["scirep-cuisines-detail","Yummly28"]
FILES = ["cleaned_data.pkl","full_data.pkl"]

In [107]:
for i in range(1,2):
    df=pd.read_pickle(os.path.join(FOLDER_PATH,DATASET[i],FILES[1]))
    df=df.reset_index()
df

Unnamed: 0,index,id,cuisine,ingredients,all_ingredients
0,0,0,Italian,"[baby bella mushroom sliced, arborio rice, oli...",baby bella mushroom sliced;arborio rice;olive ...
1,1,1,Barbecue,"[pork country style rib fat trimmed, soy sauce...",pork country style rib fat trimmed;soy sauce;w...
2,2,2,Italian,"[garlic, extra-virgin olive oil divided, madei...",garlic;extra-virgin olive oil divided;madeira ...
3,3,3,French,"[garlic halved, cooking spray, peeled red pota...",garlic halved;cooking spray;peeled red potato;...
4,4,4,Barbecue,"[lean ground beef, worcestershire sauce, liqui...",lean ground beef;worcestershire sauce;liquid s...
...,...,...,...,...,...
27621,27633,27633,Italian,"[zwiebeln, knoblauchzehen, dose dicke weisse b...",zwiebeln;knoblauchzehen;dose dicke weisse bohn...
27622,27634,27634,American,"[fresh lavender blossom, sugar, carrot juice, ...",fresh lavender blossom;sugar;carrot juice;pota...
27623,27635,27635,French,"[gr. bread flour, ml. water, gr. yeast, gr. st...",gr. bread flour;ml. water;gr. yeast;gr. strong...
27624,27636,27636,American,"[flour, unsweetened cocoa powder, baking soda,...",flour;unsweetened cocoa powder;baking soda;sal...


In [153]:
vocab_ingrs = utils.Vocabulary()
for ingredients in df.loc[:,"ingredients"]:
    for ingr in ingredients:
        vocab_ingrs.add_word(ingr)
vocab_ingrs.add_word("<unk>")
vocab_ingrs.word2idx

{'baby bella mushroom sliced': 0,
 'arborio rice': 1,
 'olive oil': 2,
 'butter': 3,
 'shallot minced': 4,
 'white wine': 5,
 'fat free chicken stock': 6,
 'salt pepper': 7,
 'grated parmesan cheese': 8,
 'chopped parsley': 9,
 'pork country style rib fat trimmed': 10,
 'soy sauce': 11,
 'white vinegar': 12,
 'lemon juice': 13,
 'brown sugar': 14,
 'garlic crushed': 15,
 'black pepper': 16,
 'crushed red pepper flake': 17,
 'garlic': 18,
 'extra-virgin olive oil divided': 19,
 'madeira divided': 20,
 'unsalted chicken stock divided': 21,
 'dried porcini mushroom': 22,
 'chopped onion': 23,
 'thinly sliced cremini mushroom': 24,
 'thinly sliced shiitake mushroom cap': 25,
 'uncooked arborio rice': 26,
 'parmesan cheese grated': 27,
 'kosher salt': 28,
 'freshly ground black pepper': 29,
 'chopped fresh sage': 30,
 'sage': 31,
 'garlic halved': 32,
 'cooking spray': 33,
 'peeled red potato': 34,
 'butter melted': 35,
 'salt': 36,
 'shredded gruyere cheese': 37,
 'fat-free milk': 38,
 'le

In [109]:
vocab_cuisine = utils.Vocabulary()
for cuisine in df['cuisine'].value_counts().index:
    vocab_cuisine.add_word(cuisine)
vocab_cuisine.word2idx

{'American': 0,
 'Italian': 1,
 'Mexican': 2,
 'Asian': 3,
 'French': 4,
 'Indian': 5,
 'Kid-Friendly': 6,
 'Southwestern': 7,
 'Thai': 8,
 'Barbecue': 9,
 'Chinese': 10,
 'Southern & Soul Food': 11,
 'Greek': 12,
 'Mediterranean': 13,
 'Spanish_Portuguese': 14,
 'Cuban': 15,
 'Cajun & Creole': 16,
 'English_Irish': 17,
 'Moroccan': 18,
 'Japanese': 19}

In [110]:
# # df["cuisine_id"]=vocab_cuisine.word2idx[df.loc["cuisine"]]
# class_vector=[]
# for el in df["cuisine"]:
#     class_vector.append(vocab_cuisine.word2idx[el])
# class_vector = torch.Tensor(class_vector)

## Data processing

In [250]:
def ingr2idx(ingr_list):
    # If I didn't do the one-hot encoding by myself and used directly an embedding layer in the net, 
    # I would have to pad the input
    input_=[]
    for ingr in ingr_list:
        try:
            input_.append(vocab_ingrs.word2idx[ingr])
        except KeyError:
            input_.append(vocab_ingrs.word2idx["<unk>"])
    input_ = torch.LongTensor(input_)
    onehot_enc = F.one_hot(input_.to(torch.int64), INPUT_SIZE)
    output = torch.sum(onehot_enc,0)
    return output

class RecipesDataset(Dataset):
    """Recipes dataset for cuisine classification. Only from ingredients for now"""

    def __init__(self, input_,labels):
        """
        Args:
            file (string): Path to the file
        """
        self.input_ = input_
        self.labels = labels

    def __len__(self):
        return len(self.input_)

    def __getitem__(self, idx):
        # XXX useful ?
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        ingr_one_hot = ingr2idx(self.input_.loc[idx,"ingredients"])
        label = vocab_cuisine.word2idx[self.labels.loc[idx,"cuisine"]]

        return ingr_one_hot, label

In [288]:
def make_weights_for_balanced_classes(samples, nclasses): 
    count = [0] * nclasses
    weight_per_class = [0.] * nclasses
    N = len(samples)
    
    for a,el in samples.value_counts().items():
        count[vocab_cuisine.word2idx[a]]=el

    for i in range(nclasses): 
        weight_per_class[i] = N/float(count[i]) # divide by max count[i] ? Or just different scale, order is same
        print(vocab_cuisine.idx2word[i], weight_per_class[i])
    weight = [0] * N
    
    for idx, val in enumerate(samples): 
        weight[idx] = weight_per_class[vocab_cuisine.word2idx[val]] 
        
    
    return torch.DoubleTensor(weight)

# from https://gist.github.com/srikarplus/15d7263ae2c82e82fe194fc94321f34e

In [243]:
INPUT_SIZE = len(vocab_ingrs)
EMBED_DIM1 = 300
EMBED_DIM2 = 64
NUM_CLASSES = len(vocab_cuisine) #51
BATCH_SIZE = 100
PRINT_FREQ = 20

Weighted random sampling, with stratified split for the train and test dataset. But loss doesn't decrease (need to see more epochs ?)

In [289]:
#TODO when switch to python file, can put num_workers & have to put if __name__ == '__main__':
X_train, X_test, y_train, y_test = train_test_split(df["ingredients"],df["cuisine"], test_size=0.1, random_state=42,stratify=df["cuisine"])
weights = make_weights_for_balanced_classes(y_train, len(vocab_cuisine.word2idx)) 
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights)) 

X_train = X_train.reset_index()
X_test = X_test.reset_index()
y_train = y_train.reset_index()
y_test = y_test.reset_index()

train_dataset = RecipesDataset(X_train,y_train)
test_dataset = RecipesDataset(X_test,y_test)

train_loader = DataLoader(train_dataset,batch_size = BATCH_SIZE, sampler = sampler)#, pin_memory=True)
test_loader = DataLoader(test_dataset,batch_size = 1)

American 2.35534293292914
Italian 6.043509965969859
Mexican 8.138461538461538
Asian 14.150825270347182
French 17.683499288762448
Indian 18.964912280701753
Kid-Friendly 28.71016166281755
Southwestern 46.04259259259259
Thai 79.68910256410257
Barbecue 81.78618421052632
Chinese 104.02928870292887
Southern & Soul Food 182.81617647058823
Greek 345.31944444444446
Mediterranean 382.5076923076923
Spanish_Portuguese 460.4259259259259
Cuban 887.9642857142857
Cajun & Creole 1553.9375
English_Irish 1912.5384615384614
Moroccan 2486.3
Japanese 2762.5555555555557


In [286]:
y_train["cuisine"].value_counts()

American                10556
Italian                  4114
Mexican                  3055
Asian                    1757
French                   1406
Indian                   1311
Kid-Friendly              866
Southwestern              540
Thai                      312
Barbecue                  304
Chinese                   239
Southern & Soul Food      136
Greek                      72
Mediterranean              65
Spanish_Portuguese         54
Cuban                      28
Cajun & Creole             16
English_Irish              13
Moroccan                   10
Japanese                    9
Name: cuisine, dtype: int64

In [267]:
# Random split, but loss decreases...
dataset = RecipesDataset(df[["ingredients","cuisine"]],df[["ingredients","cuisine"]])
train_d,test_d = torch.utils.data.random_split(dataset, [len(dataset)-1000,1000])
train_loader = DataLoader(train_d,batch_size = BATCH_SIZE, shuffle = True)
test_loader = DataLoader(test_d, batch_size=1)

# Model

In [279]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim1, embedding_dim2, num_classes):
        super(Net, self).__init__()
        self.layer_1 = nn.Linear(vocab_size, embedding_dim1, bias=True)
        self.layer_2 = nn.Linear(embedding_dim1, embedding_dim1, bias=True)
        self.layer_3 = nn.Linear(embedding_dim1, embedding_dim2, bias=True)
        self.output_layer = nn.Linear(embedding_dim2, num_classes, bias=True)

    def forward(self, x):
        out = F.relu(self.layer_1(x))
        out = F.relu(self.layer_2(out))
        out = F.relu(self.layer_3(out))
        out = self.output_layer(out)
        return out

net = Net(INPUT_SIZE, EMBED_DIM1, EMBED_DIM2, NUM_CLASSES)

## Training

In [280]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [281]:
for epoch in range(3):

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[0]
        labels = data[1]
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # print statistics
        running_loss += loss.item()
        if i % PRINT_FREQ == PRINT_FREQ-1:    # print every 2000 mini-batches
            print('[Epoch %d, Iteration %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    
    print('Accuracy of the network on epoch %d: %d %%' % (epoch+1, 100 * correct / total))

print('Finished Training')

[Epoch 1, Iteration    20] loss: 0.030
[Epoch 1, Iteration    40] loss: 0.030
[Epoch 1, Iteration    60] loss: 0.030
[Epoch 1, Iteration    80] loss: 0.030
[Epoch 1, Iteration   100] loss: 0.030
[Epoch 1, Iteration   120] loss: 0.030
[Epoch 1, Iteration   140] loss: 0.030
[Epoch 1, Iteration   160] loss: 0.030
[Epoch 1, Iteration   180] loss: 0.030
[Epoch 1, Iteration   200] loss: 0.030
[Epoch 1, Iteration   220] loss: 0.030
[Epoch 1, Iteration   240] loss: 0.030
Accuracy of the network on epoch 1: 6 %
[Epoch 2, Iteration    20] loss: 0.030
[Epoch 2, Iteration    40] loss: 0.030
[Epoch 2, Iteration    60] loss: 0.030
[Epoch 2, Iteration    80] loss: 0.030
[Epoch 2, Iteration   100] loss: 0.030
[Epoch 2, Iteration   120] loss: 0.030
[Epoch 2, Iteration   140] loss: 0.030
[Epoch 2, Iteration   160] loss: 0.030
[Epoch 2, Iteration   180] loss: 0.030
[Epoch 2, Iteration   200] loss: 0.030
[Epoch 2, Iteration   220] loss: 0.030
[Epoch 2, Iteration   240] loss: 0.030
Accuracy of the network 

## Testing

In [282]:
# df_test=pd.read_pickle(os.path.join(FOLDER_PATH,DATASET[0],FILES[1]))
# df_test_s=df_test.sample(1000)
# df_test_s=df_test_s.reset_index()

# test_dataset=RecipesDataset(df_test_s)

# test_loader = DataLoader(test_dataset, batch_size=1)#, sampler = sampler)
# df_test_s

In [283]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs = data[0]
        labels = data[1]
        outputs = net(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test dataset: %d %%' % (
    100 * correct / total))

Accuracy of the network on the test dataset: 0 %
