In [37]:
import sys
sys.path.insert(0, "D:\\Documents\\food_recipe_gen\\recipe_1m_analysis")
import os
import pandas as pd
import torch
import utils
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader, Sampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Data preprocessing

In [38]:
FOLDER_PATH = "D:\\Google Drive\\Catherning Folder\\THU\\Thesis\\Recipe datasets\\"
DATASET = ["scirep-cuisines-detail","Yummly28"]
FILES = ["random_data.pkl","cluster_centroid_data.pkl","full_data.pkl"]
balanced = True

In [54]:
# TODO put together both datasets (same method of undersampling, or undersample afterwards ?) & test
df=pd.read_pickle(os.path.join(FOLDER_PATH,DATASET[1],FILES[0]))
df=df.reset_index()
df

Unnamed: 0,level_0,index,id,cuisine,ingredients,all_ingredients
0,0,0,0,Italian,"[baby bella mushroom sliced, arborio rice, oli...",baby bella mushroom sliced;arborio rice;olive ...
1,1,1,1,Barbecue,"[pork country style rib fat trimmed, soy sauce...",pork country style rib fat trimmed;soy sauce;w...
2,2,2,2,Italian,"[garlic, extra-virgin olive oil divided, madei...",garlic;extra-virgin olive oil divided;madeira ...
3,3,3,3,French,"[garlic halved, cooking spray, peeled red pota...",garlic halved;cooking spray;peeled red potato;...
4,4,4,4,Barbecue,"[lean ground beef, worcestershire sauce, liqui...",lean ground beef;worcestershire sauce;liquid s...
...,...,...,...,...,...,...
17065,27615,27627,27627,Thai,"[coconut milk, creamy peanut butter, + yellow ...",coconut milk;creamy peanut butter;+ yellow cur...
17066,27618,27630,27630,Italian,"[linguine, kosher salt, olive oil, garlic chop...",linguine;kosher salt;olive oil;garlic chopped ...
17067,27621,27633,27633,Italian,"[zwiebeln, knoblauchzehen, dose dicke weisse b...",zwiebeln;knoblauchzehen;dose dicke weisse bohn...
17068,27623,27635,27635,French,"[gr. bread flour, ml. water, gr. yeast, gr. st...",gr. bread flour;ml. water;gr. yeast;gr. strong...


In [55]:
vocab_ingrs = utils.Vocabulary()
for ingredients in df.loc[:,"ingredients"]:
    for ingr in ingredients:
        vocab_ingrs.add_word(ingr)
vocab_ingrs.add_word("<unk>")
vocab_ingrs.word2idx

{'baby bella mushroom sliced': 0,
 'arborio rice': 1,
 'olive oil': 2,
 'butter': 3,
 'shallot minced': 4,
 'white wine': 5,
 'fat free chicken stock': 6,
 'salt pepper': 7,
 'grated parmesan cheese': 8,
 'chopped parsley': 9,
 'pork country style rib fat trimmed': 10,
 'soy sauce': 11,
 'white vinegar': 12,
 'lemon juice': 13,
 'brown sugar': 14,
 'garlic crushed': 15,
 'black pepper': 16,
 'crushed red pepper flake': 17,
 'garlic': 18,
 'extra-virgin olive oil divided': 19,
 'madeira divided': 20,
 'unsalted chicken stock divided': 21,
 'dried porcini mushroom': 22,
 'chopped onion': 23,
 'thinly sliced cremini mushroom': 24,
 'thinly sliced shiitake mushroom cap': 25,
 'uncooked arborio rice': 26,
 'parmesan cheese grated': 27,
 'kosher salt': 28,
 'freshly ground black pepper': 29,
 'chopped fresh sage': 30,
 'sage': 31,
 'garlic halved': 32,
 'cooking spray': 33,
 'peeled red potato': 34,
 'butter melted': 35,
 'salt': 36,
 'shredded gruyere cheese': 37,
 'fat-free milk': 38,
 'le

In [56]:
vocab_cuisine = utils.Vocabulary()
for cuisine in df['cuisine'].value_counts().index:
    vocab_cuisine.add_word(cuisine)
vocab_cuisine.word2idx

{'Italian': 0,
 'Mexican': 1,
 'Asian': 2,
 'French': 3,
 'Indian': 4,
 'American': 5,
 'Kid-Friendly': 6,
 'Southwestern': 7,
 'Thai': 8,
 'Barbecue': 9,
 'Chinese': 10,
 'Southern & Soul Food': 11,
 'Greek': 12,
 'Mediterranean': 13,
 'Spanish_Portuguese': 14,
 'Cuban': 15,
 'Cajun & Creole': 16,
 'English_Irish': 17,
 'Moroccan': 18,
 'Japanese': 19}

## Data processing

In [57]:
def ingr2idx(ingr_list):
    # If I didn't do the one-hot encoding by myself and used directly an embedding layer in the net, 
    # I would have to pad the input
    input_=[]
    for ingr in ingr_list:
        try:
            input_.append(vocab_ingrs.word2idx[ingr])
        except KeyError:
            input_.append(vocab_ingrs.word2idx["<unk>"])
    input_ = torch.LongTensor(input_)
    onehot_enc = F.one_hot(input_.to(torch.int64), INPUT_SIZE)
    output = torch.sum(onehot_enc,0)
    return output

class RecipesDataset(Dataset):
    """Recipes dataset for cuisine classification. Only from ingredients for now"""

    def __init__(self, input_,labels):
        """
        Args:
            file (string): Path to the file
        """
        self.input_ = input_
        self.labels = labels

    def __len__(self):
        return len(self.input_)

    def __getitem__(self, idx):
        # XXX useful ?
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        ingr_one_hot = ingr2idx(self.input_.loc[idx,"ingredients"])
        label = vocab_cuisine.word2idx[self.labels.loc[idx,"cuisine"]]

        return ingr_one_hot, label

In [58]:
def make_weights_for_balanced_classes(samples, nclasses): 
    count = [0] * nclasses
    weight_per_class = [0.] * nclasses
    N = len(samples)
    
    for a,el in samples.value_counts().items():
        count[vocab_cuisine.word2idx[a]]=el

    for i in range(nclasses): 
        weight_per_class[i] = max(count)/float(count[i]) # divide by max count[i] ? Or just different scale, order is same
        print(vocab_cuisine.idx2word[i], weight_per_class[i])
    weight = [0] * N
    
    for idx, val in enumerate(samples): 
        weight[idx] = weight_per_class[vocab_cuisine.word2idx[val]] 
        
    
    return torch.Tensor(weight_per_class), torch.DoubleTensor(weight)

# from https://gist.github.com/srikarplus/15d7263ae2c82e82fe194fc94321f34e

In [59]:
INPUT_SIZE = len(vocab_ingrs)
EMBED_DIM1 = 300
EMBED_DIM2 = 64
NUM_CLASSES = len(vocab_cuisine) #51 or 20
BATCH_SIZE = 100
PRINT_FREQ = 20

Weighted random sampling, with stratified split for the train and test dataset. But loss doesn't decrease (need to see more epochs ?)

In [60]:
#TODO when switch to python file, can put num_workers & have to put if __name__ == '__main__':
X_train, X_test, y_train, y_test = train_test_split(df["ingredients"],df["cuisine"], test_size=0.1, random_state=42,stratify=df["cuisine"])

X_train = X_train.reset_index()
X_test = X_test.reset_index()
y_train = y_train.reset_index()
y_test = y_test.reset_index()

train_dataset = RecipesDataset(X_train,y_train)
test_dataset = RecipesDataset(X_test,y_test)

if balanced:
    weights_classes, weights_labels = make_weights_for_balanced_classes(y_train["cuisine"], len(vocab_cuisine.word2idx)) 
    print(len(weights_labels))
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights_labels, len(weights_labels)) 
    train_loader = DataLoader(train_dataset,batch_size = BATCH_SIZE, sampler = sampler)#, pin_memory=True)
else:
    train_loader = DataLoader(train_dataset,batch_size = BATCH_SIZE)#, pin_memory=True)    

test_loader = DataLoader(test_dataset,batch_size = 1)

Italian 1.0
Mexican 1.346644844517185
Asian 2.3414911781445644
French 2.926031294452347
Indian 3.1380625476735315
American 3.8958333333333335
Kid-Friendly 4.750577367205543
Southwestern 7.618518518518519
Thai 13.185897435897436
Barbecue 13.532894736842104
Chinese 17.213389121338913
Southern & Soul Food 30.25
Greek 57.138888888888886
Mediterranean 63.292307692307695
Spanish_Portuguese 76.18518518518519
Cuban 146.92857142857142
Cajun & Creole 257.125
English_Irish 316.46153846153845
Moroccan 411.4
Japanese 457.1111111111111
15363


In [61]:
# # Random split, but loss decreases...

# dataset = RecipesDataset(df[["ingredients","cuisine"]],df[["ingredients","cuisine"]])
# train_d,test_d = torch.utils.data.random_split(dataset, [len(dataset)-1000,1000])
# train_loader = DataLoader(train_d,batch_size = BATCH_SIZE, shuffle = True)
# test_loader = DataLoader(test_d, batch_size=1)

# Model

In [68]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim1, embedding_dim2, num_classes):
        super(Net, self).__init__()
        self.layer_1 = nn.Linear(vocab_size, embedding_dim1, bias=True)
        self.layer_2 = nn.Linear(embedding_dim1, embedding_dim1, bias=True)
#         self.layer_3 = nn.Linear(embedding_dim1, embedding_dim2, bias=True)
        self.output_layer = nn.Linear(embedding_dim1, num_classes, bias=True)

    def forward(self, x):
        out = F.relu(self.layer_1(x))
        out = F.relu(self.layer_2(out))
#         out = F.relu(self.layer_3(out))
        out = self.output_layer(out)
        return out

net = Net(INPUT_SIZE, EMBED_DIM1, EMBED_DIM2, NUM_CLASSES)

## Training

In [69]:
if balanced:
    criterion = nn.CrossEntropyLoss(weights_classes)
else:
    criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9) #change to Adam ?

In [70]:
for epoch in range(30):

    running_loss = 0.0
    correct = 0
    total = 0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[0]
        labels = data[1]
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # print statistics
        running_loss += loss.item()
        if i % PRINT_FREQ == PRINT_FREQ-1:    # print every 2000 mini-batches
            print(f'[Epoch {epoch + 1}, Iteration {i + 1}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
    
    print(f'Accuracy of the network on epoch {epoch+1}: {100 * correct / total:.3f}')

print('Finished Training')

[Epoch 1, Iteration 20] loss: 0.029
[Epoch 1, Iteration 40] loss: 0.027
[Epoch 1, Iteration 60] loss: 0.024
[Epoch 1, Iteration 80] loss: 0.022
[Epoch 1, Iteration 100] loss: 0.021
[Epoch 1, Iteration 120] loss: 0.019
[Epoch 1, Iteration 140] loss: 0.018
Accuracy of the network on epoch 1: 7.902
[Epoch 2, Iteration 20] loss: 0.014
[Epoch 2, Iteration 40] loss: 0.013
[Epoch 2, Iteration 60] loss: 0.012
[Epoch 2, Iteration 80] loss: 0.010
[Epoch 2, Iteration 100] loss: 0.009
[Epoch 2, Iteration 120] loss: 0.008
[Epoch 2, Iteration 140] loss: 0.007
Accuracy of the network on epoch 2: 19.859
[Epoch 3, Iteration 20] loss: 0.007
[Epoch 3, Iteration 40] loss: 0.007
[Epoch 3, Iteration 60] loss: 0.006
[Epoch 3, Iteration 80] loss: 0.005
[Epoch 3, Iteration 100] loss: 0.005
[Epoch 3, Iteration 120] loss: 0.005
[Epoch 3, Iteration 140] loss: 0.005
Accuracy of the network on epoch 3: 25.086
[Epoch 4, Iteration 20] loss: 0.005
[Epoch 4, Iteration 40] loss: 0.004
[Epoch 4, Iteration 60] loss: 0.005

[Epoch 28, Iteration 20] loss: 0.001
[Epoch 28, Iteration 40] loss: 0.001
[Epoch 28, Iteration 60] loss: 0.001
[Epoch 28, Iteration 80] loss: 0.001
[Epoch 28, Iteration 100] loss: 0.001
[Epoch 28, Iteration 120] loss: 0.001
[Epoch 28, Iteration 140] loss: 0.001
Accuracy of the network on epoch 28: 60.678
[Epoch 29, Iteration 20] loss: 0.001
[Epoch 29, Iteration 40] loss: 0.001
[Epoch 29, Iteration 60] loss: 0.001
[Epoch 29, Iteration 80] loss: 0.001
[Epoch 29, Iteration 100] loss: 0.001
[Epoch 29, Iteration 120] loss: 0.001
[Epoch 29, Iteration 140] loss: 0.001
Accuracy of the network on epoch 29: 60.340
[Epoch 30, Iteration 20] loss: 0.001
[Epoch 30, Iteration 40] loss: 0.001
[Epoch 30, Iteration 60] loss: 0.001


KeyboardInterrupt: 

## Testing

In [None]:
# df_test=pd.read_pickle(os.path.join(FOLDER_PATH,DATASET[0],FILES[1]))
# df_test_s=df_test.sample(1000)
# df_test_s=df_test_s.reset_index()

# test_dataset=RecipesDataset(df_test_s)

# test_loader = DataLoader(test_dataset, batch_size=1)#, sampler = sampler)
# df_test_s

In [None]:
def f2_score(y_true, y_pred, threshold=0.5):
    return fbeta_score(y_true, y_pred, 2, threshold)


def fbeta_score(y_true, y_pred, beta, threshold, eps=1e-9):
    beta2 = beta**2

    y_pred = torch.ge(y_pred.float(), threshold).float()
    y_true = y_true.float()

    true_positive = (y_pred * y_true).sum(dim=1)
    precision = true_positive.div(y_pred.sum(dim=1).add(eps))
    recall = true_positive.div(y_true.sum(dim=1).add(eps))

    return torch.mean(
        (precision*recall).
        div(precision.mul(beta2) + recall + eps).
        mul(1 + beta2))

In [71]:
correct = 0
total = 0
all_predict = []
all_labels = []
with torch.no_grad():
    for data in test_loader:
        inputs = data[0]
        labels = data[1]
        outputs = net(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        all_predict.append(predicted)
        all_labels.append(labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(predicted)

print(f'Accuracy of the network on the test dataset: {100 * correct / total:.3f} %')

one_hot_pred = F.one_hot(torch.LongTensor(all_predict).to(torch.int64), NUM_CLASSES)
one_hot_lab = F.one_hot(torch.LongTensor(all_labels).to(torch.int64), NUM_CLASSES)
fbeta_pytorch = f2_score(one_hot_pred, one_hot_lab)

print(f'Score is {100* fbeta_pytorch:.3f} %')

tensor([11])
Accuracy of the network on the test dataset: 7.323 %
Score is 7.323 %
