In [98]:
import sys
sys.path.insert(0, "D:\\Documents\\food_recipe_gen\\recipe_1m_analysis")
import os
import pandas as pd
import torch
import utils
import numpy as np

from torch.utils.data import Dataset, DataLoader, Sampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Data preprocessing

In [184]:
FOLDER_PATH = "D:\\Google Drive\\Catherning Folder\\THU\\Thesis\\Recipe datasets\\"
DATASET = ["scirep-cuisines-detail","Yummly28"]
FILES = ["cleaned_data.pkl","full_data.pkl"]

In [186]:
df=pd.read_pickle(os.path.join(FOLDER_PATH,DATASET[1],FILES[1]))
df=df.reset_index()
df

Unnamed: 0,index,cuisine,id,ingredients,all_ingredients,american_id,nb_ingr
0,0,Canada,0,"[egg, yeast, wheat, milk, lard]",egg;yeast;wheat;milk;lard,,5
1,1,Canada,1,"[pork, carrot, pea, onion, potato]",pork;carrot;pea;onion;potato,,5
2,2,Canada,2,[maple_syrup],maple_syrup,,1
3,3,Canada,3,"[wheat, yeast, almond, honey, oat, date, veget...",wheat;yeast;almond;honey;oat;date;vegetable_oi...,,8
4,4,Canada,4,"[butter, lovage, clam, wheat, onion, thyme, po...",butter;lovage;clam;wheat;onion;thyme;potato;ye...,,16
...,...,...,...,...,...,...,...
57686,57686,Italian,2453,"[kiwi, olive_oil, clam, white_wine, orange, sa...",kiwi;olive_oil;clam;white_wine;orange;salmon;f...,,17
57687,57687,Italian,2454,"[tomato, butter, beef, onion, red_wine, black_...",tomato;butter;beef;onion;red_wine;black_pepper...,,11
57688,57688,Italian,2455,"[vegetable, wheat, egg, cheese, olive_oil]",vegetable;wheat;egg;cheese;olive_oil,,5
57689,57689,Italian,2456,"[tomato, clam, black_pepper, parsley, celery, ...",tomato;clam;black_pepper;parsley;celery;macaroni,,6


In [187]:
vocab_ingrs = utils.Vocabulary()
for ingredients in df.loc[:,"ingredients"]:
    for ingr in ingredients:
        vocab_ingrs.add_word(ingr)
vocab_ingrs.word2idx

{'egg': 0,
 'yeast': 1,
 'wheat': 2,
 'milk': 3,
 'lard': 4,
 'pork': 5,
 'carrot': 6,
 'pea': 7,
 'onion': 8,
 'potato': 9,
 'maple_syrup': 10,
 'almond': 11,
 'honey': 12,
 'oat': 13,
 'date': 14,
 'vegetable_oil': 15,
 'whole_grain_wheat_flour': 16,
 'butter': 17,
 'lovage': 18,
 'clam': 19,
 'thyme': 20,
 'black_pepper': 21,
 'parsley': 22,
 'ginger': 23,
 'bay': 24,
 'celery': 25,
 'cinnamon': 26,
 'mustard': 27,
 'cane_molasses': 28,
 'raisin': 29,
 'cream': 30,
 'asparagus': 31,
 'olive_oil': 32,
 'pepper': 33,
 'garlic': 34,
 'tomato': 35,
 'cilantro': 36,
 'tea': 37,
 'jasmine': 38,
 'vegetable': 39,
 'brown_rice': 40,
 'lemon_juice': 41,
 'soy_sauce': 42,
 'white_wine': 43,
 'chicken': 44,
 'vanilla': 45,
 'rice': 46,
 'mushroom': 47,
 'chicken_broth': 48,
 'basil': 49,
 'porcini': 50,
 'mozzarella_cheese': 51,
 'tuna': 52,
 'lemon': 53,
 'beef': 54,
 'fish': 55,
 'cocoa': 56,
 'green_bell_pepper': 57,
 'oregano': 58,
 'rosemary': 59,
 'coffee': 60,
 'banana': 61,
 'squash': 

In [188]:
vocab_cuisine = utils.Vocabulary()
for cuisine in df.loc[:,"cuisine"]:
    vocab_cuisine.add_word(cuisine)

In [189]:
# # df["cuisine_id"]=vocab_cuisine.word2idx[df.loc["cuisine"]]
# class_vector=[]
# for el in df["cuisine"]:
#     class_vector.append(vocab_cuisine.word2idx[el])
# class_vector = torch.Tensor(class_vector)

## Data processing

In [191]:
def ingr2idx(ingr_list):
    # If I didn't do the one-hot encoding by myself and used directly an embedding layer in the net, 
    # I would have to pad the input
    input_=[]
    for ingr in ingr_list:
        input_.append(vocab_ingrs.word2idx[ingr])
    input_ = torch.LongTensor(input_)
    onehot_enc = F.one_hot(input_.to(torch.int64), INPUT_SIZE)
    output = torch.sum(onehot_enc,0)
    return output

class RecipesDataset(Dataset):
    """Recipes dataset for cuisine classification. Only from ingredients for now"""

    def __init__(self, dataframe):
        """
        Args:
            file (string): Path to the file
        """
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        ingr = ingr2idx(self.data.loc[idx,"ingredients"])
        label = vocab_cuisine.word2idx[self.data.loc[idx,"cuisine"]]

        return ingr, label

In [192]:
# class StratifiedSampler(Sampler):
#     """Stratified Sampling
#     Provides equal representation of target classes in each batch
#     """
#     def __init__(self, class_vector, batch_size):
#         """
#         Arguments
#         ---------
#         class_vector : torch tensor
#             a vector of class labels
#         batch_size : integer
#             batch_size
#         """
#         self.n_splits = int(class_vector.size(0) / batch_size)
#         self.class_vector = class_vector

#     def gen_sample_array(self):
#         try:
#             from sklearn.model_selection import StratifiedShuffleSplit
#         except:
#             print('Need scikit-learn for this functionality')
        
#         s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.2)
#         X = th.randn(self.class_vector.size(0),2).numpy()
#         y = self.class_vector.numpy()
#         s.get_n_splits(X, y)

#         train_index, test_index = next(s.split(X, y))
#         return np.hstack([train_index, test_index])

#     def __iter__(self):
#         return iter(self.gen_sample_array())

#     def __len__(self):
#         return len(self.class_vector)

In [193]:
# def sampleFromClass(ds, k):
#     class_counts = {}
#     train_data = []
#     train_label = []
#     test_data = []
#     test_label = []
#     for data, label in ds:
#         c = label
#         class_counts[c] = class_counts.get(c, 0) + 1
#         if class_counts[c] <= k:
#             train_data.append(data)
#             train_label.append(label)
#         else:
#             test_data.append(data)
#             test_label.append(label)
#     train_data = torch.cat(train_data)
#     for ll in train_label:
#         print(ll)
#     train_label = torch.cat(train_label)
#     test_data = torch.cat(test_data)
#     test_label = torch.cat(test_label)

#     return (TensorDataset(train_data, train_label), 
#         TensorDataset(test_data, test_label))

In [200]:
INPUT_SIZE=len(vocab_ingrs)
EMBED_DIM1 = 300
EMBED_DIM2 = 64
NUM_CLASSES = len(vocab_cuisine) #51
BATCH_SIZE = 100
PRINT_FREQ = 20

In [195]:
dataset = RecipesDataset(df[["ingredients","cuisine"]])
# sampler = StratifiedSampler(class_vector, BATCH_SIZE)
train_d,test_d = torch.utils.data.random_split(dataset, [len(dataset)-1000,1000])
# train_ds, test_ds = sampleFromClass(dataset, 5)
train_loader = DataLoader(train_d,batch_size = BATCH_SIZE, shuffle = True)
test_loader = DataLoader(test_d, batch_size=1)

# Model

In [196]:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim1, embedding_dim2, num_classes):
        super(Net, self).__init__()
        self.layer_1 = nn.Linear(vocab_size, embedding_dim1, bias=True)
        self.layer_2 = nn.Linear(embedding_dim1, embedding_dim1, bias=True)
        self.layer_3 = nn.Linear(embedding_dim1, embedding_dim2, bias=True)
        self.output_layer = nn.Linear(embedding_dim2, num_classes, bias=True)

    def forward(self, x):
        out = F.relu(self.layer_1(x))
        out = F.relu(self.layer_2(out))
        out = F.relu(self.layer_3(out))
        out = self.output_layer(out)
        return out

net = Net(INPUT_SIZE, EMBED_DIM1, EMBED_DIM2, NUM_CLASSES)

## Training

In [197]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [198]:
for epoch in range(10):

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[0]
        labels = data[1]
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % PRINT_FREQ == PRINT_FREQ-1:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

[1,  2000] loss: 1.809
[1,  4000] loss: 1.470
[1,  6000] loss: 1.414
[1,  8000] loss: 1.402
[1, 10000] loss: 1.348
[1, 12000] loss: 1.315
[1, 14000] loss: 1.252
[1, 16000] loss: 1.222
[1, 18000] loss: 1.236
[1, 20000] loss: 1.169
[1, 22000] loss: 1.160
[1, 24000] loss: 1.210
[1, 26000] loss: 1.141
[1, 28000] loss: 1.220
[1, 30000] loss: 1.132
[1, 32000] loss: 1.104
[1, 34000] loss: 1.151
[1, 36000] loss: 1.172
[1, 38000] loss: 1.182
[1, 40000] loss: 1.146
[1, 42000] loss: 1.080
[1, 44000] loss: 1.067
[1, 46000] loss: 1.058
[1, 48000] loss: 1.101
[1, 50000] loss: 1.061
[1, 52000] loss: 1.088
[1, 54000] loss: 1.024
[1, 56000] loss: 0.999
[2,  2000] loss: 1.062
[2,  4000] loss: 1.046
[2,  6000] loss: 1.067
[2,  8000] loss: 1.023
[2, 10000] loss: 1.033
[2, 12000] loss: 1.029
[2, 14000] loss: 1.056
[2, 16000] loss: 1.057
[2, 18000] loss: 1.027
[2, 20000] loss: 1.051
[2, 22000] loss: 1.053
[2, 24000] loss: 1.043
[2, 26000] loss: 1.064
[2, 28000] loss: 1.067
[2, 30000] loss: 1.068
[2, 32000] 

## Testing

In [199]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs = data[0]
        labels = data[1]
        outputs = net(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the test dataset: %d %%' % (
    100 * correct / total))


Accuracy of the network on the test dataset: 73 %
