In [2]:
# https://bastings.github.io/annotated_encoder_decoder/

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as funct
import os
import pickle

In [3]:
class Vocabulary(object):
    """Simple vocabulary wrapper."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word, idx=None):
        if idx is None:
            if not word in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1
            return self.idx
        else:
            if not word in self.word2idx:
                self.word2idx[word] = idx
                if idx in self.idx2word.keys():
                    self.idx2word[idx].append(word)
                else:
                    self.idx2word[idx] = [word]

                return idx

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<pad>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [4]:
folder = "D:\\Documents\\food_recipe_gen\\recipe_1m_analysis"
files = ["allingrs_count.pkl","allwords_count.pkl","recipe1m_train.pkl",
         "recipe1m_vocab_ingrs.pkl","recipe1m_vocab_toks.pkl"]

with open(os.path.join(folder,"data",files[3]),'rb') as f:
    vocab_ingrs=pickle.load(f)

with open(os.path.join(folder,"data",files[4]),'rb') as f:
    vocab_toks=pickle.load(f)
    
with open(os.path.join(folder,"data",files[0]),'rb') as f:
    ingrs_count=pickle.load(f)
    
with open(os.path.join(folder,"data",files[1]),'rb') as f:
    tokens_count=pickle.load(f)
    
with open(os.path.join(folder,"data",files[2]),'rb') as f:
    recipes=pickle.load(f)

In [5]:
# class RecipesDataset(Dataset):
#     """Face Landmarks dataset."""

#     def __init__(self, file):
#         """
#         Args:
#             file (string): Path to the file
#         """
#         with open(file,'rb') as f:
#             self.data=pickle.load(f)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         if torch.is_tensor(idx):
#             idx = idx.tolist()
        
#         sample = {'title': self.data[idx]["title"], 
#                   'ingredients': self.data[idx]["ingredients"],
#                   'instructions':self.data[idx]["tokenized"],}

#         return sample

In [6]:
# recipe_dataset=RecipesDataset(os.path.join(folder,"data",files[2]))
# dataset_loader = torch.utils.data.DataLoader(recipe_dataset,
#                                              batch_size=32, shuffle=True,
#                                              num_workers=4)

In [7]:
# recipe=recipe_dataset[0]
# ingr=recipe["ingredients"]
# ingr_idx=[]
# for el in ingr:
#     ingr_idx.append(vocab_ingrs.word2idx[el])

# one_hot_enc = torch.nn.functional.one_hot(torch.LongTensor(ingr_idx), max(vocab_ingrs.idx2word.keys())) # size=(4,7,n)
# one_hot_enc.shape

In [8]:
ingrs_count

Counter({'elbow_macaroni': 2214,
         'american_cheese': 1577,
         'celery': 24610,
         'green_peppers': 7053,
         'pimentos': 335,
         'mayonnaise': 23453,
         'vinegar': 8903,
         'salt': 211868,
         'dry_dill_weed': 104,
         'tomatoes': 29987,
         'kosher_salt': 23488,
         'red_onion': 9626,
         'green_bell_pepper': 5559,
         'red_bell_pepper': 7237,
         'yellow_bell_pepper': 1509,
         'cucumber': 3808,
         'olive_oil': 97975,
         'fresh_basil': 8784,
         'watermelon_gelatin': 4,
         'boiling_water': 7961,
         'cool_whip': 2715,
         'seedless_watermelon': 336,
         'graham_cracker_crust': 568,
         'shredded_coconut': 1834,
         'lean_ground_beef': 5082,
         'fresh_garlic': 1573,
         'salt_and_black_pepper': 4000,
         'lemon_juice': 33064,
         'soy_sauce': 24806,
         'cornstarch': 17727,
         'pineapple_chunks': 1465,
         'mandarin_ora

In [9]:
ingrs_count.most_common(20)

[('salt', 211868),
 ('butter', 142550),
 ('sugar', 124625),
 ('olive_oil', 97975),
 ('water', 91932),
 ('eggs', 88945),
 ('garlic_cloves', 77069),
 ('milk', 63222),
 ('onion', 61999),
 ('flour', 61783),
 ('onions', 54554),
 ('all_-_purpose_flour', 51937),
 ('brown_sugar', 45666),
 ('egg', 45631),
 ('pepper', 43565),
 ('baking_powder', 43398),
 ('salt_and_pepper', 41458),
 ('vegetable_oil', 39474),
 ('baking_soda', 35932),
 ('vanilla_extract', 33882)]

In [10]:
tokens_count.most_common(30)

[('.', 5526316),
 (',', 3646076),
 ('and', 3298408),
 ('the', 3215253),
 ('in', 1680957),
 ('a', 1651714),
 ('to', 1628121),
 ('with', 1186772),
 ('until', 932030),
 ('add', 835463),
 ('of', 809827),
 ('minutes', 737664),
 ('for', 722802),
 ('heat', 568673),
 ('or', 562675),
 ('on', 530642),
 ('into', 510204),
 ('over', 459298),
 (';', 386917),
 ('bowl', 384456),
 ('cook', 371581),
 ('stir', 366431),
 ('mix', 344792),
 ('mixture', 341912),
 ('pan', 328432),
 ('salt', 326056),
 (')', 319568),
 ('(', 318020),
 ('about', 315881),
 ('is', 307990)]

In [11]:
recipes[0].keys()

dict_keys(['id', 'instructions', 'tokenized', 'ingredients', 'title'])

In [13]:
avg_ingrs=0
for recipe in recipes:
    avg_ingrs+=len(recipe["ingredients"])
avg_ingrs/=len(recipes)
avg_ingrs

9.993764306575105

In [16]:
len(vocab_ingrs.idx2word.keys())

{0: '<end>',
 1: ['elbow_macaroni',
  'macaroni',
  'cooked_macaroni',
  'macaroni_and_cheese_mix',
  'corkscrew_macaroni',
  'whole_wheat_elbow_macaroni',
  'macaroni_and_cheese_dinner_mix',
  'cooked_elbow_macaroni',
  'hamburger_helper_cheeseburger_macaroni',
  'cooked_salad_macaroni',
  'wagon_wheel_macaroni',
  'shell_macaroni',
  'whole_wheat_macaroni',
  'small_elbow_macaroni',
  'salad_macaroni',
  'dry_pasta_shell_macaroni',
  'low_-_carb_elbow_macaroni',
  'macaroni_and_white_cheddar_cheese_mix',
  'cooked_corkscrew_macaroni',
  'multi_colored_shell_macaroni',
  'tri_-_colored_corkscrew_macaroni',
  'macaroni_and_cheese_powdered_topping',
  'hamburger_helper_lasagna_macaroni'],
 2: ['american_cheese',
  'kraft_grated_parmesan_cheese',
  'cheddar_cheese',
  'shredded_cheddar_cheese',
  'cream_cheese',
  'whipped_cream_cheese',
  'cheese',
  'mozzarella_cheese',
  'parmesan_cheese',
  'goat_cheese',
  'velveeta_cheese',
  'ricotta_cheese',
  'philadelphia_cream_cheese',
  'proc

In [17]:
len(vocab_ingrs.idx2word.keys())

1044

In [20]:
avg_clust_size=0
for v in vocab_ingrs.idx2word.values():
    avg_clust_size+=len(v)
avg_clust_size/=(len(vocab_ingrs.idx2word.keys())-1) #for <end>. should also remove other special tokens
avg_clust_size

14.087248322147651