In [1]:
import numpy as np
import pandas as pd
import json

import torch
import torch.nn.functional as F
from torch.nn import RNNCellBase, Parameter
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
class MaskedDataSet(torch.utils.data.Dataset):


    def __init__(self, partial_recipe, goal, ingredients, masked):
        self.X = partial_recipe
        self.y = masked
        self.goal = goal
        self.ingredients = ingredients

        assert len(self.X) == len(self.y), print("Number of examples don't match up")

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index], self.goal[index], self.ingredients[index]

class Model(torch.nn.Module):
    def __init__(self, wv_matrix):
        super(Model, self).__init__()
        vocab_size, embedding_size = wv_matrix.shape
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.embedding.weight.data.copy_(torch.from_numpy(wv_matrix))
        self.embedding.weight.requires_grad = False
        hidden_size = embedding_size
        self.cgru = CustomChecklistCell(embedding_size, hidden_size)
        self.fc = torch.nn.Linear(embedding_size, vocab_size)

    def forward(self, recipe, g, ingr):
        recipe_embed = self.embedding(recipe)
        goal_embed = self.embedding(g)
        ingr_embed = self.embedding(ingr)
        goal_embed = goal_embed.sum(axis=1)
        output, ht, a, E_t_new = self.cgru(recipe_embed, goal_embed, ingr_embed)
        logits = self.fc(output)
        return logits, ht, a, E_t_new

class CustomChecklistCell(RNNCellBase):

    def __init__(self, input_size, hidden_size, bias=True, batch_first=True, beta=5, gamma=2):
        super(CustomChecklistCell, self).__init__(input_size, hidden_size, bias, num_chunks=5)

        self.batch_first = batch_first

        self.Z = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.Y = Parameter(torch.Tensor(hidden_size, hidden_size))
        self.U_g = Parameter(torch.Tensor(hidden_size, hidden_size))

        self.z_bias = Parameter(torch.ones(hidden_size))
        self.y_bias = Parameter(torch.ones(hidden_size))

        self.beta = beta
        self.gamma = gamma

        self.S = Parameter(torch.ones(3, hidden_size))
        self.P = Parameter(torch.ones(hidden_size, hidden_size))

        self.reset_parameters()

    def init_hidden(self, g):
        return F.linear(g, self.U_g)

    def cell(self, inp, hidden, g, E_t_new, activation=F.tanh):

        w_ih = self.weight_ih
        w_hh = self.weight_hh
        b_ih = self.bias_ih
        b_hh = self.bias_hh
        Y = self.Y
        Z = self.Z
        y_bias = self.y_bias
        z_bias = self.z_bias

        # Calculate all W @ inp. w_ih is actually 5 different weights
        gi = F.linear(inp, w_ih, b_ih)

        # Calcualtes all W @ hidden, w_hh is actually 5 different weights
        gh = F.linear(hidden, w_hh, b_hh)

        # Split them into seperate terms
        i_reset, i_update, i_new, i_goal, i_item = gi.chunk(5, 1)
        h_reset, h_update, h_new, h_goal, h_item = gh.chunk(5, 1)

        # Update Gate
        z_t = F.sigmoid(i_update + h_update)

        # Reset Gate
        r_t = F.sigmoid(i_reset + h_reset)

        # Goal Select Gate
        s_t = F.sigmoid(i_goal + h_goal)

        # Item Select Gate
        q_t = F.sigmoid(i_item + h_item)

        # That term
        tmp = torch.einsum('mlk -> mk', E_t_new)

        # New Gate
        h_tilde_t = activation(i_new
                               + r_t * h_new
                               + s_t * F.linear(g, Y, y_bias)
                               + q_t * F.linear(tmp, Z, z_bias))

        # tp1: t plus 1
        h_tp1 = h_tilde_t + z_t * (hidden - h_tilde_t)

        return h_tp1

    def attention(self, ht, a, E):
        ref_type = torch.nn.Softmax()(F.linear(self.beta * ht, self.S))
        h_proj = F.linear(ht, self.P)

        E_new = torch.einsum('ml, mlk -> mlk', 1 - a, E)
        alpha_new = self.gamma * torch.einsum('mk,mlk->ml', h_proj, E_new)
        alpha_new = torch.nn.Softmax()(alpha_new)
        c_new = torch.einsum('mlk, ml -> mk', E, alpha_new)

        E_used = torch.einsum('ml, mlk -> mlk', a, E)
        alpha_used = self.gamma * torch.einsum('mk,mlk->ml', h_proj, E_used)
        alpha_used = torch.nn.Softmax()(alpha_used)
        c_used = torch.einsum('mlk, ml -> mk', E, alpha_used)

        c_gru = h_proj

        out = ref_type[:, 0].reshape(-1, 1)*c_gru + ref_type[:, 1].reshape(-1, 1)*c_new + ref_type[:, 2].reshape(-1, 1)*c_used
        a = a + ref_type[:, 1].reshape(-1, 1) * alpha_new

        return out, a, E_new

    def forward(self, inp, g, E):

        # E will need padding as well so that it can be minibatched
        '''
        inp: (examples, seq_length, inp_dim) assuming batch_first
        g: (examples, hidden_dim)
        E: (examples, agenda length, hidden_dim)
        '''
        L = E.shape[1]

        ht = self.init_hidden(g)
        a = torch.zeros(inp.shape[0], L).to(device)

        if len(inp.shape) == 3:
            if self.batch_first:
                inp = inp.transpose(0, 1)

        E_t_new = torch.einsum('ml, mlk -> mlk', 1-a, E)

        lst_o = list()
        ot = torch.zeros_like(inp[0])
        zero = torch.zeros_like(inp[0])
        for t in np.arange(inp.shape[0]):

            if not torch.allclose(inp[t], zero):
                ht, ot, a, E_t_new = self.step(inp[t], g, E, ht, a, E_t_new)

            lst_o.append(ot)

        output = torch.stack(lst_o)

        if self.batch_first:
            output = output.transpose(0, 1)

        return output, ht, a, E_t_new

    def step(self, inp, g, E, ht=None, a=None, E_t_new=None):

        if ht is None:
            ht = self.init_hidden(g)

        if a is None:
            L = E.shape[1]
            a = torch.zeros(inp.shape[0], L).to(device)
            E_t_new = torch.einsum('ml, mlk -> mlk', 1 - a, E)

        ht = self.cell(
            inp, ht, g, E_t_new
        )

        ot, a, E_t_new = self.attention(ht, a, E)

        return ht, ot, a, E_t_new

In [3]:
def tokenize_sentence(sentence, word2idx, sent_type='goal'):
    unk_index = word2idx['UNK']
    sos_index = word2idx['SOS']
    eos_index = word2idx['EOS']
    
    tokenized_sentences = []
    
    list_of_tokens = []
    if sent_type == 'recipe':
        list_of_tokens.append(sos_index)
        
    sentence = sentence.split(' ') if type(sentence) is type('x') else sentence
    
    for word in sentence:
        if word in word2idx:
            list_of_tokens.append(word2idx[word])
        else:
            list_of_tokens.append(unk_index)
    
    if sent_type == 'recipe':
        list_of_tokens.append(eos_index)
    
    return list_of_tokens

In [4]:
def tokens_to_sent(tokens, append=True):
    words = []
    for token in tokens:
        if token != 0:
            try:
                words.append(idx2word[token])
            except:
                words.append(idx2word[token.item()])
                
    return ' '.join(words) if append else words

In [5]:
def generate_text(g, E, method='greedy'):

    # EXPERIMENTAL!
    '''
    inputs: 
    g (1, seq_len)
    E (1, num_of_ingredients)
    method: random or greedy
    '''

    tokens = []
    words = []

    a = None
    E_t_new = None
    ht = None

    g = model.embedding(g)
    E = model.embedding(E)
    g = g.sum(axis=1)

    token = torch.tensor([word2idx['SOS']]).to(device)
    tokens.append(token)

    for i in range(1000):
        inp = model.embedding(tokens[i].squeeze())
        inp = inp[None, :]

        ht, ot, a, E_t_new = model.cgru.step(inp, g, E, ht, a, E_t_new)
        logits = model.fc(ot)
        
        
        #print(logits.shape)
        
        if method == 'greedy':
            out = model.fc(ot)
            token = torch.argmax(out)
        elif method == 'random':
            dist = torch.distributions.categorical.Categorical(logits=logits[0])
            token = dist.sample()
        
        tokens.append(token)
        words.append(idx2word[token.item()])
        
    return ' '.join(words)

In [6]:
def beam_search_with_start_token(g, E, N=3, start_token='SOS', ht=None, a=None, E_t_new=None, MAX_LEN = 50):
    tokens = []
    words = []

    g_emb = model.embedding(g)
    E_emb = model.embedding(E)
    g_emb = g_emb.sum(axis=1)
    
    # First dct creation
    
    token = torch.tensor([word2idx[start_token]]).to(device)
    tokens.append(token)

    inp = model.embedding(tokens[0].squeeze())
    inp = inp[None, :]
    ht, ot, a, E_t_new = model.cgru.step(inp, g_emb, E_emb, ht, a, E_t_new)
    logits = model.fc(ot)
    log_softmax = torch.nn.LogSoftmax()
    probs = log_softmax(logits)
    values, indices = torch.topk(probs, N)

    dct = dict()
    for n in range(N):
        dct[f'seq{n}'] = dict()
        dct[f'seq{n}']['tokens'] = tokens + [indices.squeeze()[n]]
        dct[f'seq{n}']['log_prob'] = values.squeeze().tolist()[n]
        dct[f'seq{n}']['prev'] = (ht, a, E_t_new)
        
    del tokens, words

    # Actual Beam Search
    vocab_size = len(probs.squeeze())

    for j in range(MAX_LEN - 1):

        lst_probs = []
        lst_prevs = []

        for n in range(N):
            tokens = dct[f'seq{n}']['tokens']
            ht, a, E_t_new =  dct[f'seq{n}']['prev']
            prev_prob = dct[f'seq{n}']['log_prob']


            inp = model.embedding(tokens[-1].squeeze())
            inp = inp[None, :]

            ht, ot, a, E_t_new = model.cgru.step(inp, g_emb, E_emb, ht, a, E_t_new)
            logits = model.fc(ot)
            probs = log_softmax(logits)

            lst_probs.append(probs + prev_prob)
            lst_prevs.append((ht, a, E_t_new))

        values, indices = torch.stack(lst_probs).squeeze().reshape(-1).topk(N)

        idxs = indices % vocab_size
        seqs = indices // vocab_size
        
        new_dct = {}
        for i, n in enumerate(seqs):
            tokens = dct[f'seq{n}']['tokens'] + [idxs[i]]
            ht, a, E_t_new =  lst_prevs[n]
            current_prob = values[i].item()

            new_dct[f'seq{i}'] = dict()
            new_dct[f'seq{i}']['tokens'] = tokens
            new_dct[f'seq{i}']['log_prob'] = current_prob
            new_dct[f'seq{i}']['prev'] = (ht, a, E_t_new)
            

        dct = new_dct
        
        if dct[f'seq0']['tokens'][-1].item() == 10343:
            break

        assert len(dct) == N
        
    dct['seq0']['tokens'] = [w.item() for w in dct['seq0']['tokens']]
        
    return dct['seq0']['tokens']


def mask_sentence(tokens, mask=0.5):
    idx = int((1 - mask) * len(tokens))
    inp = tokens[:idx]
    masked = tokens[idx:]
    return inp, masked

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
emb_mat = np.load('emb_mat.npy')

a_file = open("word2idx.json", "r")
word2idx = json.load(a_file)
a_file.close()

idx2word = {v:k for k,v in word2idx.items()}

model = Model(emb_mat).to(device)
model.load_state_dict(torch.load('classifier_new_10.pt'))

<All keys matched successfully>

In [8]:
df_train = pd.read_pickle('train_fixed.pkl')
df_test = pd.read_pickle('test_fixed.pkl')

In [9]:
df_train[['0.5_inp', '0.5_out']] = pd.DataFrame(df_train['tokenized_instructions']\
                                       .apply(lambda x: mask_sentence(x, mask=0.5))\
                                       .tolist(), index=df_train.index)

df_train[['0.2_inp', '0.8_out']] = pd.DataFrame(df_train['tokenized_instructions']\
                                       .apply(lambda x: mask_sentence(x, mask=0.8))\
                                       .tolist(), index=df_train.index)

In [10]:
df_test[['0.5_inp', '0.5_out']] = pd.DataFrame(df_test['tokenized_instructions']\
                                       .apply(lambda x: mask_sentence(x, mask=0.5))\
                                       .tolist(), index=df_test.index)

df_test[['0.2_inp', '0.8_out']] = pd.DataFrame(df_test['tokenized_instructions']\
                                       .apply(lambda x: mask_sentence(x, mask=0.8))\
                                       .tolist(), index=df_test.index)

In [11]:
goal_test = pad_sequences(df_test['tokenized_goal'])
ingr_test = pad_sequences(df_test['tokenized_ingredients'])

recipe_half_test = pad_sequences(df_test['0.5_inp'])
recipe_fifth_test = pad_sequences(df_test['0.2_inp'])

label_half_test = pad_sequences(df_test['0.5_out'])
label_fifth_test = pad_sequences(df_test['0.8_out'])


dataloader_params = {'batch_size': 1, 'shuffle': True, 'num_workers': 6}

test_half_data = MaskedDataSet(recipe_half_test, goal_test, ingr_test, label_half_test)
test_half_generator = torch.utils.data.DataLoader(test_half_data, **dataloader_params)

test_fifth_data = MaskedDataSet(recipe_fifth_test, goal_test, ingr_test, label_fifth_test)
test_fifth_generator = torch.utils.data.DataLoader(test_fifth_data, **dataloader_params)

In [12]:
def beam_search(g, E, N=3, start_sent='SOS', start_tokens=None, MAX_LEN = 200):
    tokens = []
    words = []

    g_emb = model.embedding(g)
    E_emb = model.embedding(E)
    g_emb = g_emb.sum(axis=1)
    
    # First dct creation
    
    if start_tokens is not None:
        ot, ht, a, E_t_new = model.forward(start_tokens, g, E)
        # Get logits for next word
        logits = ot[:, -1, :]
    elif start_sent is not None:
        inp = model.embedding(torch.tensor(tokenize_sentence(start_sent, word2idx)).to(device))
        ht, ot, a, E_t_new = model.cgru.step(inp, g_emb, E_emb)
        # Get logits for next word
        logits = model.fc(ot)
    else:
        raise Exception('Please provide tokens or a sentence as input!')
        
    log_softmax = torch.nn.LogSoftmax()
    probs = log_softmax(logits)
    values, indices = torch.topk(probs, N)

    dct = dict()
    for n in range(N):
        dct[f'seq{n}'] = dict()
        dct[f'seq{n}']['tokens'] = tokens + [indices.squeeze()[n]]
        dct[f'seq{n}']['log_prob'] = values.squeeze().tolist()[n]
        dct[f'seq{n}']['prev'] = (ht, a, E_t_new)
        
    del tokens, words

    # Actual Beam Search
    vocab_size = len(probs.squeeze())

    
    #print(tokens)
    for j in range(MAX_LEN - 1):

        lst_probs = []
        lst_prevs = []

        for n in range(N):
            tokens = dct[f'seq{n}']['tokens']
            ht, a, E_t_new =  dct[f'seq{n}']['prev']
            prev_prob = dct[f'seq{n}']['log_prob']

            
            inp = model.embedding(tokens[-1].squeeze())
            inp = inp[None, :]


            ht, ot, a, E_t_new = model.cgru.step(inp, g_emb, E_emb, ht, a, E_t_new)
            logits = model.fc(ot)
            probs = log_softmax(logits)

            lst_probs.append(probs + prev_prob)
            lst_prevs.append((ht, a, E_t_new))

        values, indices = torch.stack(lst_probs).squeeze().reshape(-1).topk(N)

        idxs = indices % vocab_size
        seqs = indices // vocab_size
        
        new_dct = {}
        for i, n in enumerate(seqs):
            tokens = dct[f'seq{n}']['tokens'] + [idxs[i]]
            ht, a, E_t_new =  lst_prevs[n]
            current_prob = values[i].item()

            new_dct[f'seq{i}'] = dict()
            new_dct[f'seq{i}']['tokens'] = tokens
            new_dct[f'seq{i}']['log_prob'] = current_prob
            new_dct[f'seq{i}']['prev'] = (ht, a, E_t_new)
            

        dct = new_dct
        
        if dct[f'seq0']['tokens'][-1].item() == 10343:
            break

        assert len(dct) == N
        
    dct['seq0']['tokens'] = [w.item() for w in dct['seq0']['tokens']]
        
    return dct['seq0']['tokens']

In [13]:
# dct = {
#     'generated_text': [],
#     'goal': [],
#     'ingredients': [],
#     'label': []
# }

# c = 0

# model.eval()
# with torch.no_grad():
#     for data in test_half_generator:
#         recipe, label, goal, ingr = data
#         recipe, label = recipe.type(torch.LongTensor).to(device), label.type(torch.LongTensor).to(device)
#         goal, ingr = goal.type(torch.LongTensor).to(device), ingr.type(torch.LongTensor).to(device)

#         pred = beam_search(goal, ingr, start_tokens=recipe)

#         dct['generated_text'].append(tokens_to_sent(pred).split(' '))
#         dct['goal'].append(tokens_to_sent(goal.squeeze()))
#         dct['ingredients'].append(tokens_to_sent(ingr.squeeze(), append=False))
#         dct['label'].append(tokens_to_sent(label.squeeze()).split(' '))
        
#         c += 1

In [14]:
dct = {
    'generated_text': [],
    'goal': [],
    'ingredients': [],
    'label': []
}

c = 0

model.eval()
with torch.no_grad():
    for data in test_fifth_generator:
        recipe, label, goal, ingr = data
        recipe, label = recipe.type(torch.LongTensor).to(device), label.type(torch.LongTensor).to(device)
        goal, ingr = goal.type(torch.LongTensor).to(device), ingr.type(torch.LongTensor).to(device)

        pred = beam_search(goal, ingr, start_tokens=recipe)

        dct['generated_text'].append(tokens_to_sent(pred).split(' '))
        dct['goal'].append(tokens_to_sent(goal.squeeze()))
        dct['ingredients'].append(tokens_to_sent(ingr.squeeze(), append=False))
        dct['label'].append(tokens_to_sent(label.squeeze()).split(' '))
        
        c += 1

  ref_type = torch.nn.Softmax()(F.linear(self.beta * ht, self.S))
  alpha_new = torch.nn.Softmax()(alpha_new)
  alpha_used = torch.nn.Softmax()(alpha_used)
  probs = log_softmax(logits)
  probs = log_softmax(logits)


In [21]:
with open('80_percent_masked.json', 'w') as f:
    json.dump(dct, f)