In this notebook we'll learn how to sample new texts with deep learning!

Imports for everything we might need:

In [None]:
import sys, json, codecs, csv
import numpy as np
import random
import pandas as pd
import tqdm
import itertools
import seaborn as sns

# visualization
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  

In [None]:
from keras.layers import Dense, Activation
from keras.layers import LSTM, GRU, Dropout, BatchNormalization
from keras.layers import Input, Dense, Embedding
from keras.models import Sequential
from keras.optimizers import RMSprop, Adam

# word2vec
from gensim.models import word2vec

In [None]:
# Load the recipes: every CSV row after the header carries a JSON document in
# its third column. Recipes without a 'recipeCategory' are skipped; for the
# rest we collect name, description, category, the joined instruction text and
# the cook time into parallel lists (same index = same recipe).
# NOTE(review): no encoding is passed to codecs.open, so decoding falls back to
# the platform default -- confirm that this matches the dataset file.
names = []
descriptions = []
categories = []
instructions = []
cookTimes = []

with codecs.open('../data/edimdoma_dataset.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no-op on an empty file)

    for line in reader:
        recipe_info = json.loads(line[2])

        # Cook time: -1 when absent. Otherwise strip a leading 'PT' and a
        # trailing 'H'; a value that is then purely digits becomes an int,
        # anything else is kept as the remaining string.
        cookTime = -1
        if 'cookTime' in recipe_info:
            cookTime = recipe_info['cookTime']
            if cookTime.startswith('PT'):
                cookTime = cookTime[2:]
            if cookTime.endswith('H'):
                cookTime = cookTime[:-1]
            if cookTime.isdigit():
                cookTime = int(cookTime)

        # Recipes without a category are dropped entirely.
        if 'recipeCategory' not in recipe_info:
            continue

        category = recipe_info['recipeCategory'].strip()
        name = recipe_info['name'].strip()
        description = recipe_info['description'].strip()
        # Individual instruction steps may be None; those are ignored.
        recipeInstructions = [
            step.strip()
            for step in recipe_info['recipeInstructions']
            if step is not None
        ]

        names.append(name)
        descriptions.append(description)
        categories.append(category)
        instructions.append(' '.join(recipeInstructions))
        cookTimes.append(cookTime)

In [None]:
# Report how many instruction texts were collected from the dataset.
text_count = len(instructions)
print("number of texts: {}".format(text_count))

Let's review one element of our dataset:

In [None]:
# Show the first collected recipe field by field, with a dashed rule between
# consecutive fields (none before the first or after the last).
separator = "---------------------------------"
sections = [
    ("Name:", names),
    ("Description:", descriptions),
    ("Category:", categories),
    ("Instruction:", instructions),
]
for position, (title, values) in enumerate(sections):
    if position > 0:
        print(separator)
    print(title)
    print(values[0])

# Generation of new recipes with deep learning!

![img](http://vsekidki.ru/uploads/posts/2017-03/1490096075_uffu7xomszo.jpg)

Let's prepare the training text and the character vocabulary:

In [None]:
# Character-level corpus: all instruction texts concatenated with no separator,
# truncated to the first 300000 characters.
text = "".join(instructions)
text = text[:300000]

# Vocabulary: the distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
print('total chars:', len(chars))

# Lookup from a character to its position in `chars`.
char2id = {symbol: position for position, symbol in enumerate(chars)}

Create the model:

In [None]:
# Assemble the character-level language model as a Keras Sequential stack.
print('Build model...')
model = Sequential()
# Exercise placeholder (not valid Python as written): the layers that feed the
# output Dense layer below are meant to be added here.
<your code here>
# Output layer: one unit per vocabulary character, turned into a probability
# distribution by the softmax activation.
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
# RMSprop with a 0.01 learning rate; categorical cross-entropy pairs with the
# one-hot targets that get_batches produces.
# NOTE(review): newer Keras releases spell this argument `learning_rate` --
# confirm against the installed version.
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
# Length, in characters, of each input window fed to the model.
seq_size = 40

Here we generate our batches:

In [None]:
def get_batches(batch_count=128, batch_size=64):
    """Yield `batch_count` randomly sampled training batches.

    Each batch row is a window of `seq_size` consecutive characters of `text`
    starting at a random position; its target is the one-hot encoding (over
    `chars`) of the character that immediately follows the window.

    Args:
        batch_count: number of batches to yield.
        batch_size: number of windows per batch.

    Yields:
        (batch_x, batch_y): arrays of shape (batch_size, seq_size) and
        (batch_size, len(chars)).

    NOTE(review): the same two arrays are allocated once and overwritten for
    every batch, so a batch must be consumed before the next one is requested
    (collecting the generator into a list would alias a single pair of arrays).
    """
    batch_x = np.zeros((batch_size, seq_size))
    batch_y = np.zeros((batch_size, len(chars)))
    for bi in range(batch_count):
        for seq_index in range(batch_size):
            # Random window start; the upper bound keeps pos + seq_size (the
            # target character) inside `text`, with some spare margin.
            pos = random.randint(0, len(text) - seq_size - 10)
            sequence = text[pos:pos+seq_size]
            next_char = text[pos+seq_size]
            for i in range(seq_size):
                # Exercise placeholder (not valid Python as written): the id of
                # the i-th character of the window is meant to be stored here.
                batch_x[seq_index,i] = <initialize with character ids>
            # Clear the previous one-hot row before setting the new target.
            batch_y[seq_index, :] = 0
            batch_y[seq_index, char2id[next_char]] = 1
        yield batch_x, batch_y

Sampling the next character at a given temperature:

In [None]:
def sample_character(preds, temperature=1.0):
    """Sample an index from a probability vector rescaled by a temperature.

    The probabilities are turned into log-probabilities, divided by
    `temperature`, re-normalised with a softmax and a single index is drawn
    from the resulting distribution. Low temperatures sharpen the distribution
    towards the most likely index, high temperatures flatten it.

    Args:
        preds: array-like of probabilities (e.g. a softmax output).
        temperature: positive scaling factor; 0 is treated as the greedy limit
            and returns the index of the largest probability.

    Returns:
        The sampled index, as returned by `np.argmax`.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of the tempered softmax as temperature -> 0: all the mass sits
        # on the most likely index, so no division or sampling is needed.
        return np.argmax(preds)

    # Zero probabilities legitimately map to -inf (and back to 0 after exp),
    # so the divide-by-zero warning from log(0) is noise here.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating: the softmax is unchanged, but
    # the largest term becomes exp(0) = 1, so very low temperatures can no
    # longer underflow every term to 0 and produce 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    # One multinomial draw yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

Here we're training our model:

In [None]:
# Train for 100 epochs. After every epoch past epoch 40, print text generated
# by the model from one random seed at several sampling temperatures.
for epoch in range(100):
    print("epoch: {}".format(epoch))
    losses = []
    for batch_x, batch_y in get_batches():
        loss = model.train_on_batch(batch_x, batch_y)
        losses.append(loss)
    print("train_loss: {}".format(np.mean(losses)))

    if epoch > 40:
        max_gen_count = 500
        # The seed must be exactly seq_size characters long, so the last valid
        # start is len(text) - seq_size (randint is inclusive). Tying the bound
        # to seq_size keeps this correct when the window length is tuned.
        index = random.randint(0, len(text) - seq_size)
        for t in [0.1, 0.4, 0.7, 1]:
            print("t={}".format(t))

            # Every temperature starts from the same seed window.
            sentence = text[index: index + seq_size]
            generated = sentence

            for step in range(max_gen_count):
                # Encode the current window as a single-row batch of char ids.
                batch = np.zeros((1, seq_size,))
                batch[0, :] = [char2id[symbol] for symbol in sentence]
                char_id = sample_character(model.predict_on_batch(batch)[0], t)
                generated += chars[char_id]
                # Slide the window: drop the oldest char, append the new one.
                sentence = sentence[1:] + chars[char_id]
            print(generated)

# Homework

1. **3 points**: try to make the model significantly better. You can add any crazy stuff and tune any hyperparameters that come to mind (e.g. the sequence size, the number of GRU/LSTM layers, etc.).
2. **7 points**: solve the problem with a word-based approach. Try to get good-looking generated texts.