In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import os
import requests
from collections import Counter
from num2words import num2words
from nltk.corpus import cmudict
import copy
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
%load_ext line_profiler
cmudict = cmudict.dict()
# Load pre-trained BERT model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create Functions to Assist in Analysis

In [2]:
# Split text into words
def getwords(text):
    words = pd.Series(re.findall(r"[\w']+", text))
    return words
# Split text into individual haikus
def gethaikus(text):
    haikus = pd.Series(text.split("\n\n"))
    return haikus
# Convert an array of words to a single string
def arraytotext(arr):
    text = ' '.join(arr)
    return text
# Find words that are not located in the cmudict phoenetic dictionary
def getunknowns(words):
    unknown_words = np.array([word for word in words if word.lower() not in cmudict.keys()])
    return unknown_words
# Gets frequency of word use in a given list of words
def wordcount(unknown_words):
    prob_word_freq = pd.Series(Counter(unknown_words)).sort_values(ascending=False)
    return prob_word_freq

# Import All Haikus in the Same Format

In [3]:
# Source 1
text = open('Haikus/haikuzao.txt', 'r').read()
text = text.lower()
haikus = gethaikus(text)
# Source 2
gutenberg = pd.read_csv('Haikus/gutenberg.csv')
haikus = haikus.append(pd.Series(gutenberg['haiku']).apply(lambda x: x.lower()))
# Source 3
modern_renaissance = pd.read_csv('Haikus/modern_renaissance.csv')
# make lower case and ensure that the new line notation is the same
modern_renaissance = pd.Series(modern_renaissance['content']).apply(lambda x: x.lower().replace("\r\n", "\n"))
haikus = haikus.append(modern_renaissance)
# Source 4
sballas = pd.read_csv('Haikus/sballas8.csv', header=None)
haikus = haikus.append(pd.Series(sballas[0]))
# Source 5
temps = pd.read_csv('Haikus/tempslibres.csv', encoding = "ISO-8859-1")
# Only English
temps = temps[temps['lang']=='en']
# make lower case and ensure that the new line notation is the same
haikus = haikus.append(pd.Series(temps['haiku']).apply(lambda x: x.lower().replace("\r\n", "\n")))
# Source 6
hjson = pd.read_json('Haikus/unim_poem.json')
haikus = haikus.append(pd.Series(hjson['poem']))

In [4]:
# Get words from haikus and determine whih ones do not exist in 
text = arraytotext(haikus)
unknown_words = getunknowns(getwords(text))
wordcount(unknown_words).head(30)

an'       2419
o'er      2041
sg        1421
wi'        879
sc         646
'          614
pl         591
a'         564
acc        533
't         500
ii         479
iii        417
th'        403
sae        379
_          378
nbsp       358
iv         292
honour     277
nought     275
tho'       272
nae        264
thro'      259
aught      248
hae        246
e'er       243
beheld     223
frae       215
e'en       213
canst      211
quoth      208
dtype: int64

# Clean Words

In [5]:
# Remove unwanted characters
def cleanwords(text):    
    #Clean dashes
    text = text.replace('-'," ")
    #Clean apostrophe
    text = text.replace('\'',"")
    # Clean numbers
    words = getwords(text)
    for word in words:
        if word.isdigit():
            text = re.sub(rf'\b{word}\b',num2words(word),text)
    
    return text

In [7]:
# Clean Haikus
haikus = haikus.apply(cleanwords)

  This is separate from the ipykernel package so we can avoid doing imports until


# Convert Words to Phonemes

In [8]:
# Convert plain text to Phoenetic
def getphoneme(word):
    phoneme = ''.join(cmudict[word][0]) # always use first pronuciations at index 0
    return phoneme

# Finds unknown words that can be split into two words
def findwordsplits(unknown_words):
    splitwords = {}
    for word in unknown_words:
        wordlength = len(word)
        for i in range(wordlength):
            split1 = word[0:i+1]
            split2 = word[i+1:wordlength]

            if split1 in cmudict.keys():
                if split2 in cmudict.keys():
                    splitwords[word] = [split1,split2]      
    return splitwords

In [9]:
splitwords = findwordsplits(unknown_words)
# inspect what words have single letters as vowels to determine appropriate sounds
vowels = ['a', 'e', 'i','o', 'u', 'y']
for i in splitwords:
    if (len(splitwords[i][0]) ==1 or len(splitwords[i][1]) ==1):
        if(splitwords[i][0] in vowels or splitwords[i][1] in vowels):
            print(splitwords[i])

['lun', 'e']
['sleet', 'y']
['ko', 'i']
['mis', 'o']
['a', 'kora']
['fur', 'u']
['abet', 'e']
['kon', 'u']
['y', 'ori']
['hit', 'o']
['haig', 'a']
['nap', 'e']
['bash', 'o']
['criss', 'e']
['en', 'e']
['tomb', 'e']
['i', 'pod']
['w', 'i']
['ham', 'e']
['inca', 'a']
['stern', 'y']
['stam', 'e']
['a', 'hint']
['syn', 'e']
['both', 'y']
['agan', 'e']
['o', 'ot']
['who', 'o']
['sare', 'e']
['couch', 'e']
['brum', 'e']
['burk', 'a']
['night', 'i']
['delayed', 'a']
['mint', 'y']
['napp', 'y']
['de', 'i']
['mort', 'o']
['plumeri', 'a']
['chill', 'i']
['dusk', 'y']
['kim', 'i']
['b', 'a']
['i', 'wa']
['ot', 'o']
['a', 'ah']
['um', 'e']
['iwan', 'u']
['ter', 'a']
['ash', 'i']
['kyo', 'o']
['yuk', 'i']
['kam', 'i']
['har', 'u']
['brenn', 'e']
['corell', 'i']
['ruck', 'e']
['sec', 'u']
['familial', 'e']
['c', 'e']
['tracer', 'y']
['suon', 'o']
['sill', 'a']
['aspirin', 'e']
['i', 'cycles']
['slat', 'y']
['moss', 'y']
['draught', 'y']
['gess', 'o']
['i', 'i']
['o', 'dour']
['dull', 'y']
['ta', 'u'

['doss', 'o']
['nod', 'i']
['coll', 'o']
['bors', 'a']
['spent', 'a']
['lent', 'a']
['vent', 'a']
['cent', 'o']
['pon', 'e']
['fell', 'o']
['quell', 'i']
['colt', 'o']
['vann', 'o']
['suon', 'i']
['mag', 'o']
['esser', 'e']
['piatt', 'i']
['viet', 'a']
['cole', 'i']
['reg', 'i']
['test', 'e']
['pote', 'a']
['ru', 'i']
['distend', 'e']
['crud', 'a']
['pent', 'e']
['mali', 'e']
['avant', 'e']
['inter', 'o']
['castell', 'a']
['demon', 'i']
['mis', 'i']
['donn', 'o']
['stanch', 'e']
['voland', 'o']
['tacit', 'i']
['intent', 'i']
['pres', 'i']
['stent', 'a']
['brin', 'a']
['rob', 'a']
["d'or", 'a']
['ruin', 'a']
['manifest', 'a']
['a', 'pri']
['gropp', 'a']
['frate', 'i']
['arment', 'o']
['vist', 'e']
['norm', 'e']
['fess', 'e']
['sal', 'i']
['poggi', 'o']
['dic', 'a']
['ebb', 'i']
['valor', 'e']
['brut', 'i']
['ard', 'o']
['confess', 'o']
['rende', 'i']
['foss', 'o']
['verri', 'a']
['sermon', 'e']
['o', 'recchia']
['sommers', 'e']
['animal', 'i']
['poet', 'i']
['e', 'cuba']
['miser', 'a']


In [10]:
# handle words with single letters separately
# Note: this may not always be accruacte but should approximate well enough for the intended goal
def convertsplitwords(splitwords):
    worddict = copy.deepcopy(splitwords)
    special_cases = { 
        'a':'AH0',
        'i': 'IY0',
        'o': 'OW0',
        'u' : 'UW0',
        'y':'IY0',
        's': 'Z'
    }
    
    for i in worddict:
        
        firstword = worddict[i][0]
        secondword = worddict[i][1]

        # If the first word is a single letter then make it all caps
        if (len(firstword)==1):
            firstword = firstword.upper()
            
        # Otherwise use CMUDICT to convert
        else:
            firstword = getphoneme(firstword)

        # If the second word is a single letter then apply the special cases where applicable
        if (len(secondword)==1):
            
            # If letter is a special case replace it
            if secondword in special_cases.keys():
                secondword = pd.Series(worddict[i]).replace(special_cases)[1]
                
            # Otherwise make it all caps
            else:
                secondword = secondword.upper()
                
       # Otherwise use CMUDICT to convert
        else:
            secondword = getphoneme(secondword)
            
        # added nested list to match format of CMUDICT
        worddict[i] = [[firstword,secondword]]   
        
    return worddict

# takes in haikus and separates haikus that have words not in the CMUDICT
def splithaikus(haikus):
    
    bad_haikus = []
    good_haikus = []
    
    for haiku in haikus:
        words = getwords(haiku)
        if all(word in cmudict.keys() for word in words):
            good_haikus.append(haiku)
        else:
            bad_haikus.append(haiku)
                        
    return bad_haikus, good_haikus

# Takes in a haiku and transforms it into the equivalent phoneme version
def haikutransform(haiku):
    words = getwords(haiku)
    try:
        phonemes = [* map(getphoneme,words)]
        phonemes = ' '.join(phonemes)
    except:
        raise ValueError('A word in the Haiku was not in the CMUDICT.' \
        ' Make sure only valid haikus are used for this function.')
        return
        
    return phonemes

# CMUDICT uses numbers (0-2) to denote stress of the syllable. Although this is something that could be explored
# later, it is adding unnecessary complexity and should be changed to a consistent format. 
def convertsyllables(haiku):
    haiku = re.sub('(1|2)','0',haiku)
    return haiku

In [11]:
# add new word phoenetics to cmudict
cmudict = {**cmudict ,**convertsplitwords(splitwords)}
# split haikus into a usuable set and a set that can be further inspected for transformations
bad_haikus, valid_haikus = splithaikus(haikus)
# transform the good haikus into phonemes
haikus_transformed = pd.Series(map(haikutransform,valid_haikus))
# Convert all syllables to 0
haikus_transformed = pd.Series(map(convertsyllables,haikus_transformed))

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Check to see if we didnt lose any haikus
len(bad_haikus) + len(haikus_transformed) == len(haikus)

True

# Create Function to Convert Back to Regular English

In [13]:
# Invert the cmudict so that we can transform from a phoneme to english
def invertdictionary(cmudict):
    
    idict = {}
    
    for word in cmudict:
        # Create a list to hold all of the possible words associated with a phoneme
        p_list = []
        
        phoneme = ''.join(cmudict[word][0]) # always use first phoneme for a word
        phoneme = convertsyllables(phoneme)
        
        # if the phoneme already exists add it to that list
        if phoneme in idict.keys():
            p_list = idict[phoneme]
            p_list.append(word)
            
        # Otherwise create a new list
        else:
            p_list.append(word)
           
        idict[phoneme] = p_list
    return idict

#def getenglish(phoneme):
 #   words = idict[phoneme][0] # always use first pronuciations at index 0
                                         # This will be explored in the future to optimize word choice
  #  return words

def transformback(haiku):
    # use getwords to get phonemes
    phonemes = getwords(haiku)
    words = [* map(getenglish,phonemes)]
    words = ' '.join(words)
    return words

In [15]:
# Create Inverted Phoneme Dictionary
idict = invertdictionary(copy.deepcopy(cmudict))

In [16]:
# A function that takes in haikus and predicts what words should be used in the list. The algorithm works as such:
# 1) Create a sentence using a "naive" prediction which simply uses the first word in the array
# 2) Iterate through the phonemes in the sentence, if it only has one option than use it, else move to step 3
# 3) Use BERT a bi-directional NLP package to create a similarity matrix of most likely words in the sentence
# 4) Join array of similar words with possible words associated with the phoneme
# 5) If the array is empty default to the first word, else pick the highest ranked word
# 6) Add the word to the list and move to the next phoneme 

def getenglish(haiku):
    english_haiku = []
    # use getwords to get phonemes
    phonemes = getwords(haiku)
    # get 2D array of lists of possibilities for each word
    wordsarray = phonemes.apply(lambda x: idict[x])
    # Get a baseline to use predictions off of by taking the first word for each list
    baseline ='[CLS] '
    baseline = baseline + ' '.join(wordsarray.apply(lambda x: x[0])) # Use first word
    baseline = baseline + ' [SEP]'
    for index, word in enumerate(wordsarray):
        
        if len(word[0])>1:
            try:
                # predict best word to use for each phoeneme
                sentence = re.sub(word[0],'[MASK]',baseline)

                tokenized_text = tokenizer.tokenize(sentence)
                indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

                # Create the segments tensors.
                segments_ids = [0] * len(indexed_tokens)

                # Convert inputs to PyTorch tensors
                tokens_tensor = torch.tensor([indexed_tokens])
                segments_tensors = torch.tensor([segments_ids])

                # Predict all tokens
                with torch.no_grad():
                    predictions = model(tokens_tensor, segments_tensors)

                predicted_words = pd.DataFrame(predictions[0,index])
                predicted_words['Word'] = [tokenizer.convert_ids_to_tokens([x])[0] for x in range(len(predicted_words))]
                # Create a word dataframe to merge with
                wordlist = pd.DataFrame(word,columns=['Word'])
                best_word = pd.merge(predicted_words,wordlist, on ='Word').sort_values(0, ascending= False).loc[0,'Word']

            # If BERT is unable to predict word then use the first one
            except:
                best_word=word[0]

        else:
            best_word=word[0]
            
        english_haiku.append(best_word)
    return english_haiku

In [17]:
getenglish(haikus_transformed[0])

['a',
 'skein',
 'of',
 'birds',
 'twines',
 'across',
 'the',
 'sky',
 'the',
 'northbound',
 'train',
 'departs']

# Format Haikus w/ Correct Syllable Structure

In [18]:
# Count Syllables 
df_syll = pd.DataFrame(zip(haikus_transformed,
    haikus_transformed.apply(lambda x: sum(letter.isdigit() for letter in x))))

In [19]:
# We need poems with 17 syllables (5 + 7 + 5)
df_17_syll = df_syll[df_syll[1]==17]
print(len(df_17_syll),len(arraytotext(df_17_syll[0])))

1816 172055


This is an extremley dissapointing result. 1816 haikus wont even scratch the surface of the amount of text needed for an RNN to learn a 5-7-5 syllable structure. 

However, the goal of the project is to create an RNN model that can understand syllables and learn enlgish as phoenemes. Although some context will be loss, these goals can be met by treating all haikus as one single text and then split into several 5-7-5 haikus individually. 

In [100]:
# Return Haikus Back to Text
haiku_text = arraytotext(haikus_transformed)
# Get all words
words = getwords(haiku_text)
# Get syllables for each word
syllables = words.apply(lambda x: sum(letter.isdigit() for letter in x))
# zip lists together
syll_list = [*zip(words,syllables)]     
len(syll_list)

2449369

In [93]:
# Convert list of words into 5-7-5 snippets
haikus = []
while len(syll_list) > 17:
    cum = 0
    haiku = []
    line = 1
    for i,tup in enumerate(syll_list):
        cum = cum + tup[1]
        if line == 1:
            if cum < 5:      
                haiku.append(syll_list.pop(i)[0])

            elif cum==5:
                haiku.append(syll_list.pop(i)[0])
                haiku.append('\n')
                line = 2
                
            # if line 1 has word with a 5 syllable remove it
            elif tup[1] > 5:
                del syll_list[i]
                cum = cum - tup[1]
                continue
                
            else:
                cum = cum - tup[1]
                continue
                
        elif line == 2:
            if cum < 12:
                haiku.append(syll_list.pop(i)[0])

            elif cum == 12:
                #print('Second Break:' + tup[0]+ 'Cummulative' + str(cum))
                haiku.append(syll_list.pop(i)[0])
                haiku.append('\n')
                line = 3
            
            # if line 1 has word with a 7 syllable remove it
            elif tup[1] > 6:
                del syll_list[i]
                continue
                
            else:
                cum = cum - tup[1]
                continue
                
        elif line == 3:
            if cum < 17:
                #print('Third:' + tup[0]+ 'Cummulative' + str(cum))
                haiku.append(syll_list.pop(i)[0])
                
            elif cum == 17:
                haiku.append(syll_list.pop(i)[0])
                haikus.append(haiku)
                #print('Haiku Ends, Words to go:' + str(len(syll_list)))
                break
             # if line 1 has word with a 5 syllable remove it
            elif tup[1] > 4:
                del syll_list[i]
                cum = cum - tup[1]
                continue
                    
            else:
                cum = cum - tup[1]
                continue      

In [101]:
haikus[len(haikus)-1]

['DIH0STRAH0KSHAH0N',
 'OW0SHAH0N',
 '\n',
 'DEH0SAH0LAH0T',
 'WAA0CH',
 'RAY0T',
 'JHEH0NTLIY0',
 '\n',
 'DHAH0',
 'AA0R',
 'AH0BAE0NDAH0ND']

In [98]:
getenglish(arraytotext(haikus[len(haikus)-1]))

['destruction',
 'ocean',
 'desolate',
 'watch',
 'right',
 'gently',
 'the',
 'r',
 'abandoned']

In [None]:
haiku_text[:500]

In [None]:
# def insertbreaks(haiku):
#     words = getwords(haiku)
#     word_syllables = words.apply(lambda x: sum(letter.isdigit() for letter in x))
#     #Get cumulative
#     cum_syllables = np.cumsum(word_syllables)
#     # Add double line break to signify end of haiku
#     #words[len(words)-1] = words[len(words)-1]+'\n\n'
#     first_break = 0
#     for i,num in enumerate(cum_syllables):
#         if first_break == 0:
#             if(num >= 5):
#                 words[i] = words[i]+'\n'
#                 first_break = 1
#         else:
#             if(num >= 12):
#                 words[i] = words[i]+'\n'
#                 return arraytotext(words)

In [None]:
formatted_haikus = df_17_syll[0].apply(insertbreaks)
# Remove space after \n that was crated it is not needed
formatted_haikus = formatted_haikus.apply(lambda x: x.replace('\n ',"\n")).reset_index(drop=True)
formatted_text = arraytotext(formatted_haikus)

# Train Model

In [None]:
# Convert Haikus into a DataFrame
df_haikus = pd.DataFrame(formatted_haikus)
df_haikus.columns = ['text']
# Get Unique Letters
vocab = sorted(set(formatted_text))
print(vocab)
print('Unique Characters: {}'.format(len(vocab)))

In [None]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

def encodehaikus(haiku):
    encoded_haiku = np.array([char2idx[c] for c in haiku])
    return encoded_haiku
encoded_haikus = formatted_haikus.apply(encodehaikus)
df_haikus['encoded'] = encoded_haikus
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

In [None]:

# Show how the first 13 characters from the text are mapped to integers
print ('{}\n v ---- characters mapped to int ---- v \n{}'\
       .format(repr(df_haikus.loc[0,'text']), df_haikus.loc[0,'encoded']))

# Create training examples and targets

In [None]:
# Pad sequences with 0s so they are all the same length

# Get character lengths of each haiku 
df_haikus['length'] = df_haikus['encoded'].apply(lambda x:len(x))
max_length = df_haikus['length'].max()

def getpadded(row):
    leng = row['length']
    zeros = np.zeros((max_length-leng), dtype=np.int32)
    padded = np.append(row['encoded'],zeros)
    return padded

df_haikus['padded'] = df_haikus.apply(getpadded,axis=1)   

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((df_haikus['input_text'],df_haikus['target_text']))
dataset

In [None]:
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

In [None]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

In [None]:
# Batch size
BATCH_SIZE = 128

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

In [None]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

# Build the Model

In [958]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
    return model

In [961]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [962]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [963]:
model.summary()

In [964]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [965]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

In [966]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

In [967]:
model.compile(optimizer='adam', loss=loss)

In [968]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [969]:
EPOCHS=10

In [991]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [992]:
model.save('haiku_v1.h5') 

In [996]:
from tensorflow.keras.models import load_model

model = create_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights('haiku_v1.h5')

model.build(tf.TensorShape([1, None]))


In [997]:
model.summary()

In [998]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 113

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))