This notebook trains an RNN to produce Czech names resembling those used for naming biological species. I wrote this notebook in collaboration with my friend Adam Blazek, who did the data gathering.

Text generation is based on the following article: https://www.tensorflow.org/tutorials/text/text_generation

Dominik Farhan, April 2019

First, we must gather the data...

In [1]:
# all imports
from platform import python_version
print(f'Running Python version: {python_version()}')

import os
import re
from sys import stderr
from typing import Iterator
from urllib.request import urlopen
from random import choice, randrange
from itertools import islice
from time import time

import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

Running Python version: 3.7.7


In [2]:
# Some utility functions
def names_list_to_ids(names:list, char_to_id:dict) -> list:
    """Encode every name in ``names`` as a list of character ids.

    params:
        names: iterable of names (strings).
        char_to_id: mapping from a character to its integer id.

    returns:
        A list with one id-list per name, in the same order as ``names``.
    """
    encoded = []
    for name in names:
        encoded.append(name_to_ids(name, char_to_id))
    return encoded

def name_to_ids(name:str, char_to_id:dict) -> list:
    """Translate ``name`` character by character into integer ids.

    Raises KeyError when a character is missing from ``char_to_id``.
    """
    return list(map(char_to_id.__getitem__, name))

def ids_to_name(char_inds: list, id_to_char:dict) -> str:
    """Decode a sequence of character ids back into a string.

    params:
        char_inds: iterable of integer ids.
        id_to_char: mapping from an id to its character.

    returns:
        The characters looked up in ``id_to_char``, concatenated in order.

    Raises KeyError when an id is missing from ``id_to_char``.
    """
    # The conversion is a pure function: it no longer prints its input as a
    # side effect (that was leftover debugging output).
    return ''.join(id_to_char[i] for i in char_inds)

def ids_list_to_names(inds_names:list, id_to_char:dict) -> list:
    """Decode every id-list in ``inds_names`` into a name string.

    Each element is converted with ``ids_to_name`` using ``id_to_char``;
    the result keeps the order of the input.
    """
    decoded = []
    for ids in inds_names:
        decoded.append(ids_to_name(ids, id_to_char))
    return decoded

In [3]:
# Data scraping

def wikipedia_table_names(url: str, additional_names=True) -> Iterator[str]:
    """Yield lower-cased names scraped from the table rows of the page at ``url``.

    The page is downloaded, decoded as UTF-8 and scanned for ``<tr>`` tags that
    are directly followed by a newline; the line after the tag is treated as
    the row. Rows containing the text "Český název" are skipped.

    params:
        url: address of the page to download.
        additional_names: when True, names found inside the first parenthesised
                          group that follows the main name are yielded as well.

    yields:
        One lower-cased name per variant found in a row. Rows from which no
        name can be extracted are reported on stderr.
    """
    def process_name(rawname: str) -> Iterator[str]:
        """Expand one raw name into its lower-cased variants."""
        # Collapse every whitespace run to a single space.
        rawname = re.sub(r"\s+", " ", rawname).strip()
        # Anything containing a digit or an underscore is dropped entirely.
        if re.search(r"[_\d]", rawname):
            return
        # "<prefix> a/b/c": the shared leading words are combined with every
        # slash-separated variant of the last word.
        match = re.match(r"(\w[\w\s.]*\w\s)(\w+/[\w/]+)", rawname)
        if match:
            for variant in re.split(r"/", match[2]):
                yield (match[1] + variant).lower()
            return
        # "a/b/c <suffix>": every variant of the first word is combined with
        # the shared trailing words.
        match = re.match(r"(\w+/[\w/]+)(\s\w[\w\s.]*\w)", rawname)
        if match:
            for variant in re.split(r"/", match[1]):
                yield (variant + match[2]).lower()
            return
        # Otherwise a slash separates complete alternative names.
        for name in re.split(r"\s*/\s*", rawname):
            yield name.lower()

    # Close the HTTP response deterministically instead of leaving it to the
    # garbage collector.
    with urlopen(url) as response:
        page = response.read().decode("utf-8")
    for row in re.findall(r"<tr>\n.*", page):
        if re.search(r"Český název", row):
            continue
        # Strip all tags, keeping only the text of the row.
        row = re.sub(r"<.*?>", "", row).strip()
        # Group 1: the main name; group 2 (optional): the content of a
        # parenthesis that follows it.
        match = re.search(r"(\w[\w\s./]*\w)(?:\s\(([^)<]*))?", row)
        if match:
            yield from process_name(match[1])
            if additional_names and match[2]:
                for rawname in re.findall(r"\w[\w\s./]*\w", match[2]):
                    yield from process_name(rawname)
        else:
            print(f"Couldn't match: '{row}'", file=stderr)
            
def botany_names(url: str) -> Iterator[str]:
    """Yield lower-cased names listed after an en dash on the page at ``url``.

    The page is downloaded and decoded as UTF-8. Every run of word characters,
    whitespace and dots that follows "–" plus at least one whitespace character
    is taken as a name; empty captures are skipped.

    NOTE(review): the capture may itself contain whitespace (including line
    breaks) because ``\\s`` is part of the character class; the names are
    yielded exactly as captured, only lower-cased.
    """
    # Close the HTTP response deterministically instead of leaving it to the
    # garbage collector.
    with urlopen(url) as response:
        page = response.read().decode("utf-8")
    for name in re.findall(r"–\s+([\w\s.]*)", page):
        if name:
            yield name.lower()
            

# Each entry pairs a scraper function with the URL it is called on;
# `lnames` iterates over this list and calls `function(url)` for every pair.
sources = [
    (
        wikipedia_table_names,
        "https://cs.wikipedia.org/wiki/Seznam_l%C3%A9%C4%8Div%C3%BDch_rostlin",
    ),
    (
        wikipedia_table_names,
        "https://cs.wikipedia.org/wiki/Seznam_nejjedovat%C4%9Bj%C5%A1%C3%ADch_rostlin",
    ),
    (botany_names, "https://botany.cz/cs/kvetena-ceske-republiky/"),
]

            
def lnames(sources: list, save_to_file = False) -> list:
    """Collect the training names from all ``sources``.

    params:
        sources: list of ``(function, url)`` pairs; ``function(url)`` must
                 return an iterable of names.
        save_to_file: optional (False), when true the names are also written
                      to "czech_plant_names.txt", one name per line.

    returns:
        The distinct names in sorted order.
    """
    unique_names = {name for function, url in sources for name in function(url)}
    names = sorted(unique_names)
    if save_to_file:
        with open("czech_plant_names.txt", "w") as output:
            output.writelines(f"{name}\n" for name in names)
    return names

def lwords(names: list) -> list:
    """Return every whitespace-separated word of every name, in order.

    Duplicates are kept: a word appears once for each occurrence in ``names``.
    """
    words = []
    for name in names:
        words.extend(name.split())
    return words
    


In [4]:
# Scrape all sources (network access) and also write the sorted, de-duplicated
# names to czech_plant_names.txt.
names = lnames(sources, save_to_file = True)
# The extra name is appended after the file has been written, so it is part of
# `names` in this kernel but not of czech_plant_names.txt.
names.append('rys ostrovid') # Just to get it to 'nicer' number, 3999 -> 4000

In [5]:
# Character-level vocabulary: every distinct character occurring in any name.
vocab = set(c for w in names for c in w)
print(f'Number of distinct names {len(names)}')
print(f'Number of distinct characters among all words {len(vocab)}')

Number of distinct names 4000
Number of distinct characters among all words 46


## Now, we have to prepare the data so that we can feed it to the model.

In [6]:
# Mutates the `vocab` set created in the previous cell (adding twice is harmless).
vocab.add('&') # & will be the ending char so model can output 'end'.

# Ids are assigned by sorted character order; id_to_char is the inverse mapping.
char_to_id = {c:i for i, c in enumerate(sorted(vocab))}
id_to_char = {v:k for k, v in char_to_id.items()}
print(f'The longest name has {max([len(p) for p in names])} characters')
print(f'The shortest name has {min([len(p) for p in names])} characters')

The longest name has 53 characters
The shortest name has 3 characters


In [7]:
# Convert chars in names to indices (one list of ids per name)
numerized = names_list_to_ids(names, char_to_id)
print(f'First name numerised: {numerized[0]}')
# Round-trip check: decoding the ids must give back the original name.
print(f'First name converted back: {ids_to_name(numerized[0], id_to_char)}')

First name numerised: [4, 5, 20, 4, 11, 29, 16, 45, 24, 1, 21, 22, 20, 18, 16]
[4, 5, 20, 4, 11, 29, 16, 45, 24, 1, 21, 22, 20, 18, 16]
First name converted back: abrahámův strom


Model

In [8]:
# Length of the vocabulary in chars (includes the '&' ending char added above)
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 64

# Batch size used to build the training model; build_model bakes it into the
# input shape of the network.
BATCH_SIZE = 20

In [9]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level network: Embedding -> stateful GRU -> Dense.

    params:
        vocab_size: number of distinct characters; used both as the embedding
                    input size and as the width of the final Dense layer.
        embedding_dim: size of the character embedding vectors.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch dimension of the input (the sequence length is
                    left variable).

    returns:
        An uncompiled ``tf.keras.Sequential`` model. The Dense layer has no
        activation, so it outputs one raw score per character for every step.
    """
    layers = tf.keras.layers
    embedding = layers.Embedding(vocab_size, embedding_dim,
                                 batch_input_shape=[batch_size, None])
    recurrent = layers.GRU(rnn_units,
                           return_sequences=True,
                           stateful=True,
                           recurrent_initializer='glorot_uniform')
    projection = layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [10]:
# Training model: built with the fixed batch dimension BATCH_SIZE.
# len(vocab) is the same value that was stored in vocab_size above.
model = build_model(
    vocab_size = len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (20, None, 256)           12032     
_________________________________________________________________
gru (GRU)                    (20, None, 64)            61824     
_________________________________________________________________
dense (Dense)                (20, None, 47)            3055      
Total params: 76,911
Trainable params: 76,911
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "last_checkpoint")

# Keras callback that stores only the model weights under checkpoint_prefix.
# The path contains no epoch placeholder, so every save uses the same name.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [12]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    ``from_logits=True`` tells Keras that ``logits`` are un-normalized scores
    rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Compile the training model with the Adam optimizer and the loss defined above.
model.compile(optimizer='adam', loss=loss)

In [13]:
def split_input_target(chunk):
    """Split a sequence into (input, target) shifted by one position.

    The input is ``chunk`` without its last element and the target is ``chunk``
    without its first element, so ``target[i]`` is the element that follows
    ``input[i]`` in ``chunk``.
    """
    return chunk[:-1], chunk[1:]

def generate_training_data(names, max_len):
    """Endlessly yield padded (input, target) pairs, cycling through ``names``.

    params:
        names: list of names, each given as a list of character ids.
        max_len: both sequences are padded with the id of '&' (looked up in
                 the global ``char_to_id``) up to ``max_len - 1`` elements.

    yields:
        ``(x, y)`` where ``x`` is the name without its last id and ``y`` the
        name without its first id, both padded at the end.

    NOTE(review): because the input drops the last character of the name, the
    step "last character -> '&'" never appears in a pair; only '&' -> '&'
    does.
    """
    padded_len = max_len - 1
    while True:
        for name in names:
            inputs, targets = split_input_target(name)
            end_id = char_to_id['&']
            inputs += [end_id] * (padded_len - len(inputs))
            targets += [end_id] * (padded_len - len(targets))
            yield inputs, targets
    

In [14]:
# Materialize exactly one (input, target) pair per name from the endless
# generator; 54 is the padding length passed as max_len.
gen = generate_training_data(numerized, 54)
x_train, y_train = [], []
for x, y in islice(gen, len(numerized)):
    x_train.append(x)
    y_train.append(y)

In [15]:
# Convert the lists of padded id sequences into NumPy arrays for model.fit.
# Rebinds x_train / y_train: after this cell they are arrays, not lists.
x_train = np.array(x_train)
y_train = np.array(y_train)

In [16]:
# Sanity check: (number of names, padded sequence length).
x_train.shape

(4000, 53)

In [17]:
EPOCHS = 50

# The model was built with a fixed batch dimension (BATCH_SIZE), so the same
# batch size is passed explicitly instead of relying on Keras to infer it.
# checkpoint_callback (defined above) is registered so the weights are also
# written to ./training_checkpoints during training.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS, verbose = 3,
                    callbacks=[checkpoint_callback])


Train on 4000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
# Persist the weights of the trained model; the same path is loaded again
# when the batch-size-1 model is built for generation.
model.save_weights('model/last_epoch_model')

In [19]:
# Rebuild the same architecture with batch size 1 for text generation.
# This rebinds `model`: the training model is no longer reachable by that name.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Load the weights saved after training into the new model.
model.load_weights('model/last_epoch_model')

model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            12032     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 64)             61824     
_________________________________________________________________
dense_1 (Dense)              (1, None, 47)             3055      
Total params: 76,911
Trainable params: 76,911
Non-trainable params: 0
_________________________________________________________________


In [20]:
class Text_generator:
    """
        Iterator that produces generated biological names.

        params:
            model: Keras model trained to predict the next character.
            names: a list of biological names (usually two words per string,
                   but that is not required).
            temperature: optional (0.2), divides the model output before
                         sampling. Higher values give more random and unusual
                         results; lower values give results closer to real
                         biological names.

        Character/id conversion uses the notebook-level ``char_to_id`` and
        ``id_to_char`` dictionaries.
    """
    def __init__(self, model,names, temperature = 0.2):
        self.model = model
        self.names = names
        self.words = lwords(names)
        self.temperature = temperature
        self.part_gen = self.generate_part_of_word()

    def generate_part_of_word(self):
        """ Generator, endlessly yields a random-length prefix of a random word.

            Only words accepted by 'ok_ending' and longer than one character
            are used; the prefix is never the whole word.
        """
        while True:
            candidate = choice(self.words)
            if not self.ok_ending(candidate) or len(candidate) <= 1:
                continue
            yield candidate[:randrange(1, len(candidate))]

    def __iter__(self):
        return self

    def __next__(self) -> str:
        """ Generates names until the first word of one passes 'ok_ending'. """
        prefix = next(self.part_gen)
        started = time()
        while True:
            candidate = self.get_name(self.generate_text(prefix, num_generate = 54))
            first_word = candidate.split()[0]
            if self.ok_ending(first_word):
                return candidate
            if time() - started > 0.3:
                # Net can get into a cycle; in that case a different starting
                # part is needed.
                prefix = next(self.part_gen)
                started = time()

    def generate_text(self, start_string: str, num_generate = 60) -> str:
        """ Returns start_string followed by num_generate sampled characters. """
        # Encode the seed and add the batch dimension.
        input_eval = tf.expand_dims([char_to_id[s] for s in start_string], 0)

        sampled_chars = []
        self.model.reset_states()
        for _ in range(num_generate):
            # Drop the batch dimension and scale by the temperature.
            scores = tf.squeeze(self.model(input_eval), 0) / self.temperature
            # Sample from the distribution of the last position.
            predicted_id = tf.random.categorical(scores, num_samples=1)[-1,0].numpy()

            # The sampled character is the only input of the next step.
            input_eval = tf.expand_dims([predicted_id], 0)
            sampled_chars.append(id_to_char[predicted_id])

        return start_string + ''.join(sampled_chars)

    def get_name(self, NN_pred: str) -> str:
        """ Keeps the first two whitespace-separated words of the prediction. """
        first_two = NN_pred.split()[:2]
        return ' '.join(first_two)

    def get_start(self) -> str:
        """ Returns a random word from the known words. """
        return choice(self.words)

    def ok_ending(self, word):
        """ True unless the last character of word is one of 'ýáíéů'. """
        return word[-1] not in 'ýáíéů'

In [21]:
# Generator over the batch-size-1 model. Rebinds `gen`, which earlier held
# the training-data generator.
gen = Text_generator(model, names, temperature = 0.5)

In [23]:
# Generate 20 names and measure the elapsed wall-clock time.
t = time()
for name in islice(gen,0,20):
    print(name)
# Average number of seconds per generated name (displayed as the cell result).
(time() - t)/20

kozice setá
turne vonný
rapontikavice chlubatý
růžka poutinaláské
chrpa hlavatý
menka šilicovitý
hluchav setý
tět střídovatý
kalka. polní
jeten kozí
jestřábník obecný
baďyně tuhátolistý
žbábník vonný
cinkatec maloplodý
mandlík krasanový
lavsoniet setý
kvasička krasnicovitý
béřka parovatý
kavič stromovitý
kamzičník kamerinovitý


0.5473289489746094

In [24]:
# What if all the words start with a letter from the alphabet? ('q' is unknown to the model)
# Each letter seeds generate_text directly; only the first two words are printed.
for letter in 'abcdefghijklmnoprstuvwxzy':
    print(' '.join(gen.generate_text(letter, num_generate = 100).split()[:2]))

afrástý tamatinatý
běžluk setý
chýl setý
dyněličník jadlý
elček setý
fiaalka. posnědý
gajník prososný
hýl klaný
ivá bahenník
javaný klasnatý
kardský tenolistá
listý balpský
menná setá
nátkový stromovitý
ový stříbníkovitý
pý kapská
rý vonný
ská zahradníkovitý
tolistý pazpovníkovátec
ubířský prodloditý
vanový chlupatý
wasovný kapská
xajas červené
zdník evořský
y rozpový


In [11]:
# This is interesting, there are 'words' that have length 1...
for w in gen.words:
    if len(w) == 1:
        print(w)

z
v


In [15]:
# Those are all correct names.
# However, our model can't predict them because it is bound to predicting two-word names.
for n in gen.names:
    # This checks for some positions of z and v, but not all of them.
    for c in [' v ', ' z ']:
        if c in n:
            print(n)
            # Print each name at most once, even if it contains both patterns.
            break

existuje snad přece jen v některých případech možnost

panenka v trní

růže z jericha



## The following is all that is needed to use the generator outside this notebook...

In [1]:
# imports:
from random import choice, randrange
from time import time
from itertools import islice

import tensorflow as tf

In [2]:
class Bio_names_generator:
    """
        A self-contained iterator for generating biological names.

        params:
            model: optional (None), Keras model trained to predict text.
                   If None, 'load_model' is called.
            names: optional (None), a list of biological names (usually two
                   words per string, but that is not required).
                   If None, 'load_names' is called.
            temperature: optional (0.2), divides the model output before
                         sampling. The higher the value, the more unusual and
                         unpredictable the results look; the lower the value,
                         the more the results resemble real biological names.
    """
    def __init__(self, model = None,names = None, temperature = 0.2):
        # Identity checks: `== None` would call a custom or element-wise
        # __eq__ (e.g. on array-like names) instead of testing for "not given".
        if names is None:
            self.load_names()
        else:
            self.names = names
        self.words = self.lwords(self.names)
        self.create_vocab()
        if model is None:
            self.load_model()
        else:
            self.model = model
        self.part_gen = self.generate_part_of_word()
        self.temperature = temperature
        # Ids follow the sorted order of the vocabulary characters.
        self.char_to_id = {c:i for i, c in enumerate(sorted(self.vocab))}
        self.id_to_char = {v:k for k, v in self.char_to_id.items()}

    # Preparatory functions:

    def create_vocab(self):
        """ Builds the set of all characters that occur in the names. """
        self.vocab = set(c for w in self.names for c in w)
        # NOTE(review): no '&' ending char is added here, so the vocabulary
        # (and therefore the layer sizes in build_model and the id mapping)
        # depends only on the characters present in self.names. Confirm that
        # this matches the vocabulary the saved weights were trained with.
        #self.vocab.add('&') # Ending char.

    def build_model(self, embedding_dim = 256, rnn_units = 64, batch_size = 1):
        """ Builds the model given some parameters of it.

            params:
                None of the params should be changed unless the model architecture is changed!
                embedding_dim: optional (256)
                rnn_units: optional (64)
                batch_size: optional (1), size of the batch should be 1, because the generator
                            is built to predict one name at a time. However, changing it and
                            rewriting some other parts may speed up the whole process.
        """
        vocab_size = len(self.vocab)
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                      batch_input_shape=[batch_size, None]),
            tf.keras.layers.GRU(rnn_units,
                                return_sequences=True,
                                stateful=True,
                                recurrent_initializer='glorot_uniform'),
            tf.keras.layers.Dense(vocab_size)
            ])
        return model

    def load_model(self, path_to_model = 'model/last_epoch_model'):
        """ Builds the model and loads its weights from the given path. """
        model = self.build_model()
        model.load_weights(path_to_model)
        model.build(tf.TensorShape([1, None]))
        self.model = model

    def lwords(self, names: list) -> list:
        """ Creates a list of the distinct words in names (order is arbitrary). """
        return list(set([word for words in names for word in words.split()]))

    def load_names(self, path = 'czech_plant_names.txt'):
        """ Loads names, expects one name per line. """
        with open(path, 'r') as data:
            # Lines are kept exactly as read, i.e. including the line break,
            # so the line-break character becomes part of the vocabulary.
            # NOTE(review): stripping it would change len(self.vocab) and with
            # it the shapes expected by load_model; verify before changing.
            self.names = [name for name in data]

    # The generating part:

    def generate_part_of_word(self):
        """ Generator, yields substrings of random len of words that are good by 'ok_ending'. """
        while True:
            w = choice(self.words)
            if self.ok_ending(w) and len(w) > 1:
                # Prefix of at least one character, never the whole word.
                yield w[:randrange(1,len(w))]

    def __iter__(self):
        return self

    def __next__(self) -> str:
        """ Generates names until the first word of one passes 'ok_ending'. """
        part = next(self.part_gen)
        t = time()
        while True:
            pred = self.generate_text(part, num_generate = 54)
            name = self.get_name(pred)
            if self.ok_ending(name.split()[0]):
                break
            if time() - t > 0.3:
                # Net can get into a cycle; in that case a different starting part is needed
                part = next(self.part_gen)
                t = time()
        return name

    def generate_text(self, start_string: str, num_generate = 60) -> str:
        """ Returns start_string followed by num_generate sampled characters. """

        # Encode the seed and add the batch dimension.
        input_eval = [self.char_to_id[s] for s in start_string]
        input_eval = tf.expand_dims(input_eval, 0)

        text_generated = []

        self.model.reset_states()
        for i in range(num_generate):
            predictions = self.model(input_eval)
            # Drop the batch dimension.
            predictions = tf.squeeze(predictions, 0)

            predictions = predictions / self.temperature
            # Sample from the distribution of the last position.
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

            # The sampled character is the only input of the next step.
            input_eval = tf.expand_dims([predicted_id], 0)

            text_generated.append(self.id_to_char[predicted_id])

        return (start_string + ''.join(text_generated))

    def get_name(self, NN_pred: str) -> str:
        """ Keeps the first two whitespace-separated words of the prediction. """
        return ' '.join(NN_pred.split()[:2])

    def ok_ending(self, word):
        """ True unless the last character of word is one of 'ýáíéů'. """
        return word[-1] not in 'ýáíéů'

In [3]:
# With no arguments the generator loads the names from czech_plant_names.txt
# and the weights from model/last_epoch_model (the defaults of load_names and
# load_model).
gen = Bio_names_generator()

In [4]:
# The generator is endless, so islice limits the output to 20 names.
for name in islice(gen,0,20):
    print(name)

koflíček polník
kaštěk polník
andělika polní
kokrhen polníkový
borůvka vonná
maďovec vonný
krva dvoudovitá
zlatičník kostřední
ouška polní
tras. prostřední
rozchodník polník
dvouřadník pravý
ka polník
pepřovník polní
dýl tuhýchostřední
cypřišek polník
blík obecný
chvoštět střešní
zdník poponský
růžka polní
