<a href="https://colab.research.google.com/github/acevedosharp/name-generator/blob/master/latest_and_greatest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Character-level language modelling

In [192]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import LSTM, Dense
from keras.callbacks import LambdaCallback

## Importando los datos y métodos de apoyo

In [193]:
# Remote CSV sources; both are fetched over the network whenever this cell runs.
DINO_CSV_URL = 'https://raw.githubusercontent.com/junosuarez/dinosaurs/master/dinosaurs.csv'
CITIES_CSV_URL = 'https://raw.githubusercontent.com/datasets/world-cities/master/data/world-cities.csv'

# The dinosaur file is read with an explicit single column label, 'dino_name'.
dino_df = pd.read_csv(DINO_CSV_URL, names=['dino_name'])
# The cities file is read with pandas' default header handling.
cities_df = pd.read_csv(CITIES_CSV_URL)

In [194]:
def get_names_and_chars(df, name_col):
  """Collect the names in one column and the characters they are made of.

  Every name gets a trailing '!' appended; the rest of the notebook treats
  that character as the end-of-name marker.

  Args:
    df: DataFrame holding the names.
    name_col: Label of the column in `df` that contains the name strings.

  Returns:
    A `(names, chars)` tuple. `names` is the list of names with '!' appended;
    `chars` is the sorted list of distinct characters found in `names`.
  """
  names = [name + '!' for name in df[name_col]]
  # Sort the vocabulary. Iterating a set of strings yields an order that can
  # differ from one interpreter session to the next, which would silently
  # change every character-to-index mapping derived from this list.
  chars = sorted({char for name in names for char in name})
  print(f"There are {len(names)} names and {len(chars)} unique characters.")

  return names, chars

## Nombres de dinosaurios

In [195]:
# Build the name list and character vocabulary from the dinosaur dataset.
# Note: the cities cell further down assigns `names` and `chars` again, so on
# a top-to-bottom run those later values are the ones used for training.
names, chars = get_names_and_chars(df=dino_df, name_col='dino_name')

There are 1532 names and 28 unique characters.


## Nombres de ciudades

#### Limpiamos un poco los nombres (se puede hacer más).

In [196]:
# Turn every column into lower-case strings.
cities_df = cities_df.apply(lambda column: column.astype(str).str.lower())

# Two-letter transliterations for the German-specific letters.
my_dict = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}

# Keep only the rows whose country is 'germany', then apply each substitution
# as a regex replacement across the selected rows.
german_cities_df = cities_df.loc[cities_df['country'] == 'germany']
clean_cities_df = german_cities_df.replace(
    list(my_dict.keys()),
    list(my_dict.values()),
    regex=True,
)

In [197]:
# Build the name list and character vocabulary from the cleaned German cities.
names, chars = get_names_and_chars(df=clean_cities_df, name_col='name')

There are 1055 names and 33 unique characters.


## Preparación extra

In [198]:
# Two lookup tables over the vocabulary: position -> character, and its inverse.
index_to_char = dict(enumerate(chars))
char_to_index = {char: idx for idx, char in index_to_char.items()}

In [199]:
# Length of the longest entry in `names`
max_char = max(map(len, names))

# Number of names
m = len(names)

# Number of characters in our dictionary
uq_chars = len(char_to_index)

#### Ahora convertimos nuestra información en algo que una red pueda entender

In [200]:
# One slot per (name, position, vocabulary entry); both tensors start all-zero.
X = np.zeros(shape=(m, max_char, uq_chars))
Y = np.zeros_like(X)

#### Convertimos cada carácter en un vector one-hot (one-hot encoding)

In [201]:
# Build the training pairs so that, at every position, the target is the
# character at that position and the input is the character before it:
#   X[i, 0]     stays all-zero (the "start of name" input),
#   X[i, j + 1] is the one-hot of character j,
#   Y[i, j]     is the one-hot of character j.
# This is the convention make_name() generates with: it begins from an
# all-zero input at position 0 and writes each sampled character at position
# i + 1. Pairing X[i, j] with character j + 1 instead would mean the first
# character of a name is never a training target and the all-zero start input
# is never seen during training.
# Reset first so that re-running this cell cannot leave stale entries behind.
X.fill(0)
Y.fill(0)

for i, name in enumerate(names):
  for j, char in enumerate(name):
    char_idx = char_to_index[char]
    Y[i, j, char_idx] = 1
    # The last character of a name is only ever a target, never an input.
    if j + 1 < len(name):
      X[i, j + 1, char_idx] = 1

## El modelo LSTM 

In [211]:
# Assemble the network layer by layer:
#  - an LSTM with 128 units that emits an output at every timestep
#    (return_sequences=True) for inputs of shape (max_char, uq_chars);
#  - a Dense softmax layer producing uq_chars scores per timestep.
model = keras.Sequential()
model.add(LSTM(128, input_shape=(max_char, uq_chars), return_sequences=True))
model.add(Dense(uq_chars, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy')

#### Creamos una función que recibe el modelo y genera un nombre, carácter por carácter

In [203]:
def make_name(model):
    """Sample one name from `model`, one character at a time, and print it.

    Generation starts from an all-zero input of shape (1, max_char, uq_chars).
    At each step the model's output at the current position is normalised into
    a probability vector, one vocabulary index is drawn from it, and a one is
    written for that index at the next input position. Sampling stops when the
    end marker '!' is drawn or once max_char - 2 characters have been
    produced. The end marker itself is never printed, so both ways of stopping
    give output in the same form.

    Relies on the notebook-level names `max_char`, `uq_chars` and
    `index_to_char`.

    Args:
        model: Object whose `predict(x, verbose=0)` result can be indexed as
            `[0, i]` to obtain one score per vocabulary entry.
    """
    name = []
    x = np.zeros((1, max_char, uq_chars))
    i = 0

    # Length cap: stop before predicting once max_char - 2 characters exist,
    # instead of running one more prediction whose sample would be discarded.
    while i < max_char - 2:
        # verbose=0 keeps Keras from printing a progress bar per character.
        scores = np.asarray(model.predict(x, verbose=0)[0, i], dtype=np.float64)
        # Renormalise in float64: np.random.choice rejects a `p` that does not
        # sum to 1 within its tolerance, which float32 rounding can trigger.
        probs = scores / scores.sum()
        index = np.random.choice(len(probs), p=probs)
        character = index_to_char[index]
        if character == '!':
            break
        name.append(character)
        # Feed the sampled character back in as the next timestep's input.
        x[0, i + 1, index] = 1
        i += 1

    print(''.join(name))

In [204]:
def generate_name_loop(epoch, _):
    """Print five sampled names whenever `epoch` is a multiple of 25.

    Does nothing for any other epoch. Sampling is delegated to `make_name`,
    called with the notebook-level `model`.

    Args:
        epoch: Epoch number; checked with `epoch % 25`.
        _: Second positional argument passed by the caller; ignored.
    """
    if epoch % 25 != 0:
        return

    print(f'Names generated in epoch: {epoch}')
    for _sample in range(5):
        make_name(model)
    # Blank line to separate this batch from whatever is printed next.
    print()

In [205]:
# Register generate_name_loop as the callback's on_epoch_end hook.
name_generator = LambdaCallback(on_epoch_end=generate_name_loop)

In [None]:
# Train for 325 epochs in batches of 32. Keras' own logging is switched off
# (verbose=0); name_generator is attached as a callback.
model.fit(
    x=X,
    y=Y,
    batch_size=32,
    epochs=325,
    callbacks=[name_generator],
    verbose=0,
)

In [None]:
# Print 20 names sampled from the model.
for _sample in range(20):
  make_name(model)