# Generate last names for a specific natural language

In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

# Module-level defaults: a float dtype alias and the compute device.
dtype = torch.float
# Use the first CUDA GPU when torch reports one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device  # bare expression so the notebook displays the selected device

device(type='cpu')

In [2]:
def normal_transform(x, mean=0.0, std=0.01):
    """Rescale ``x`` by ``std`` and then shift it by ``mean``.

    When ``x`` holds standard-normal draws (mean 0, std 1) the result has the
    requested mean and standard deviation.  Works on anything that supports
    ``*`` and ``+`` with a number (floats, numpy arrays, torch tensors).
    """
    scaled = x * std
    return scaled + mean

def randn(n1, n2,          
          mean=0.0, std=0.01, requires_grad=False,
          device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
          dtype=torch.float64):
    """Return an ``n1 x n2`` tensor of normal draws with the given mean and std.

    Standard-normal samples are drawn with ``torch.randn`` on ``device`` using
    ``dtype`` and then rescaled through ``normal_transform``.  The returned
    tensor has its ``requires_grad`` flag set to ``requires_grad``.

    Note that the ``device`` default is evaluated once, when the function is
    defined.
    """
    draws = torch.randn(n1, n2, device=device, dtype=dtype)
    result = normal_transform(draws, mean=mean, std=std)
    # Set the flag after the transform so the returned tensor is the leaf
    result.requires_grad_(requires_grad)
    return result

In [3]:
def plot_history(history, yrange=(0.0, 5.00), figsize=(3.5,3)):
    """Plot training and validation loss curves against the epoch number.

    history: 2D array-like indexable as ``history[:, k]``; column 0 is the
             training loss and column 1 the validation loss, one row per epoch.
    yrange:  (low, high) limits for the y axis.
    figsize: matplotlib figure size in inches.
    """
    plt.figure(figsize=figsize)
    plt.xlabel("Epochs")
    # NOTE(review): the label says "Sentiment" although this notebook models
    # last names -- confirm whether the text should be updated.
    plt.ylabel("Sentiment log loss")
    train_curve = history[:,0]
    valid_curve = history[:,1]
    for curve, name in ((train_curve, 'train_loss'), (valid_curve, 'val_loss')):
        plt.plot(curve, label=name)
    plt.ylim(*yrange)
    plt.legend()
    plt.show()

In [25]:
def getvocab(strings):
    """Build the character vocabulary of a collection of character sequences.

    strings: iterable of strings or of lists of characters.

    Returns ``(vocab, ctoi)`` where ``vocab`` is the sorted list of distinct
    characters and ``ctoi`` maps each character to its index in ``vocab``.
    """
    unique_chars = set()
    for seq in strings:
        unique_chars.update(seq)
    vocab = sorted(unique_chars)
    ctoi = {ch: idx for idx, ch in enumerate(vocab)}
    return vocab, ctoi

In [4]:
def softmax(y):
    """Return the softmax of ``y`` as probabilities that sum to 1.

    A 1D tensor is normalized over all of its elements; otherwise the
    normalization runs along axis 1 (each row of a 2D tensor sums to 1).

    The largest value along the normalized axis is subtracted before
    exponentiating.  Softmax is invariant to that shift, and it keeps
    ``torch.exp`` from overflowing to ``inf`` (which would yield NaN) when the
    inputs are large.
    """
    if y.numel() == 0:
        # Nothing to normalize (and amax rejects empty reductions); keep shape
        return torch.exp(y)
    dim = 0 if y.dim() == 1 else 1
    # Detached: the shift is a constant offset and must not alter gradients
    shift = torch.amax(y, dim=dim, keepdim=True).detach()
    expy = torch.exp(y - shift)
    return expy / torch.sum(expy, dim=dim, keepdim=True)

def cross_entropy(y_prob, y_true):
    """
    Mean negative log-likelihood of the true classes.

    y_prob is n x k for n samples and k output classes; row j holds the
    predicted probability of each class for sample j (it is often the softmax
    of the final layer).  y_true holds n integer class indexes in [0,k-1].

    For each sample the probability assigned to its true class is picked out;
    the result is the mean of -log(that probability).  A probability close
    to 1 for the true class contributes -log(1)==0, while a probability
    close to 0 contributes -log(small value) = big value.
    """
    sample_idx = range(y_prob.shape[0])
    # Fancy indexing: one probability per row, at that row's true class column
    true_class_prob = y_prob[sample_idx, y_true]
    neg_log_likelihood = torch.log(true_class_prob).neg()
    return neg_log_likelihood.mean()

In [33]:
def onehot(c) -> torch.Tensor:
    """Return the one-hot column vector for character ``c``.

    The result is a ``len(vocab) x 1`` float64 tensor of zeros with a single 1
    in the row ``ctoi[c]``.  Relies on the module-level ``vocab`` and ``ctoi``
    built by ``getvocab``.

    Raises KeyError if ``c`` is not a key of ``ctoi``.
    """
    v = torch.zeros((len(vocab),1), dtype=torch.float64)
    v[ctoi[c]] = 1
    return v

## Load using pickled data from my RNN article

In [6]:
import pickle
# Load the pickled inputs X and labels y from the data/ directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('data/X-lastnames.pkl', 'rb') as f:
    X = pickle.load(f)
with open('data/y-lastnames.pkl', 'rb') as f:
    y = pickle.load(f)

In [10]:
# TESTING SUBSAMPLE
# Draw 2000 random record indexes (with replacement, so duplicates are possible)
# and keep only those records so the notebook runs quickly.
idx = list(np.random.randint(0,len(X),size=2000))
# Index the Python list directly: np.array(X) on names of different lengths
# builds a ragged array, which raises ValueError on NumPy >= 1.24.
X = [X[i] for i in idx]
y = np.array(y)[idx].tolist()

## Pick language of interest

In [23]:
# Maps each language name to the integer label compared against y below
lang2idx = {
    'Arabic': 0,
    'Chinese': 1,
    'Czech': 2,
    'Dutch': 3,
    'English': 4,
    'French': 5,
    'German': 6,
    'Greek': 7,
    'Irish': 8,
    'Italian': 9,
    'Japanese': 10,
    'Korean': 11,
    'Polish': 12,
    'Portuguese': 13,
    'Russian': 14,
    'Scottish': 15,
    'Spanish': 16,
    'Vietnamese': 17
}

# get just these names and then we can ignore y
english_names = [name for name, lang in zip(X, y) if lang == lang2idx['English']]
# Fill a 1D object array one element at a time.  np.array(X) on names of
# different lengths is a ragged array and raises ValueError on NumPy >= 1.24;
# this keeps X_train an ndarray whose elements are the individual names.
X_train = np.empty(len(english_names), dtype=object)
for k, name in enumerate(english_names):
    X_train[k] = name
list(X_train[:5])

[['y', 'e', 'a', 't', 'm', 'a', 'n'],
 ['j', 'o', 'h', 'n', 's', 'o', 'n'],
 ['d', 'u', 'p', 'o', 'n', 't'],
 ['d', 'o', 'w', 'n', 'e', 'r'],
 ['c', 'o', 'o', 'm', 'b', 'e', 's']]

In [26]:
# Character vocabulary and char->index map built from X (every record, not only X_train)
vocab, ctoi = getvocab(X)

In [30]:
# Model dimensions used by the cells below.
nhidden = 100             # length of the hidden state vector h
nfeatures = len(vocab)    # one input feature per vocabulary character
nclasses = nfeatures      # the model predicts a character, so outputs match inputs
n = len(X_train)          # number of training names
print(f"{n:,d} training records, {nfeatures} features (chars), state is {nhidden}-vector")

401 training records, 28 features (chars), state is 100-vector


In [66]:
def forward(x):
    """Run the RNN over one name and return next-character probabilities.

    x: sequence of characters, each a key of ``ctoi``.

    Starting from a zero hidden state, each character updates the state as
    ``h = tanh(W@h + U@onehot(c))`` and the output for that step is
    ``softmax(V@h)``.  Uses the module-level parameters ``W``, ``U``, ``V`` and
    the sizes ``nhidden`` and ``nclasses``.

    Returns a ``len(x) x nclasses`` tensor whose row j is the predicted
    distribution after reading ``x[0..j]``.  An empty ``x`` yields an empty
    ``0 x nclasses`` tensor that is not attached to the autograd graph.
    """
    if len(x) == 0:
        # torch.stack() rejects an empty list, so return an empty result directly
        return V.new_zeros((0, nclasses))
    outputs = []
    h = torch.zeros(nhidden, 1, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
    for c in x:  # for each char in a name
        h = torch.tanh(W@h + U@onehot(c))
        o = softmax((V@h).reshape(1,nclasses))
        outputs.append( o[0] )
    return torch.stack(outputs)

def forwardN(X:Sequence[Sequence]):#, apply_softmax=True):
    """Cut-n-paste from body of training for use with metrics.

    Calls ``forward1`` on every record of ``X``, keeps element ``[0]`` of each
    result and stacks those into a single tensor.

    NOTE(review): ``forward1`` is not defined in the visible part of this file
    (only ``forward`` is) -- confirm it exists elsewhere, otherwise calling this
    function raises NameError.
    """
    outputs = []
    for i in range(0, len(X)): # for each input record
        o = forward1(X[i])
        outputs.append( o[0] ) 
    return torch.stack(outputs)

In [102]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
# Parameters of the character-level RNN used by forward():
#   h = tanh(W@h + U@onehot(c)),  o = softmax(V@h)
W = torch.eye(nhidden,    nhidden,   dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,  nfeatures, dtype=torch.float64, requires_grad=True) # embed one-hot char vec
V = torch.randn(nclasses, nhidden,   dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

optimizer = torch.optim.Adam([W,U,V], lr=0.001, weight_decay=0.0)

history = []
epochs = 25
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total_char = 0
    names_trained = 0
    for i in range(0, n): # an epoch trains all names
        name = X_train[i]
        if len(name) < 2:
            # A one-character name has no (input, next char) pair to learn from
            continue
        # Inputs are all chars but the last; targets are the same name shifted
        # left by one.  Separate names are used so the label list y loaded
        # earlier in the notebook is not overwritten.
        inputs = name[:-1]
        targets = [ctoi[c] for c in name[1:]]
        total_char += len(inputs)
        names_trained += 1

        o = forward(inputs)
        loss = cross_entropy(o, targets)
        optimizer.zero_grad()
        loss.backward() # autograd computes W.grad, U.grad, V.grad
        optimizer.step()

        epoch_training_loss += loss.item()
        predicted = torch.argmax(o.detach(), dim=1)
        # .item() keeps the running count a plain Python number, not a tensor
        epoch_training_accur += torch.sum(predicted == torch.tensor(targets)).item()

    # Loss is averaged per trained name, accuracy per predicted character;
    # max(..., 1) avoids dividing by zero when nothing was trained.
    epoch_training_loss /= max(names_trained, 1)
    epoch_training_accur /= max(total_char, 1)
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f} accur {epoch_training_accur:7.4f}")

# TODO: add a validation pass that appends (train_loss, valid_loss) to history
# each epoch so the curves can be drawn with plot_history(torch.tensor(history));
# history is left empty for now.

Epoch   1 training loss  8.7718 accur  0.0950
Epoch   2 training loss  4.7915 accur  0.2344
Epoch   3 training loss  3.3553 accur  0.3432
Epoch   4 training loss  2.5772 accur  0.4238
Epoch   5 training loss  2.0987 accur  0.4849
Epoch   6 training loss  1.7790 accur  0.5461
Epoch   7 training loss  1.5576 accur  0.5813
Epoch   8 training loss  1.3970 accur  0.6230
Epoch   9 training loss  1.2784 accur  0.6480
Epoch  10 training loss  1.1817 accur  0.6698
Epoch  11 training loss  1.1002 accur  0.6846
Epoch  12 training loss  1.0319 accur  0.7008
Epoch  13 training loss  0.9764 accur  0.7147
Epoch  14 training loss  0.9324 accur  0.7249
Epoch  15 training loss  0.8972 accur  0.7341
Epoch  16 training loss  0.8689 accur  0.7383
Epoch  17 training loss  0.8455 accur  0.7434
Epoch  18 training loss  0.8275 accur  0.7466
Epoch  19 training loss  0.8127 accur  0.7485
Epoch  20 training loss  0.8022 accur  0.7485
Epoch  21 training loss  0.7931 accur  0.7499
Epoch  22 training loss  0.7826 ac

In [173]:
def sample(initial_chars, n, temperature=0.1):
    """Derived from Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086

    Extend ``initial_chars`` one sampled character at a time until the name is
    ``n`` characters long, and return the characters as a new list.

    initial_chars: sequence of starting characters (each a key of ``ctoi``);
                   it is copied, not modified.
    n:             total length of the returned name, including the initial
                   characters.  Nothing is added if ``n <= len(initial_chars)``.
    temperature:   divides the logits before the softmax.  Values below 1
                   sharpen the distribution toward the most likely character;
                   values above 1 flatten it.  Must be > 0.

    Uses the module-level parameters ``W``, ``U``, ``V`` plus ``vocab``,
    ``nhidden`` and ``nclasses``.

    Raises ValueError if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    chars = list(initial_chars)  # copy so the caller's sequence is left untouched
    n_new = n - len(chars)
    with torch.no_grad():
        # Encode the prefix once; afterwards the hidden state is advanced by one
        # step per new character instead of re-running the whole name each time.
        h = torch.zeros(nhidden, 1, dtype=torch.float64, requires_grad=False)
        for c in chars:
            h = torch.tanh(W@h + U@onehot(c))
        for _ in range(n_new):
            logits = (V@h).reshape(nclasses)
            # Subtract the max before scaling so exp() in softmax cannot overflow
            logits = (logits - torch.max(logits)) / temperature
            p = softmax(logits)
            # don't always pick most likely; pick per distribution
            wi = np.random.choice(len(vocab), p=p.numpy())
            chars.append(vocab[wi])
            h = torch.tanh(W@h + U@onehot(vocab[wi]))
    return chars

In [190]:
# Generate a 7-character name that starts with "hub"
sample(list('hub'), 7)

['h', 'u', 'b', 'b', 'a', 'r', 'd']

In [196]:
# Generate a 7-character name that starts with "j"
sample(list('j'), 7)

['j', 'o', 'h', 'n', 's', 'o', 'n']

In [180]:
# Generate a 7-character name that starts with "ar"
sample(list('ar'), 7)

['a', 'r', 'c', 'h', 'e', 'r', 's']

In [168]:
# Display the first 20 training names
list(X_train)[0:20]

[['y', 'e', 'a', 't', 'm', 'a', 'n'],
 ['j', 'o', 'h', 'n', 's', 'o', 'n'],
 ['d', 'u', 'p', 'o', 'n', 't'],
 ['d', 'o', 'w', 'n', 'e', 'r'],
 ['c', 'o', 'o', 'm', 'b', 'e', 's'],
 ['b', 'y', 'r', 'n', 'e'],
 ['d', 'u', 'f', 'f', 'i', 'e', 'l', 'd'],
 ['d', 'o', 'r', 'a', 'n'],
 ['n', 'u', 'g', 'e', 'n', 't'],
 ['p', 'e', 'a', 'c', 'h'],
 ['a', 'l', 'd', 'e', 'n'],
 ['f', 'i', 'e', 'l', 'd', 'e', 'r'],
 ['p', 'i', 'c', 'k', 't', 'h', 'a', 'l', 'l'],
 ['d', 'o', 'w', 'n', 'e', 'r'],
 ['p', 'i', 'c', 'k'],
 ['d', 'o', 'w', 'n', 'i', 'e'],
 ['c', 'l', 'i', 'f', 'f'],
 ['b', 'r', 'e', 'l', 's', 'f', 'o', 'r', 'd'],
 ['a', 'r', 'c', 'h', 'e', 'r'],
 ['e', 't', 't', 'r', 'i', 'c', 'k']]