In [310]:
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
import pandas as pd
import torch.nn as nn
import torch.optim as optim


# Country Classification based on Surname

- 
https://github.com/joosthub/PyTorchNLPBook/blob/master/chapters/chapter_7/7_3_surname_generation/7_3_Model1_Unconditioned_Surname_Generation.ipynb



In [549]:
class Vocabulary():
    def __init__(self, surname_list, country_list):
        self.surname_ix2char = {}
        self.surname_char2ix = {}
        self.max_n_chars = np.max(surname_list.apply(len))+2 # len + <INIT> and <END>
        self.country_ix2word = {}
        self.country_word2ix = {}
        self.init = '<init>'
        self.end = '<end>'
        self.unk = '<unk>'
        self.mask = '<mask>'
        
        self.create_surname_vocabulary(surname_list)
        self.create_country_vocabulary(country_list)
        
    def create_surname_vocabulary(self, surnames):
        count_ix = 0
        surnames = surnames.str.lower()
        for surname in surnames:
            for char in surname:
                if char not in self.surname_ix2char.values():
                    self.surname_ix2char[count_ix] = char
                    count_ix += 1

        sequence_vocab = [self.init, self.end, self.unk, self.mask]
        for special_char in sequence_vocab:
            self.surname_ix2char[count_ix] = special_char
            count_ix += 1
        
        self.surname_char2ix = {e: k for k,e in self.surname_ix2char.items()} 
        
    def create_country_vocabulary(self, countries):
        count_ix = 0
        countries = countries.str.lower()
        for country in countries:
            if country not in self.country_ix2word.values():
                self.country_ix2word[count_ix] = country
                count_ix += 1
        
        self.country_word2ix = {e: k for k,e in self.country_ix2word.items()} 
        
        
    def get_surname_char2ix(self, char):
        char = char.lower()
        if char in self.surname_char2ix.keys():
            return self.surname_char2ix[char]
        else:
            return self.surname_char2ix[self.unk]
    
    def get_surname_ix2char(self, ix):
        return self.surname_ix2char[ix]
    
    def get_country_word2ix(self, word):
        word = word.lower()
        return self.country_word2ix[word]
    
    def get_country_ix2word(self, ix):
        return self.country_ix2word[ix]

class Vectorizer():
    def __init__(self, vocabulary):
        self.vocabulary = vocabulary
        
    def vectorize(self, word, vector_type):
        word_ixs = []
        if(vector_type == 'surname'):
            for c in word.lower():
                word_ixs.append(self.vocabulary.get_surname_char2ix(c))
            return_ixs = [self.vocabulary.get_surname_char2ix(self.vocabulary.init)]
            return_ixs.extend(word_ixs)
            return_ixs.append(self.vocabulary.get_surname_char2ix(self.vocabulary.end))

            size_mask_leftover = self.vocabulary.max_n_chars - len(return_ixs)
            return_ixs.extend([self.vocabulary.get_surname_char2ix(self.vocabulary.mask)] * size_mask_leftover)
        else:
            return_ixs = self.vocabulary.get_country_word2ix(word)
        
        return return_ixs
        
class SurnameDataset(Dataset):
    def __init__(self, csv_file_path):
        self.df = pd.read_csv(csv_file_path)
        self.vectorizer = Vectorizer(Vocabulary(self.df['surname'], self.df['nationality']))
        self.dataset_split = 'train'
        self.df_splitted = {}
        self.split_df()
    
    def __len__(self):
        return self.get_dataset_on_type()['size']
    
    def __getitem__(self, ix):
        row = self.get_dataset_on_type()['dataset'].iloc[ix]
        x = self.vectorizer.vectorize(row['surname'], 'surname')
        y = self.vectorizer.vectorize(row['nationality'], 'nationality')
        return {
            'X': x,
            'y': y
        }  
    
    def set_dataset_split(self, dataset_split_type):
        self.dataset_split = dataset_split_type
    
    def split_df(self):
        train_ix = int(d.shape[0] * 0.7)
        val_ix = int(d.shape[0] * 0.15)
        test_ix = int(d.shape[0] * 0.15)

        self.df['split'] = ''
        self.df.loc[:train_ix, 'split'] = 'train'
        self.df.loc[train_ix:(train_ix + val_ix), 'split'] = 'val'
        self.df.loc[(train_ix + val_ix):(train_ix + val_ix + test_ix), 'split'] = 'test'
        
        split_types = ['train', 'val', 'test']
        for st in split_types:
             self.df_splitted[st] = {
                 'dataset': self.df.loc[self.df['split'] == st],
                 'size': self.df.loc[self.df['split'] == st].shape[0]
             }
    
    def get_dataset_on_type(self):
        return self.df_splitted[self.dataset_split]
        
class SequentialModelGRU(nn.Module):
    
    def __init__(self, char_vocab_size, country_class_size, embedding_dim, padding_ix, rnn_hidden_size, batch_first, dropout_p):
        """
        :param char_vocab_size: 
        :param embedding_dim: 
        :param padding_ix: 
        
        """
        super(SequentialModelGRU, self).__init__()
        
        self.emb = nn.Embedding(num_embeddings=char_vocab_size,
                                embedding_dim=embedding_dim,
                                padding_idx=padding_ix)
        
        self.rnn = nn.GRU(input_size=embedding_dim, 
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)
        
        self.dropout = nn.Dropout(dropout_p)
        
        self.fc = nn.Linear(in_features=rnn_hidden_size, 
                            out_features=country_class_size)
        
        self.dropout_p = dropout_p
        
        self.rnn_hidden_size = rnn_hidden_size
        
    def forward(self, x_in, apply_softmax=True):        
        # x_in.shape = [19, 64] =  [seq_length, batch_size]
        x_in = x_in.permute(1,0)     # shape = [batch_size, seq_length]

        x_out = self.emb(x_in)       # shape = [64, 19, 100] [batch_size, seq_length, emb_dim]
        
        # x_in shape for rnn --> https://discuss.pytorch.org/t/tensor-shape-for-rnn-batch-training/13466        
        # h_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t = seq_len
        x_out, h_n = self.rnn(x_out) # x_out shape [64, 19, 100] h_n shape [1, 19, 64]
        
        x_out = x_out.permute(1,0,2)
        x_out = self.dropout(x_out[-1])
        
        x_out = self.fc(x_out) # x_out shape [64,18]  
        
        if(apply_softmax):
            x_out = F.softmax(x_out, dim=1)
#             print(x_out)

        return x_out
        
        

Loop helper functions

In [550]:
def generate_batches(dataset, batch_size, shuffle, drop_last):
    """
    Generate different DataLoaders. Allow for train, val sets generation
    """
    s_dl = DataLoader(dataset=s_df, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)
    
    for data_dict in s_dl:
        yield data_dict
        
def get_validation_performance(dataset, model, loss):
        dataset.set_dataset_split('val')
        n_batches = len(dataset)/batch_size
        
        s_dl_val = generate_batches(dataset=dataset, batch_size=batch_size, 
                        shuffle=gen_batches_shuffle, drop_last=gen_batches_drop_last)
        moving_loss_val = 0
        with torch.no_grad():
            model = model.eval()
            for batch_ix_val, batch_data_val in enumerate(s_dl_val,0):
                X_val = torch.stack(batch_data_val['X'])
                y_val = batch_data_val['y']

                y_val_pred = model(X_val)

                loss_val = loss(y_val_pred, y_val)
                moving_loss_val += loss_val.item()


        avg_batch_loss_val = moving_loss_val/n_batches
        return avg_batch_loss_val        

In [552]:
batch_size = 65
lr = 1e-3
n_epochs = 100
char_vocab_size = len(s_df.vectorizer.vocabulary.surname_char2ix)
country_class_size = len(s_df.vectorizer.vocabulary.country_word2ix)
embedding_dim = 100
padding_ix = s_df.vectorizer.vocabulary.get_surname_char2ix('<mask>')
rnn_hidden_size = 64
batch_first = True
dropout_p = .3
gen_batches_shuffle = True
gen_batches_drop_last = True

n_batches = int(len(s_df)/batch_size)

s_df = SurnameDataset('data/surnames.csv')


rnn = SequentialModelGRU(char_vocab_size=char_vocab_size, country_class_size=country_class_size, 
                         embedding_dim=embedding_dim, padding_ix=padding_ix, rnn_hidden_size=rnn_hidden_size,
                         batch_first=batch_first, dropout_p=dropout_p)

loss_ce = nn.CrossEntropyLoss()
optimiser = optim.Adam(rnn.parameters(), lr=lr)

avg_batch_loss_train_all = []
avg_batch_loss_val_all = []

for ep in range(n_epochs):
    moving_loss_train = 0
    s_df.set_dataset_split('train')
    s_dl = generate_batches(dataset=s_df, batch_size=batch_size, 
                            shuffle=gen_batches_shuffle, drop_last=gen_batches_drop_last)
    rnn = rnn.train()
    for batch_ix, batch_data in enumerate(s_dl, 0):
        X = torch.stack(batch_data['X'])
        y = batch_data['y']
        
        rnn.zero_grad()
        
        y_pred = rnn(X)
        
        loss = loss_ce(y_pred, y) # size of y_pred: https://github.com/pytorch/pytorch/issues/5554
        moving_loss_train += loss.item()
        loss.backward()
        
        optimiser.step()
        
    avg_batch_loss_train = moving_loss_train/n_batches
    avg_batch_loss_train_all.append(avg_batch_loss_train)
    print(f'epoch {ep}:')
    print(f'Training Loss: {avg_batch_loss_train}')
    
    # validation performance
    avg_batch_loss_val = get_validation_performance(dataset=s_df,
                                                    model=rnn,
                                                    loss=loss_ce)
    avg_batch_loss_val_all.append(avg_batch_loss_val)
    
    print(f'Validation Loss: {avg_batch_loss_val}')
    print(f'--------------------------------')
    

epoch 0:
Training Loss: 2.7104017956782194
Validation Loss: 2.5207804526136224
--------------------------------
epoch 1:
Training Loss: 2.5467777716911444
Validation Loss: 2.498944313075952
--------------------------------
epoch 2:
Training Loss: 2.5374222104832276
Validation Loss: 2.4964449719797863
--------------------------------
epoch 3:
Training Loss: 2.5331581103599676
Validation Loss: 2.4930495591907698
--------------------------------
epoch 4:
Training Loss: 2.5292810929023615
Validation Loss: 2.4953531668847737
--------------------------------
epoch 5:
Training Loss: 2.515852998878996
Validation Loss: 2.4336896083104516
--------------------------------
epoch 6:
Training Loss: 2.4452564170805076
Validation Loss: 2.3964455182724604
--------------------------------
epoch 7:
Training Loss: 2.423965328830784
Validation Loss: 2.382495584962726
--------------------------------
epoch 8:
Training Loss: 2.414209905317274
Validation Loss: 2.379480084290125
-------------------------------

Validation Loss: 2.2874867684492286
--------------------------------
epoch 74:
Training Loss: 2.2742357375258107
Validation Loss: 2.2854693745148134
--------------------------------
epoch 75:
Training Loss: 2.2754199565467186
Validation Loss: 2.283042033802906
--------------------------------
epoch 76:
Training Loss: 2.2731854168035217
Validation Loss: 2.2834035224022835
--------------------------------
epoch 77:
Training Loss: 2.2735605947041915
Validation Loss: 2.285668391781861
--------------------------------
epoch 78:
Training Loss: 2.2741019422725097
Validation Loss: 2.2884003039049103
--------------------------------
epoch 79:
Training Loss: 2.274435481782687
Validation Loss: 2.2860345003745306
--------------------------------
epoch 80:
Training Loss: 2.2755593263496787
Validation Loss: 2.284219021788495
--------------------------------
epoch 81:
Training Loss: 2.2781182244672613
Validation Loss: 2.291351143490132
--------------------------------
epoch 82:
Training Loss: 2.27932

In [554]:
def get_prediction(surname, classifier, vectorizer): 
    classifier = classifier.eval()
    vectorized_surname = torch.Tensor(vectorizer.vectorize(surname, 'surname')).type(torch.long)
    vectorized_surname = vectorized_surname.unsqueeze(0).permute(1,0)

    prediction = classifier(vectorized_surname)

    prob, ix = prediction.max(dim=1)
    prob = prob.squeeze(0).item()
    ix = ix.squeeze(0).item()
    
    country = vectorizer.vocabulary.get_country_ix2word(ix)

    return prob, country

surnames = ['McMahan', 'Nakamoto', 'Wan', 'Cho']
for surname in surnames:
    prob, country = get_prediction(surname=surname, classifier=rnn, vectorizer=s_df.vectorizer)
    print(f'Surname: {surname}, probability {prob} - Class: {country}')

Surname: McMahan, probability 0.9999935626983643 - Class: english
Surname: Nakamoto, probability 0.9999966621398926 - Class: japanese
Surname: Wan, probability 0.9999995231628418 - Class: english
Surname: Cho, probability 0.9999802112579346 - Class: japanese


# TODO:

- Early Stopping
- Examples from test set

# Surname Generation based on Country