In [2]:
import glob
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
import pandas as pd
import pytorch_lightning as pl
import random
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
import unicodedata
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

%matplotlib inline

<font size='4'>Loading and processing dataset</font>  
    <font size = '2'>(based on PyTorch NLP tutorial i<sup>1</sup> and ii<sup>2</sup>)</font>

In [6]:
"""
< : Start of string (SOS)
> : End of string (EOS)
# : PAD
"""
all_letters = "#<>" + string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [7]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [8]:
# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [f'<{unicodeToAscii(line)}>' for line in lines]

for filename in glob.glob('names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

In [9]:
#creating dictionaries w.r.t. letters
letter2idx = {key: value for value, key in enumerate(all_letters)}
idx2letter = {key: value for key, value in enumerate(all_letters)}

In [10]:
#converting names to tensors
def name2tensor(name):
    indices = []
    for letter in name:
        idx = letter2idx[letter]
        indices.append(idx)
    return torch.tensor(indices, dtype=torch.int32)

<font size='4'>Creating datasets</font>

In [11]:
#creating dictionaries w.r.t. categories
lang2idx = {key: value for value, key in enumerate(all_categories)}
idx2lang = {key: value for key, value in enumerate(all_categories)}

In [12]:
all_data = []
for language in category_lines:
    lang_idx = lang2idx[language]
    for name in category_lines[language]:
        all_data.append([name, language, lang_idx])

df = pd.DataFrame(
    all_data, columns=['Name', 'Language', 'Language index']
)

In [13]:
#number of duplicates
int(df.Name.duplicated().value_counts()[1])

2652

In [14]:
train, test = train_test_split(df, test_size=0.2, random_state=7)

In [15]:
#dataset with dict output structure
class NamesDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df
       
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        entry = self.df.iloc[idx]
        name = entry[0]
        name_tensor = name2tensor(name)
        language = entry[1]
        language_index = entry[2]
        return {'Name':name, 'Name tensor':name_tensor, 
                'Language':language, 'Language index':language_index}

In [16]:
dataset = NamesDataset(df)
dataset[10]

{'Name': '<Bastl>',
 'Name tensor': tensor([ 1, 30,  3, 21, 22, 14,  2], dtype=torch.int32),
 'Language': 'Czech',
 'Language index': 0}

<font size='4'>Creating dataloaders with custom collate_fn</font>

In [17]:
train_dataset = NamesDataset(train)
val_dataset = NamesDataset(test)

In [18]:
def collate_fn(batch):
    names = []
    name_tensors = []
    languages = []
    language_indices = []
    for item in batch:
        names.append(item['Name'])
        name_tensors.append(item['Name tensor'])
        languages.append(item['Language'])
        language_indices.append(item['Language index'])

    name_tensors = pad_sequence(name_tensors, batch_first=True)
    language_indices = torch.tensor(language_indices)

    return names, name_tensors, languages, language_indices

In [19]:
#a test of pad_sequence
a = torch.tensor([1, 2])
b = torch.tensor([3, 4, 8])
c = [a, b]
pad_sequence(c)

tensor([[1, 3],
        [2, 4],
        [0, 8]])

In [20]:
batch_size = 32

In [21]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, 
                                               shuffle=True, collate_fn=collate_fn)

val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, 
                                             shuffle=True, collate_fn=collate_fn)

In [26]:
sample_batch = next(iter(train_dataloader))
sample_batch

(['<Libusov>',
  '<Handal>',
  '<Whyte>',
  '<Gayazov>',
  '<Simon>',
  '<Nazari>',
  '<Islanov>',
  '<Balabuha>',
  '<Eizen>',
  '<Pokhitonov>',
  '<Seer>',
  '<Xie>',
  '<Cullen>',
  '<Mikhaleiko>',
  '<Antar>',
  '<Bruhn>',
  '<Barros>',
  '<Kitson>',
  '<Jarrett>',
  '<Shening>',
  '<Finkelshtein>',
  '<Minkin>',
  '<Zhiharevitch>',
  '<Clay>',
  '<Chalykh>',
  '<Chuvashev>',
  '<Baron>',
  '<Zogby>',
  '<Shamoon>',
  '<Jakuba>',
  '<Mniszech>',
  '<Almasi>'],
 tensor([[ 1, 40, 11,  4, 23, 21, 17, 24,  2,  0,  0,  0,  0,  0],
         [ 1, 36,  3, 16,  6,  3, 14,  2,  0,  0,  0,  0,  0,  0],
         [ 1, 51, 10, 27, 22,  7,  2,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 35,  3, 27,  3, 28, 17, 24,  2,  0,  0,  0,  0,  0],
         [ 1, 47, 11, 15, 17, 16,  2,  0,  0,  0,  0,  0,  0,  0],
         [ 1, 42,  3, 28,  3, 20, 11,  2,  0,  0,  0,  0,  0,  0],
         [ 1, 37, 21, 14,  3, 16, 17, 24,  2,  0,  0,  0,  0,  0],
         [ 1, 30,  3, 14,  3,  4, 23, 10,  3,  2,  0,  0,  0, 

<font size='4'>Attention layer (custom implementation)</font>

In [27]:
class AttentionLayer(nn.Module):
    def __init__(self, hidden_size, method):
        super().__init__()
        assert method in ['dot', 'general', 'concat']
        
        self.hidden_projector = nn.Linear(hidden_size * 2, hidden_size)
        
        self.method = method
        if self.method == 'general':
            self.linear_gen = nn.Linear(hidden_size, hidden_size)
        elif self.method == 'concat':
            self.linear_cat_1 = nn.Linear(hidden_size * 2, hidden_size)
            self.linear_cat_2 = nn.Linear(hidden_size, 1)
            
        
    def forward(self, rnn_outputs, rnn_final_hidden):
        """
        INPUT:
        [\bar{h}_s]     rnn_outputs           shape: (batch_size, sequence_length, hidden_size)
        [h_t]           rnn_final_hidden      shape: (batch_size, hidden_size)
        
        OUTPUT:
        [\hat{h}_t]     attention_output      shape: (batch_size, hidden_size)
        [\alpha_t(s)]   attention_weights     shape: (batch_size, sequence_length)
        """
        if self.method == 'dot':
            att_score = torch.bmm(rnn_outputs, rnn_final_hidden.unsqueeze(dim=2)).squeeze() #(N x L)
        elif self.method == 'general':
            #att_score = torch.bmm(rnn_outputs, self.linear_gen(rnn_final_hidden).unsqueeze(dim=2)).squeeze()
            att_score = torch.bmm(self.linear_gen(rnn_outputs), rnn_final_hidden.unsqueeze(dim=2)).squeeze()
        elif self.method == 'concat':
            hidden_cat = torch.cat((rnn_outputs, rnn_final_hidden.unsqueeze(1).expand(rnn_outputs.shape)), dim=2)
            att_embed = torch.tanh(self.linear_cat_1(hidden_cat))
            att_score = self.linear_cat_2(att_embed).squeeze()
            
        attention_weights = F.softmax(att_score, dim=1) #(N x L)
        context_vector = torch.bmm(attention_weights.unsqueeze(dim=1), rnn_outputs).squeeze()
        attention_output = torch.tanh(self.hidden_projector(torch.cat((context_vector, rnn_final_hidden), dim=1)))
        
        return attention_output, attention_weights

<font size='4'>LSTM classifier with trainable embedding and attention</font>

In [28]:
class LSTMClassifier(nn.Module):

    def __init__(self):
        super(LSTMClassifier, self).__init__()
        embedding_dim = 20
        #c_out = 25
        hidden_dim = 30
        self.char_embedding = nn.Embedding(len(all_letters), embedding_dim, 
                                          padding_idx=0)
        #self.conv = nn.Conv1d(20, c_out, kernel_size=3, padding=1) #(N, L, 20) optional conv layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, len(all_categories))
        self.attention_layer = AttentionLayer(hidden_dim * 2, 'general')

    def forward(self, x):
        x = self.char_embedding(x) #(N, L, 20)
        #x = self.conv(x) #(N, L, 25) optional conv
        lstm_output = self.lstm(x)
        rnn_outputs = lstm_output[0]
        rnn_final_hidden = lstm_output[1][0] #(N, 2, 30) ('2' for bidirectional case)
        rnn_final_hidden = rnn_final_hidden.permute(1, 0, -1)
        rnn_final_hidden = torch.cat((torch.chunk(rnn_final_hidden, 2, dim=1)[0], 
                                      torch.chunk(rnn_final_hidden, 2, dim=1)[1]), dim=2)
        rnn_final_hidden = rnn_final_hidden.squeeze()
        x = self.attention_layer(rnn_outputs, rnn_final_hidden)[0]
        x = self.linear(x)
        scores = F.log_softmax(x, dim=1)
        return scores

In [29]:
#checking the cat function
cat_sample = torch.randn(32, 2, 30)
torch.cat((torch.chunk(cat_sample, 2, dim=1)[0], torch.chunk(cat_sample, 2, dim=1)[1]), dim=2).shape

torch.Size([32, 1, 60])

In [30]:
#checking if everything is fine dimension-wise
next(iter(train_dataloader))[1].shape

torch.Size([32, 14])

In [31]:
lstm = LSTMClassifier()
lstm(next(iter(train_dataloader))[1]).shape

torch.Size([32, 18])

In [32]:
#testing applying conv
test_conv = nn.Conv1d(1, 20, kernel_size=3)
test_seq_batch = torch.randn(32, 1, 16)
test_conv(test_seq_batch).shape

torch.Size([32, 20, 14])

<font size='4'>Training with PL and Tensorboard viz</font>

In [29]:
class PLModel(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.train_accuracy = torchmetrics.Accuracy()
        self.val_accuracy = torchmetrics.Accuracy()

    def forward(self, x):
        #defines prediction/inference actions
        return torch.exp(self.model(x))

    def training_step(self, batch, batch_idx):      
        x, y = batch[1], batch[3]
        neg_logs = self.model(x)
        loss = loss_fn(neg_logs, y)
        probs = torch.exp(neg_logs)
        train_accuracy = self.train_accuracy(probs, y)
        
        # logging to tensorboard
        self.log("train loss", loss, prog_bar=True)
        self.log("train acc", train_accuracy, prog_bar=True)

        return loss
    
    def validation_step(self, batch, batch_idx):   
        x, y = batch[1], batch[3]
        neg_logs = self.model(x)
        loss = loss_fn(neg_logs, y)
        probs = torch.exp(neg_logs)
        self.val_accuracy(probs, y)
        
        #logging to tensorboard
        self.log("val loss", loss, prog_bar=True)
        self.log("val acc", self.val_accuracy, prog_bar=True)
        
    def training_epoch_end(self, *args, **kwargs):
        self.train_accuracy.reset()
        
    def validation_epoch_end(self, outs):
        self.log('val acc', self.val_accuracy.compute(), prog_bar=True)
        self.val_accuracy.reset()
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01)
        
        return optimizer

In [30]:
model = LSTMClassifier()
loss_fn = nn.NLLLoss()

pl_model = PLModel(model)
logger = TensorBoardLogger('lstm_logs', default_hp_metric=False) 
trainer = pl.Trainer(max_epochs=10, logger=logger)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [31]:
%load_ext tensorboard
%tensorboard --logdir lstm_logs/

In [32]:
trainer.fit(pl_model, train_dataloader, val_dataloader)

Missing logger folder: lstm_logs/default

  | Name           | Type           | Params
--------------------------------------------------
0 | model          | LSTMClassifier | 25.7 K
1 | train_accuracy | Accuracy       | 0     
2 | val_accuracy   | Accuracy       | 0     
--------------------------------------------------
25.7 K    Trainable params
0         Non-trainable params
25.7 K    Total params
0.103     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [33]:
#validation run
trainer.validate(pl_model, val_dataloader)

Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val acc': 0.8049813508987427, 'val loss': 0.7084192037582397}
--------------------------------------------------------------------------------


[{'val loss': 0.7084192037582397, 'val acc': 0.8049813508987427}]

<sup>1</sup>https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html  
<sup>2</sup>https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html