# Classifying Surnames With an MLP
Adapted from *Natural Language Processing with PyTorch*$^{[1]}$.

# Surname Dataset
All datasets can be downloaded from the dataset index$^{[2]}$.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from nlppt.ch04 import *
import torch.nn as nn
import torch.optim as optim
import torch

In [5]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
# Fetch the surname table through the project's GdriveHelper (not imported by
# name in this notebook, so presumably provided by `from nlppt.ch04 import *`).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which usually
# means a saved index was read back as data - confirm inside the helper.
df = GdriveHelper.surname_df()
# Preview the first rows; later cells use the 'surname' and 'nationality' columns.
df.head()

Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh


## Initiating Preprocessing Components

`Vocabulary`: maps tokens to integer indices (and back).

`Vectorizer`: turns those indices into a vector. In this notebook I use my own `OneHotVectorizer` implementation instead of scikit-learn's.

In [30]:
# Column names for the model input (X) and the target label (y).
X_column = 'surname'
y_column = 'nationality'
# One vocabulary per column, each built from that column of the dataframe.
surname_vocab = Vocabulary.from_df_series(df, X_column)
nationality_vocab = Vocabulary.from_df_series(df, y_column)
# Wrap each vocabulary in the project's one-hot vectorizer.
surname_vectorizer = OneHotVectorizer(surname_vocab)
nationality_vectorizer = OneHotVectorizer(nationality_vocab)

In [31]:
# Bundle the dataframe with the input/target column names and their
# vectorizers; the active split is selected later via dataset.set_split(...).
dataset = SurnameDataset(
    df,
    X_column=X_column,
    y_column=y_column,
    X_vectorizer=surname_vectorizer,
    y_vectorizer=nationality_vectorizer
)

## Constructing The Model

Here, as the book suggests, I will implement a 2-layer MLP (Multilayer Perceptron), whose layers some people call dense (fully connected) layers.

In [32]:
# Classifier sized by the vocabularies: the input width is the surname
# vocabulary size and the output width is the nationality vocabulary size.
# NOTE(review): the printed model below reports in_features=9042, which
# suggests the surname vocabulary holds whole surnames rather than characters
# - confirm against Vocabulary.from_df_series.
# NOTE(review): no random seed is set in the visible cells, so the loss values
# will presumably differ between runs.
mdl = SurnameClassifier(
    input_dim=len(surname_vocab),
    hidden_dim=300,
    output_dim=len(nationality_vocab)
).to(device)
# Cross-entropy loss on the model's raw outputs, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(mdl.parameters(), lr=0.01)
# Number of full passes over the training split.
num_epochs = 5

In [33]:
# Display the module repr to check the layer sizes.
mdl

SurnameClassifier(
  (fc1): Linear(in_features=9042, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=19, bias=True)
)

## The Training Routine


In [34]:
from tqdm import tqdm

In [35]:
for epoch in range(num_epochs):
    # Only the 'train' split is used; this loop runs no validation pass.
    dataset.set_split('train')
    # Put the model in training mode explicitly, so the loop still behaves
    # correctly if an earlier cell left the model in eval mode.
    mdl.train()

    # Per-batch loss values for this epoch.
    loss_log = []

    # NOTE(review): the value of dataset.n_batches(32) is passed as
    # `batch_size`. Judging by the name it may be a batch *count* rather than a
    # batch size - confirm against SurnameDataset / generate_batches.
    batch_generator = generate_batches(dataset, batch_size=dataset.n_batches(32), device=device)
    for batch_dict in tqdm(batch_generator):
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        # Forward pass on the float-cast inputs, then loss against the targets.
        logit = mdl(batch_dict['x'].float())
        loss = loss_fn(logit, batch_dict['y'])
        loss_log.append(loss.item())

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # Mean per-batch loss for the epoch. Computed in plain Python so the cell
    # does not depend on `np` being leaked into the namespace by a star
    # import; yields NaN if the generator produced no batches.
    running_loss = sum(loss_log) / len(loss_log) if loss_log else float('nan')

    print('epoch:{}, current running loss: {}'.format(epoch, running_loss))

32it [00:01, 17.02it/s]
2it [00:00, 17.43it/s]

epoch:0, current running loss: 2.25553472712636


32it [00:01, 17.31it/s]
2it [00:00, 17.74it/s]

epoch:1, current running loss: 1.3143062740564346


32it [00:01, 17.35it/s]
2it [00:00, 17.06it/s]

epoch:2, current running loss: 0.4609020045027137


32it [00:01, 17.25it/s]
2it [00:00, 16.91it/s]

epoch:3, current running loss: 0.12896277173422277


32it [00:01, 16.98it/s]

epoch:4, current running loss: 0.07962732284795493





## Predicting 

Predicting means applying the trained model to a vectorized surname and converting the resulting tensor back into text (a nationality label).

In [51]:
def predict(mdl, vectorizer, nationality_vocab, name, device=device):
    """Predict the nationality label for a single surname.

    Args:
        mdl: model called as ``mdl(x, predict_proba=True)``; presumably this
            returns per-class probabilities (confirm in SurnameClassifier).
        vectorizer: object whose ``vectorize(name)`` result can be passed to
            ``torch.tensor``.
        nationality_vocab: object whose ``token_for(index)`` maps a class
            index back to its label.
        name: the surname to classify.
        device: device the input tensor is moved to; defaults to the
            notebook-level ``device``.

    Returns:
        Tuple ``(predicted_nationality, score)`` where ``score`` is the
        largest value in the model output, as a Python float.
    """
    # Shape the vectorized name as a batch of one row.
    vectorized_name = torch.tensor(vectorizer.vectorize(name)).to(device).view(1, -1)

    # Inference only: switch to eval mode and disable autograd, then restore
    # whatever mode the model was in so a later training cell is unaffected.
    was_training = mdl.training
    mdl.eval()
    try:
        with torch.no_grad():
            res = mdl(vectorized_name.float(), predict_proba=True)
    finally:
        mdl.train(was_training)

    # Highest-scoring class along the class dimension and its score.
    prob_values, indices = res.max(dim=1)
    index = indices.item()

    predicted_nationality = nationality_vocab.token_for(index)
    return (predicted_nationality, prob_values.item())

In [52]:
# Sanity check on a single surname.
# NOTE: 'Fakhoury' is one of the training rows shown in the df.head() preview,
# so this checks a name the model has already seen, not generalisation.
predict(mdl, vectorizer=surname_vectorizer, nationality_vocab=nationality_vocab, name='Fakhoury')

('Arabic', 0.9998503923416138)

# References
1. Rao, Delip and McMahan, Brian. *Natural Language Processing with PyTorch*. O'Reilly Media, 2019.
2. https://github.com/joosthub/PyTorchNLPBook/tree/master/data