# ethnicity-pytorch

Implementation of the model proposed in
[Name Nationality Classification with Recurrent Neural Networks (Lee et al., IJCAI 2017)](https://www.ijcai.org/proceedings/2017/0289)
in PyTorch.

Here is the [Tensorflow implementation](https://github.com/jhyuklee/ethnicity-tensorflow).

### Package Requirements
* python 3.6.4.
* pytorch 1.3.1+cu92.
* re(regex) 2.2.1.
* numpy 1.17.0.
* json 2.0.9.
* tqdm (optional) 4.38.0.
* gensim 3.8.1

### Data
Many thanks to [https://github.com/jhyuklee/ethnicity-tensorflow](https://github.com/jhyuklee/ethnicity-tensorflow) for the data.
A collection of ~10000 sample pairs of (name, nationality) with ethnicity, plus ~3000 validation and testing samples of
the same form.

### How to run 
`config.json` can be edited to tweak the model, change the running mode (train/test), change the lr decay rate, etc.
`globals.py` is used to change the paths and to add / edit global variables.

Note: In `config.json`, the *Vocab_len* keys specify the sizes of the n-gram idx2grams and grams2idx mappings. The *embed_dim* key specifies the embedding dimension.

To run the code: `python run.py`

In [1]:
import dataset
import models
import globals

In [2]:
import os
from tqdm import tqdm
import json
import torch
import torch.optim as optim
import torch.nn as nn
import time

In [3]:
# Defining hyperparameters: read the "training_params" section of config.json.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open('config.json') as config_file:
    training_params = json.load(config_file)["training_params"]
print(training_params)

{'is_train': 'True', 'start_epoch': 201, 'batch_size': 32, 'end_epoch': 300, 'decay_rate': 0.99, 'train_embeddings': 'True', 'early_stopping': 'True', 'early_stop_window': 15}


In [4]:
# Build the model, then cast its parameters to float32 and move it to the
# device configured in globals.py.
Nationality_model = models.Nationality_Model()
Nationality_model = Nationality_model.float()
Nationality_model = Nationality_model.to(globals.device)
print(globals.device, "is used")

cpu is used


In [5]:
# Build the index -> country-name dictionary used to render predictions.
# create_country_dict('idx') returns a sequence; element 1 is the mapping
# keyed by integer class index.
country_dicts = dataset.create_country_dict('idx')
idx2country = country_dicts[1]
print(idx2country)

{0: 'Algeria', 1: 'Argentina', 2: 'Australia', 3: 'Austria', 4: 'Azerbaijan', 5: 'Bahamas', 6: 'Belarus', 7: 'Belgium', 8: 'Brazil', 9: 'Bulgaria', 10: 'Canada', 11: 'Chile', 12: 'Chinese Taipei', 13: 'Colombia', 14: 'Croatia', 15: 'Cuba', 16: 'Czech Republic', 17: 'Czechoslovakia', 18: "Democratic People's Republic Of Korea", 19: 'Denmark', 20: 'Egypt', 21: 'Estonia', 22: 'Ethiopia', 23: 'Federal Republic Of Germany (1950-1990, "GER" Since) ', 24: 'Finland', 25: 'France', 26: 'Georgia', 27: 'German Democratic Republic (1955-1990, ', 28: 'Germany', 29: 'Great Britain', 30: 'Greece', 31: 'Haiti', 32: 'Hungary', 33: 'Iceland', 34: 'India', 35: 'Indonesia', 36: 'Ireland', 37: 'Islamic Republic Of Iran', 38: 'Israel', 39: 'Italy', 40: 'Jamaica', 41: 'Japan', 42: 'Kazakhstan', 43: 'Kenya', 44: 'Latvia', 45: 'Liechtenstein', 46: 'Lithuania', 47: 'Malaysia', 48: 'Mexico', 49: 'Mixed Team', 50: 'Mongolia', 51: 'Montenegro', 52: 'Morocco', 53: 'Netherlands', 54: 'New Zealand', 55: 'Nigeria', 56

In [6]:
# Loading the most recent saved model.
# Checkpoints are stored as 'LSTM_Model1_<epoch>.pt'; collect the epoch numbers
# of every file that matches that pattern and ignore anything else that happens
# to live in the weights directory (e.g. '.gitkeep', editor backups).
raw_weights_list = os.listdir(globals.WEIGHTS_DIR)
checkpoint_prefix = 'LSTM_Model1_'
checkpoint_suffix = '.pt'
weights_list = []  # Epoch numbers of the available checkpoints
for checkpoint_name in raw_weights_list:
    if not (checkpoint_name.startswith(checkpoint_prefix)
            and checkpoint_name.endswith(checkpoint_suffix)):
        continue
    # Slice the suffix off explicitly: str.rstrip('.pt') strips a *set of
    # characters*, not a suffix, and is easy to get wrong.
    epoch_text = checkpoint_name[len(checkpoint_prefix):-len(checkpoint_suffix)]
    if epoch_text.isdecimal():
        weights_list.append(int(epoch_text))
weights_list.sort()

if weights_list:
    # Getting the saved model with the highest epoch number
    saved_model = torch.load(
        os.path.join(globals.WEIGHTS_DIR, 'LSTM_Model1_{}.pt'.format(weights_list[-1])),
        map_location=globals.device,
    )
else:
    # No checkpoint on disk yet: leave a sentinel instead of raising IndexError
    saved_model = None
print(saved_model)

OrderedDict([('embedding_layers.0.weight', tensor([[ 0.0898,  0.0725,  0.1353,  ..., -0.2049, -0.1760,  0.2540],
        [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
        [ 0.0665,  1.3495,  0.3942,  ..., -0.1252, -0.8352, -0.9232],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])), ('embedding_layers.1.weight', tensor([[ 0.0557,  0.1028,  0.0971,  ...,  0.1118, -0.1347, -0.1201],
        [ 0.9754,  0.8708,  1.0123,  ...,  0.9374,  1.0041,  0.9794],
        [-0.3504, -0.7471, -0.7305,  ..., -0.6514, -1.1134,  0.8742],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])), ('embedding_layers.2.weight', tensor([[ 0.0295, -0.0817

In [7]:
# Temporarily clear the checkpoint list so the model is retrained from scratch
weights_list = []

# Resume from the newest checkpoint when one is listed; otherwise start from
# freshly initialised embeddings.
if weights_list:
    print("Loading weights for training epoch {}".format(weights_list[-1]))
    Nationality_model.load_state_dict(saved_model)
else:
    models.initialize_embeddings(Nationality_model.embedding_layers, globals.device)

Intializing embeddings for unigram
[['$', 'f', 'r', 'i', 't', 'z', '$', ' ', '+', 's', 't', 'ö', 'c', 'k', 'l', 'i', '+'], ['$', 'v', 'i', 'v', 'i', 'a', 'n', '$', ' ', '$', 'j', 'e', 'p', 'k', 'e', 'm', 'o', 'i', '$', ' ', '+', 'c', 'h', 'e', 'r', 'u', 'i', 'y', 'o', 't', '+'], ['$', 't', 'i', 'b', 'o', 'r', '$', ' ', '+', 'p', 'e', 'z', 's', 'a', '+'], ['$', 'j', 'a', 'c', 'k', 's', 'o', 'n', '$', ' ', '+', 'r', 'i', 'c', 'h', 'a', 'r', 'd', 's', 'o', 'n', '+'], ['$', 'a', 'r', 't', 'u', 'r', '$', ' ', '$', 'j', 'o', 'h', 'n', '$', ' ', '+', 'd', 'a', 'r', 'b', 'y', '+'], ['$', 't', 'o', 'm', 'o', 'a', 'k', 'i', '$', ' ', '+', 's', 'a', 't', 'o', '+'], ['$', 'a', 'n', 't', 'o', 'n', 'i', 'a', '$', ' ', '+', 'm', 'o', 'r', 'a', 'i', 't', 'i', '+'], ['$', 'm', 'a', 'r', 'k', '$', ' ', '+', 'h', 'u', 'n', 't', 'e', 'r', '+'], ['$', 't', 'o', 'd', 'd', '$', ' ', '+', 'b', 'r', 'o', 's', 't', '+'], ['$', 'i', 'm', 'r', 'e', '$', ' ', '+', 'f', 'ö', 'l', 'd', 'i', '+'], ['$', 'd', 'a', 'm'

TypeError: __init__() got an unexpected keyword argument 'iter'

In [8]:
# Freeze the embedding tables when the config disables embedding training.
# The flag is stored in config.json as the string "True"/"False", not a bool.
if training_params["train_embeddings"] == "False":
    print("Embeddings are not trainable")
    for embedding_layer in Nationality_model.embedding_layers:
        embedding_layer.weight.requires_grad_(False)

In [9]:
# Display the configured run mode; the flag is stored as the string 'True'/'False', not a bool
training_params["is_train"]

'True'

In [13]:
def _score_predictions(model, loader):
    """Run `model` over `loader` without gradients and score top-1 predictions.

    Each batch is a sequence of tensors: the first four are passed to the model
    as a list and the fifth is the target class index.

    Returns:
        (correct, records): `correct` is the number of samples whose arg-max
        prediction equals the target; `records` is a list of
        (sample_index, predicted_idx, actual_idx) tuples in loader order.

    NOTE(review): `.item()` only works on single-element tensors, so `loader`
    must yield one sample per batch — presumably the default batch size of
    `dataset.create_dataloader`; confirm there.
    """
    correct = 0
    records = []
    with torch.no_grad():
        for idx, data_sample in enumerate(loader):
            data_sample = [items.to(globals.device).long() for items in data_sample]
            output = model(data_sample[:4])
            _, top1pred = torch.max(output, 1)
            predicted, actual = top1pred.item(), data_sample[4].item()
            if predicted == actual:
                correct += 1
            records.append((idx, predicted, actual))
    return correct, records


def _write_prediction_rows(log_file, records):
    """Write one 'Index / Predicted / Actual' line per record, using country names."""
    for sample_idx, predicted, actual in records:
        log_file.write('Index : {}, Predicted : {}, Actual : {}\n'
                       .format(sample_idx, idx2country[predicted], idx2country[actual]))


# Run either the training loop (with per-epoch validation) or a single test
# pass, depending on the string flag training_params["is_train"].
if training_params["is_train"] == "True":
    print("----------------------------------------------------------------")
    print("Mode : Train")

    # Creating Dataset
    print("Creating Training Dataset")
    train_loader = dataset.create_dataloader('train', training_params["batch_size"], shuffle=True)
    valid_loader = dataset.create_dataloader('valid', shuffle=False)

    # Training
    print("Training the model")
    optimizer = optim.Adam(Nationality_model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Context managers guarantee both log files are closed even when an epoch
    # raises part-way through.
    with open(os.path.join(globals.LOG_DIR, 'training_logs.txt'), 'a') as train_logs, \
            open(os.path.join(globals.LOG_DIR, 'validation_logs.txt'), 'a') as valid_logs:

        max_accuracy = 0  # Best validation hit count seen so far in this run

        # lr initializing: replay the decay of the epochs before start_epoch so
        # a resumed run continues on the same learning-rate schedule.
        for param_groups in optimizer.param_groups:
            param_groups['lr'] = param_groups['lr'] * (training_params['decay_rate']**(training_params["start_epoch"] - 1))

        print('------------------------------------------------------------------------------')

        for epoch in range(training_params["start_epoch"], training_params["end_epoch"] + 1):

            # The learning rate decays once per epoch, before the epoch trains
            for param_groups in optimizer.param_groups:
                param_groups['lr'] = param_groups['lr'] * training_params['decay_rate']
            print("Updated learning rate : ", optimizer.param_groups[0]['lr'])
            start_time = time.time()

            # TRAINING
            total_loss = 0
            Nationality_model.train()
            for idx, data_sample in enumerate(train_loader):
                Nationality_model.zero_grad()
                data_sample = [items.to(globals.device).long() for items in data_sample]
                output = Nationality_model(data_sample[:4])
                loss = criterion(output, data_sample[4])
                loss.backward()
                # Clip the global gradient norm to 5 before the optimizer step
                nn.utils.clip_grad_norm_(Nationality_model.parameters(), 5)
                optimizer.step()
                total_loss += loss.item()
            string = 'Training Epoch : {}/{}, Epoch Loss : {}' \
                    .format(epoch, training_params["end_epoch"], total_loss/len(train_loader))
            print(string)
            train_logs.write(string+"\n")
            train_logs.flush()  # Keep the log readable while training is still running
            if epoch % 2 == 0:
                # Checkpoint every second epoch
                torch.save(Nationality_model.state_dict(), os.path.join(globals.WEIGHTS_DIR, "LSTM_Model1_{}.pt".format(epoch)))

            # VALIDATION
            Nationality_model.eval()
            accuracy, correct_list = _score_predictions(Nationality_model, valid_loader)
            if accuracy > max_accuracy:
                max_accuracy = accuracy
                # Overwrite the best-result report with this epoch's predictions
                with open(os.path.join(globals.LOG_DIR, 'best_metric_results.txt'), 'w') as best_metric_logs:
                    best_metric_logs.write("EPOCH: {}/{}, Accuracy : {}\n"
                                           .format(epoch, training_params["end_epoch"], accuracy*100/len(valid_loader)))
                    best_metric_logs.write('------------------------------------------------------------------\n\n')
                    _write_prediction_rows(best_metric_logs, correct_list)
            string = 'Validating Epoch : {}/{}, Accuracy : {}'\
                        .format(epoch, training_params["end_epoch"], accuracy*100/len(valid_loader))
            print(string)
            valid_logs.write(string+'\n')
            valid_logs.flush()
            print("Time for completion of epoch : {} seconds".format((time.time()-start_time)))
            print("------------------------------------------------------------------")

else:
    print("----------------------------------------------------------------")
    print("Mode : Test")

    # Creating Dataset
    print("Creating Testing Dataset")
    test_loader = dataset.create_dataloader('test', shuffle=False)

    with open(os.path.join(globals.LOG_DIR, 'testing_logs.txt'), 'w') as test_logs:
        Nationality_model.eval()
        print("Testing the model")
        accuracy, correct_list = _score_predictions(Nationality_model, test_loader)
        # weights_list may be empty (no checkpoint was loaded); indexing it
        # unconditionally would raise IndexError before any result is reported.
        trained_epochs = weights_list[-1] if weights_list else "unknown"
        string = 'Testing Accuracy after training for {} epochs : {}' \
                  .format(trained_epochs, accuracy*100/len(test_loader))
        print(string)
        test_logs.write(string+'\n')
        test_logs.write('------------------------------------------------------------------\n\n')
        _write_prediction_rows(test_logs, correct_list)
        test_logs.write('------------------------------------------------------------------\n\n')

----------------------------------------------------------------
Mode : Train
Creating Training Dataset
Training the model
------------------------------------------------------------------------------
Updated learning rate :  0.0001326398781093821


IndexError: index out of range in self