## Import Dependencies

In [1]:
# Import Dependencies
import glob
import os
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

## Load and Visualize Data

In [2]:
# Preview the Arabic names file
# header=None: the first line of the file is read as data, not as column names
df = pd.read_csv(
    filepath_or_buffer='./data/names/Arabic.txt',
    header=None,
)
# Display the first five rows
df.head(n=5)

Unnamed: 0,0
0,Khoury
1,Nahas
2,Daher
3,Gerges
4,Nazari


In [3]:
# Preview the English names file
# header=None: the first line of the file is read as data, not as column names
df = pd.read_csv(
    filepath_or_buffer='./data/names/English.txt',
    header=None,
)
# Display the first five rows
df.head(n=5)

Unnamed: 0,0
0,Abbas
1,Abbey
2,Abbott
3,Abdi
4,Abel


In [4]:
# Preview the French names file
# header=None: the first line of the file is read as data, not as column names
df = pd.read_csv(
    filepath_or_buffer='./data/names/French.txt',
    header=None,
)
# Display the first five rows
df.head(n=5)

Unnamed: 0,0
0,Abel
1,Abraham
2,Adam
3,Albert
4,Allard


## Create { Language: [Names] } Dictionary

In [5]:
# Build the {language: [names]} lookup from the per-language text files.
# The language label is the file name without its extension,
# e.g. './data/names/Arabic.txt' -> 'Arabic'.
categorical_names = {}
categories = []
# glob returns paths in arbitrary order; sorting keeps the position of each
# language in `categories` reproducible between runs and machines.
for fileName in sorted(glob.glob('./data/names/*')):
    # basename/splitext instead of splitting on '/': this does not depend on
    # the directory depth or on the platform's path separator.
    dictKey = os.path.splitext(os.path.basename(fileName))[0]
    categories.append(dictKey)
    # The context manager closes the file handle once the names are read.
    with open(fileName, encoding='utf-8') as nameFile:
        txt = nameFile.read().strip().split('\n')
    categorical_names[dictKey] = txt

In [6]:
# Show the first five names stored under the 'Czech' key
categorical_names['Czech'][:5]

['Abl', 'Adsit', 'Ajdrna', 'Alt', 'Antonowitsch']

In [7]:
# Show the first five names stored under the 'Arabic' key
categorical_names['Arabic'][:5]

['Khoury', 'Nahas', 'Daher', 'Gerges', 'Nazari']

In [8]:
# Show the first five names stored under the 'English' key
categorical_names['English'][:5]

['Abbas', 'Abbey', 'Abbott', 'Abdi', 'Abel']

## Convert Names to Torch Tensors

In [9]:
# One-hot encoding of names

# Each letter of a name is encoded as a one-hot vector of size
# [1 x num_letters], where num_letters = len(all_letters).

# Supported alphabet: the English letters [a-z, A-Z] followed by the space
# character and the punctuation characters . , ; '
all_letters = string.ascii_letters + " .,;'"

def letterToTensor(letter=None):
    """Return the one-hot encoding of a single character.

    Args:
        letter: A one-character string.

    Returns:
        A ``torch.long`` tensor of size [1 x len(all_letters)] with a 1 at
        the position of ``letter`` in ``all_letters``. If ``letter`` is not
        exactly one character of ``all_letters`` (e.g. an accented letter or
        the empty string), the tensor is all zeros.

    Raises:
        TypeError: If ``letter`` is not a string (including the default None).
    """
    # Start from an all-zero row of size [1 x num_letters]
    letter_tensor = torch.zeros(1, len(all_letters), dtype=torch.long)
    # str.find returns -1 for a missing character and 0 for the empty string.
    # Used directly as an index, those would mark the last slot (the
    # apostrophe) or the first slot ('a'), so only accept a real match of a
    # single character.
    letter_index = all_letters.find(letter) if len(letter) == 1 else -1
    if letter_index >= 0:
        letter_tensor[0][letter_index] = 1
    return letter_tensor


# To get the one-hot encoding of a complete name, stack the one-hot vectors of its letters
# Resulting tensor size: [name_length x batch_size x num_letters]
def nameToTensor(name=None):
    """Return the one-hot encoding of every character in ``name``.

    Args:
        name: The string to encode.

    Returns:
        A ``torch.long`` tensor of size [len(name) x 1 x len(all_letters)];
        slice ``i`` is the encoding that ``letterToTensor`` produces for
        ``name[i]``. An empty string yields a tensor with zero rows.

    Raises:
        TypeError: If ``name`` has no length (including the default None).
    """
    # Initialize with zeros: [name_length x batch_size = 1 x num_letters]
    name_tensor = torch.zeros(len(name), 1, len(all_letters), dtype=torch.long)
    for i, letter in enumerate(name):
        # Copy the letter's [1 x num_letters] one-hot row into position i.
        # The row must be assigned, not used as an index: indexing with it is
        # advanced indexing by its 0/1 values, which sets slots 0 and 1 of
        # every row no matter which letter is being encoded.
        name_tensor[i] = letterToTensor(letter=letter)
    return name_tensor