<a href="https://colab.research.google.com/github/anoted/Assignment-4/blob/main/scripts/assignment_4_part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part III
Using the previous two tutorials, please answer the following using an encoder-decoder approach and, for comparison, an LSTM approach.

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets for classifying names as male or female. In subsequent iterations, this could be expanded to include more classes.

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment.

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [118]:
!pip install keras-nightly



In [119]:
### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###

import tensorflow as tf
import string
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras import ops
from keras import layers
import random
import re
import nltk
nltk.download('names')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [120]:
def get_angles(pos, i, d_model):
  """Return the positional-encoding angles `pos * rate(i)`.

  The rate for dimension index `i` is `1 / 10000 ** (2 * (i // 2) / d_model)`,
  so each consecutive (even, odd) pair of indices shares one rate.
  `pos` and `i` may be scalars or NumPy arrays; the product follows normal
  NumPy broadcasting.
  """
  pair_index = i // 2
  exponent = (2 * pair_index) / np.float32(d_model)
  rates = 1 / np.power(10000, exponent)
  return pos * rates

def positional_encoding(position, d_model):
  """Build a sinusoidal positional-encoding table as a float32 tensor.

  Angles come from `get_angles` for every (position, dimension) pair; even
  dimension columns are replaced by their sine and odd columns by their
  cosine. A leading axis of size 1 is added before casting, so the result has
  shape (1, position, d_model).
  """
  row_positions = np.arange(position)[:, np.newaxis]
  column_dims = np.arange(d_model)[np.newaxis, :]
  angle_rads = get_angles(row_positions, column_dims, d_model)

  even_cols = slice(0, None, 2)   # dimensions 2i   -> sin
  odd_cols = slice(1, None, 2)    # dimensions 2i+1 -> cos
  angle_rads[:, even_cols] = np.sin(angle_rads[:, even_cols])
  angle_rads[:, odd_cols] = np.cos(angle_rads[:, odd_cols])

  return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)



___


In [139]:
# name settings
vocab_size = 27                     # 26 letters + index 0, which pad_sequences uses for padding
maxlen = 9                          # every encoded name is padded/truncated to this length
letters = string.ascii_lowercase    # vocabulary
names = nltk.corpus.names           # name database
val_per_class = 200                 # names held out per gender for validation
seed = 42                           # fixes the shuffle so the split is reproducible
male_label = 1                      # 1 -> male
female_label = 0                    # 0 -> female


def encode_name(name):
    """Encode a name as a list of 1-based alphabet indices (a=1 ... z=26).

    Characters other than A-Z/a-z are dropped and case is ignored. Index 0 is
    never produced, so it stays free for padding.
    """
    cleaned = re.sub(r'[^A-Za-z]', '', name).lower()
    return [letters.index(letter) + 1 for letter in cleaned]


# Copy the corpus word lists before shuffling them in place.
male_names = list(names.words('male.txt'))
female_names = list(names.words('female.txt'))

random.seed(seed)
random.shuffle(male_names)          # shuffle so the held-out tail is a random sample
random.shuffle(female_names)

# The last `val_per_class` names of each shuffled list form the validation set.
male_split = len(male_names) - val_per_class
female_split = len(female_names) - val_per_class
male_train, male_val = male_names[:male_split], male_names[male_split:]
female_train, female_val = female_names[:female_split], female_names[female_split:]

# NOTE(review): the markdown above shows that some names occur in both
# male.txt and female.txt; those names get both labels and may land on both
# sides of the split. Behaviour is kept as in the original.

# training set: all male names first, then all female names
x_train = [encode_name(name) for name in male_train + female_train]
y_train = [male_label] * len(male_train) + [female_label] * len(female_train)

# validation set: same ordering
x_val = [encode_name(name) for name in male_val + female_val]
y_val = [male_label] * len(male_val) + [female_label] * len(female_val)

# Pad (or truncate) every encoded name to exactly `maxlen` indices.
x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen)

# Labels as NumPy arrays for model.fit.
y_train = np.array(y_train)
y_val = np.array(y_val)


In [140]:
# Model definition
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition
    and layer normalization.

    Args:
        embed_dim: size of the last axis of the input; also used as the
            `key_dim` of the attention layer and as the output width of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: width of the hidden (ReLU) layer in the feed-forward network.
        rate: dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # attention sub-layer
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # feed-forward sub-layer: expand to ff_dim, project back to embed_dim
        hidden_layer = layers.Dense(ff_dim, activation="relu")
        projection_layer = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden_layer, projection_layer])
        # one normalization and one dropout per sub-layer
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply the block to `inputs` and return the normalized result."""
        # Self-attention: the inputs serve as both query and value.
        attended = self.dropout1(self.att(inputs, inputs))
        residual = self.layernorm1(inputs + attended)
        transformed = self.dropout2(self.ffn(residual))
        return self.layernorm2(residual + transformed)

class TokenAndPositionEmbedding(layers.Layer):
    """Add a position embedding to a token embedding.

    Both tables are `layers.Embedding` instances: one indexed by token id
    (`vocab_size` rows) and one indexed by position (`maxlen` rows), each
    producing vectors of size `embed_dim`.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)  # one vector per token id
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)        # one vector per position

    def call(self, x):
        """Return token embeddings of `x` plus the embeddings of positions 0..len-1."""
        seq_len = ops.shape(x)[-1]  # length is read from the input's last axis
        position_vectors = self.pos_emb(ops.arange(start=0, stop=seq_len, step=1))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors


In [141]:
# hyper-parameters
embed_dim = 64   # size of each token/position embedding vector
num_heads = 3    # attention heads in the transformer block
ff_dim = 32      # hidden width of the block's feed-forward network

# encoder: embeddings followed by one transformer block
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

# classification head: pool over positions, then a small dense layer
head_layers = [
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
]
for head_layer in head_layers:
    x = head_layer(x)

# two-way softmax output, one unit per class label
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [148]:
# Integer class labels with a softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train and keep the per-epoch metrics in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)


Epoch 1/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.7983 - loss: 0.4182 - val_accuracy: 0.7750 - val_loss: 0.4671
Epoch 2/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.7916 - loss: 0.4262 - val_accuracy: 0.7425 - val_loss: 0.5051
Epoch 3/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.7960 - loss: 0.4238 - val_accuracy: 0.7575 - val_loss: 0.4540
Epoch 4/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.7962 - loss: 0.4148 - val_accuracy: 0.7500 - val_loss: 0.4572
Epoch 5/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8025 - loss: 0.4111 - val_accuracy: 0.7550 - val_loss: 0.4619
Epoch 6/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.8014 - loss: 0.4034 - val_accuracy: 0.7775 - val_loss: 0.4454
Epoch 7/10
[1m236/23

In [149]:
def prep_name(name, maxlen=maxlen):
    """Encode a single name as a padded batch of letter indices.

    Characters other than A-Z/a-z are removed, the remainder is lowercased,
    and each letter is mapped to its 1-based position in `letters`. The one
    resulting sequence is passed to `keras.utils.pad_sequences` with `maxlen`.
    """
    cleaned = re.sub(r'[^A-Za-z]', '', name).lower()
    encoded = [letters.index(ch) + 1 for ch in cleaned]  # alphabet index 1-26
    return keras.utils.pad_sequences([encoded], maxlen=maxlen)

In [156]:
# Names used for a quick manual check of the trained model.
test_names = [
    "Robin",
    "Sam",
    "Abigail",
    "Elliot",
    "Haley",
    "Harvey",
    "Leah",
    "Penny",
]

In [184]:
# Classify each sample name and print the predicted gender.
for name in test_names:
    # prep_name already pads its single-name batch to `maxlen`, so the result
    # goes straight to the model; a second pad_sequences call is not needed.
    predicted = model.predict(prep_name(name), verbose=0)
    # Class index 1 is the label used for male names, 0 for female names.
    print(name, "    \t",   "male" if np.argmax(predicted) == 1 else "female" )
    print("------------------------")
### END CODE HERE ###

Robin     	 female
------------------------
Sam     	 male
------------------------
Abigail     	 female
------------------------
Elliot     	 male
------------------------
Haley     	 female
------------------------
Harvey     	 male
------------------------
Leah     	 female
------------------------
Penny     	 female
------------------------


# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources