### Multi-Output Regressor (Transformer)

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

from numpy import array, hstack, math
from numpy.random import uniform

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('./data/fl_2022_LastName.csv.gz', nrows = 1000)

In [3]:
# build n-gram list
NGRAMS = 2
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(df.name_last)
vocab = vect.vocabulary_
len(vocab)

362

In [4]:
import operator
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = list(map(operator.itemgetter(0), sorted_vocab))

In [5]:
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df

Unnamed: 0,A,B,C,D,E,F,G,H,J,K,...,ye,yn,yo,za,ze,zh,zi,zo,zu,zz
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)

bb    214
al    191
Aa    188
ad    185
ar    181
     ... 
Bu      3
gt      3
r       3
pu      3
ja      3
Length: 362, dtype: int64

In [7]:
# sort n-gram by freq (highest -> lowest)
words = [(a[:, c].sum(), b) for b, c in vocab.items()]
words.sort(reverse=True)

words_list = ['UNK'] + [w[1] for w in words]
num_words = len(words_list)
print(f"num_words = {num_words}")

def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    word_indices = {w: idx for idx, w in enumerate(words_list)}
    wi = [word_indices.get(''.join(i), 0) for i in a]
    return wi

# build X from index of n-gram sequence
X = np.array(df.name_last.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = [len(x) for x in X]
max_feature_len = np.max(X_len)
avg_feature_len = np.mean(X_len)

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = df.iloc[:, 1:6]

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

num_words = 363
Max feature len = 25, Avg. feature len = 8


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence

max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')

sequences = [torch.tensor(seq) for seq in X_train]
X_train = pad_sequence(sequences, batch_first=True)
sequences = [torch.tensor(seq) for seq in X_test]
X_test = pad_sequence(sequences, batch_first=True)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

input_size = X_train.shape[1]
output_size = y_train.shape[1]

print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

800 train sequences
200 test sequences
Pad sequences (samples x time)
X_train shape: torch.Size([800, 25])
X_test shape: torch.Size([200, 19])
y_train shape: (800, 5)
y_test shape: (200, 5)


In [9]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [11]:
# Set random seed for reproducibility
torch.manual_seed(42)

# Convert the input and output data to tensors
X_train = torch.tensor(X_train, dtype=torch.long)  # Convert to long datatype
y_train = torch.tensor(np.array(y_train.values), dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.long)  # Convert to long datatype
y_test = torch.tensor(np.array(y_test.values), dtype=torch.float32)

class Transformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, nhead, num_layers=1):
        super(Transformer, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.pos_encoder = PositionalEncoding(hidden_size)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        embedded = self.embedding(input.type(torch.IntTensor).to(input.device))
        pos_encoded = self.pos_encoder(embedded)
        out = self.transformer_encoder(pos_encoded) # (batch, seq, feature)
        out = out[:, -1, :] # drop seq
        out = self.fc(out)
        return out
    
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm1 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)  # Remove .long() conversion here
        out, _ = self.lstm1(embedded)
        out, _ = self.lstm2(out)
        out = self.linear(out[:, -1, :])
        return out

# Define the model parameters
input_size = int(torch.max(torch.max(X_train), torch.max(X_test))) + 1  # Adjust input_size calculation
hidden_size = 64  # Number of hidden units
output_size = y_train.shape[1]  # Number of output variables

# Create an instance of the LSTM model
model = Transformer(input_size, hidden_size, output_size, nhead=4, num_layers=2)

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Set the batch size and number of epochs
batch_size = 32
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    # Shuffle the training data
    indices = torch.randperm(X_train.shape[0])
    X_train_shuffled = X_train[indices]
    y_train_shuffled = y_train[indices]

    # Mini-batch gradient descent
    for i in range(0, X_train.shape[0], batch_size):
        # Get the mini-batch
        inputs = X_train_shuffled[i:i+batch_size]
        targets = y_train_shuffled[i:i+batch_size]

        # Forward pass
        outputs = model(inputs)  # Remove unsqueeze(1) here
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the training loss for this epoch
    if (epoch + 1) % 1 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluation on the test set
with torch.no_grad():
    model.eval()
    test_outputs = model(X_test)  # Remove unsqueeze(1) here
    test_loss = criterion(test_outputs, y_test)
    print(f"Test Loss: {test_loss.item():.4f}")

  X_train = torch.tensor(X_train, dtype=torch.long)  # Convert to long datatype
  X_test = torch.tensor(X_test, dtype=torch.long)  # Convert to long datatype


Epoch [1/10], Loss: 0.1482
Epoch [2/10], Loss: 0.1092
Epoch [3/10], Loss: 0.1403
Epoch [4/10], Loss: 0.1183
Epoch [5/10], Loss: 0.1270
Epoch [6/10], Loss: 0.1323
Epoch [7/10], Loss: 0.1147
Epoch [8/10], Loss: 0.1175
Epoch [9/10], Loss: 0.1463
Epoch [10/10], Loss: 0.1109
Test Loss: 0.1165


In [None]:
predictions = model(X_test)

In [None]:
(torch.argmax(y_test, dim = 1).float() == torch.argmax(predictions, dim = 1).float()).float().mean()

In [None]:
pd.crosstab(torch.argmax(y_test, dim = 1), torch.argmax(predictions, dim = 1))

In [None]:
(torch.argmax(y_test, dim = 1) == 3).float().mean()

In [None]:
print("asian MSE: %.4f" % mean_squared_error(y_test[:, 0].detach().numpy(), predictions[:, 0].detach().numpy()))
print("hispanic MSE: %.4f" % mean_squared_error(y_test[:, 1].detach().numpy(), predictions[:, 1].detach().numpy()))
print("nh_black MSE: %.4f" % mean_squared_error(y_test[:, 2].detach().numpy(), predictions[:, 2].detach().numpy()))
print("nh_white MSE: %.4f" % mean_squared_error(y_test[:, 3].detach().numpy(), predictions[:, 3].detach().numpy()))

In [None]:
y_test.shape

In [None]:
print("base asian MSE: %.4f" % mean_squared_error(y_test[:, 0], torch.mean(y_test[:, 0]).repeat(y_test.shape[0])))
print("base white MSE: %.4f" % mean_squared_error(y_test[:, 3], torch.mean(y_test[:, 3]).repeat(y_test.shape[0])))

In [None]:
torch.save(model, 'models/fl_last_name_multioutput.pt')