### Multi-Output Regressor (DL)

In [22]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

from numpy import array, hstack, math
from numpy.random import uniform

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

In [23]:
%%time
NGRAMS = 2

df = pd.read_csv("data/fl_reg_name_race_2022.csv.gz", engine = "pyarrow")
df.dropna(subset=['name_last'], inplace=True)

# We assume unknown as missing at random
sdf = df[df.race.isin(['unknown']) == False]
del df

# Setting consistent case for names
sdf['name_last'] = sdf.name_last.str.strip().str.lower()

sdf

CPU times: user 1min 3s, sys: 4.59 s, total: 1min 8s
Wall time: 1min 3s


Unnamed: 0,name_last,name_first,race
0,hessler-smith,Jason,nh_white
1,rogers,Renee,nh_white
2,bartolome,Crystal,nh_white
3,bailey,Donna,nh_white
4,carlson,Greggory,nh_white
...,...,...,...
15455105,ballew,Christina,nh_white
15455106,watts,Mark,nh_white
15455107,mcrae,Evelyn,nh_white
15455108,ward,Stephanie,nh_white


In [24]:
# Define the mapping for recoding
mapping = {'multi_racial': 'other', 'native_indian': 'other'}

# Recode the 'Category' column using replace()
sdf['race'] = sdf['race'].replace(mapping)
sdf['race'].value_counts()

race
nh_white    9446851
hispanic    2722580
nh_black    2086584
other        424309
asian        329047
Name: count, dtype: int64

In [25]:
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count'])
gdf_sm = gdf[(gdf['count'] >= 5) & (gdf['name_last'].str.len() > 1)]
gdf_sm.shape

(246167, 3)

In [26]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf_pv = gdf_sm.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf_pv = gdf_pv.fillna(0)

# Getting the totals of each last name
gdf_pv['total_n'] = gdf_pv.sum(axis=1)
gdf_pv.reset_index(inplace=True)
gdf_pv

race,name_last,asian,hispanic,nh_black,nh_white,other,total_n
0,a'hearn,0.0,0.0,0.0,6.0,0.0,6.0
1,aabel,0.0,0.0,0.0,7.0,0.0,7.0
2,aaberg,0.0,0.0,0.0,13.0,0.0,13.0
3,aadland,0.0,0.0,0.0,9.0,0.0,9.0
4,aagaard,0.0,0.0,0.0,32.0,0.0,32.0
...,...,...,...,...,...,...,...
202635,zysman,0.0,0.0,0.0,10.0,0.0,10.0
202636,zytczak,0.0,0.0,0.0,5.0,0.0,5.0
202637,zywczyk,0.0,0.0,0.0,6.0,0.0,6.0
202638,zywica,0.0,0.0,0.0,6.0,0.0,6.0


In [27]:
%%time
# Calculate the proportion of people with a particular last name
#  that identify with various races/ethnicity
gdf_pv.iloc[:, 1:] = gdf_pv.iloc[:, 1:].div(gdf_pv.total_n, axis=0)
gdf_pv

CPU times: user 13.7 ms, sys: 0 ns, total: 13.7 ms
Wall time: 12.6 ms


race,name_last,asian,hispanic,nh_black,nh_white,other,total_n
0,a'hearn,0.0,0.0,0.0,1.0,0.0,1.0
1,aabel,0.0,0.0,0.0,1.0,0.0,1.0
2,aaberg,0.0,0.0,0.0,1.0,0.0,1.0
3,aadland,0.0,0.0,0.0,1.0,0.0,1.0
4,aagaard,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...
202635,zysman,0.0,0.0,0.0,1.0,0.0,1.0
202636,zytczak,0.0,0.0,0.0,1.0,0.0,1.0
202637,zywczyk,0.0,0.0,0.0,1.0,0.0,1.0
202638,zywica,0.0,0.0,0.0,1.0,0.0,1.0


In [28]:
gdf_pv.shape

(202640, 7)

In [29]:
X = gdf_pv.name_last
y = gdf_pv.iloc[:, 1:6]
y

race,asian,hispanic,nh_black,nh_white,other
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
202635,0.0,0.0,0.0,1.0,0.0
202636,0.0,0.0,0.0,1.0,0.0
202637,0.0,0.0,0.0,1.0,0.0
202638,0.0,0.0,0.0,1.0,0.0


In [30]:
xtrain, xtest, ytrain, ytest=train_test_split(X, y, test_size=0.15)
print("xtrain:", xtrain.shape, "ytrain:", ytrain.shape)
print("xtest:", xtest.shape, "ytest:", ytest.shape)

xtrain: (172244,) ytrain: (172244, 5)
xtest: (30396,) ytest: (30396, 5)


In [31]:
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(1, NGRAMS), lowercase=False) 

X_combined_transformed = vect.fit_transform(X)

# Transform the training data
xtrain = X_combined_transformed[:xtrain.shape[0]]

# Transform the test data
xtest = X_combined_transformed[xtrain.shape[0]:]

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

input_size = xtrain.toarray().shape[1]
output_size = ytrain.shape[1]

# Convert the data to PyTorch tensors
input_data = torch.tensor(xtrain.toarray(), dtype=torch.float32)
output_data = torch.tensor(ytrain.values, dtype=torch.float32)

testx_data = torch.tensor(xtest.toarray(), dtype=torch.float32)
testy_data = torch.tensor(ytest.values, dtype=torch.float32)

# Define the neural network model
class MultiOutputRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(MultiOutputRegression, self).__init__()
        self.linear1 = nn.Linear(input_size, 128)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(64, output_size)
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        return x

# Set random seed for reproducibility
torch.manual_seed(42)

# Define the model
model = MultiOutputRegression(input_size, output_size)

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 5000
best_loss = float('inf')
patience = 2000
counter = 0

for epoch in range(num_epochs):
    # Forward pass
    outputs = model(input_data)
    loss = criterion(outputs, output_data)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print progress
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Calculate test loss
    with torch.no_grad():
        test_outputs = model(testx_data)
        test_loss = criterion(testy_data, test_outputs)

    # Check for early stopping
    if test_loss < best_loss:
        best_loss = test_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

Epoch [100/10000], Loss: 0.0630
Epoch [200/10000], Loss: 0.0530
Epoch [400/10000], Loss: 0.0471
Epoch [700/10000], Loss: 0.0451
Epoch [800/10000], Loss: 0.0447
Epoch [900/10000], Loss: 0.0433
Epoch [1000/10000], Loss: 0.0439
Early stopping at epoch 1047


In [40]:
ytrain.shape[1]

5

In [41]:
test_data = xtest.toarray()
test_data = torch.tensor(test_data, dtype=torch.float32)
predictions = model(test_data)

# Convert predictions to numpy array
predictions = predictions.detach().numpy()

In [42]:
(np.argmax(ytest.values, axis = 1) == np.argmax(predictions, axis = 1)).mean()

0.6924924332148967

In [43]:
pd.crosstab(np.argmax(ytest.values, axis = 1), np.argmax(predictions, axis = 1))

col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,97,0,602
1,0,527,6,3494
2,0,223,3,1596
3,1,3145,40,20519
4,0,18,0,125


In [44]:
normed = predictions/np.sum(predictions, axis = 1).reshape(-1, 1)

In [45]:
(np.argmax(ytest.values, axis = 1) == 3).mean()

0.7798723516252138

In [46]:
print("white normed MSE:%.4f" % mean_squared_error(ytest.iloc[:,3], normed[:,3]))

white normed MSE:0.3953


In [47]:
print("asian MSE:%.4f" % mean_squared_error(ytest.iloc[:,0], predictions[:,0]))
print("hispanic MSE:%.4f" % mean_squared_error(ytest.iloc[:,1], predictions[:,1]))
print("nh_black MSE:%.4f" % mean_squared_error(ytest.iloc[:,2], predictions[:,2]))
print("nh_white MSE:%.4f" % mean_squared_error(ytest.iloc[:,3], predictions[:,3]))

asian MSE:0.0193
hispanic MSE:0.2830
nh_black MSE:0.0566
nh_white MSE:0.3943


In [50]:
print("base asian MSE:%.4f" % mean_squared_error(ytest.iloc[:,0], [np.mean(ytest.iloc[:,0]) for _ in range(30396)]))
print("base white MSE:%.4f" % mean_squared_error(ytest.iloc[:,3], [np.mean(ytest.iloc[:,3]) for _ in range(30396)]))

base asian MSE:0.0189
base white MSE:0.1553


In [49]:
torch.save(model, 'v2/fl_last_name_multioutput.pt')