In [1]:
# Import TensorFlow and the Keras API it ships, then print both versions so
# the environment used for this run is recorded in the cell output.
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(keras.__version__)

2.5.1
2.5.0


In [2]:
from nonconformist.nc import NcFactory
from nonconformist.cp import IcpClassifier

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Run configuration
NGRAMS = 2          # character n-gram size (used by the vectorizer and find_ngrams)
SAMPLE = 1000000    # number of voter rows drawn for this experiment
EPOCHS = 15         # training epochs passed to model.fit

# Florida voter registration data: keep rows that have both names, drop the
# race categories excluded from this experiment, then draw a seeded sample.
voters = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
voters = voters.dropna(subset=['name_first', 'name_last'])
keep_mask = ~voters['race'].isin(['multi_racial', 'native_indian', 'other', 'unknown'])
sdf = voters.loc[keep_mask].sample(n=SAMPLE, random_state=21)
del voters, keep_mask  # only the sample is needed from here on

# Additional features: title-case the last name
sdf['name_last'] = sdf['name_last'].str.title()

sdf

Unnamed: 0,name_last,name_first,race
841323,Torres,Jose,hispanc
1408926,Da Silva,Amanda,nh_white
1733118,Mc Ghee,Sandra,nh_white
13104513,Karam,MELINDA,nh_white
9156114,Brewer,LAIA,nh_black
...,...,...,...
3076722,Antunez Avila,Robert,hispanc
10023679,Davis,WYATT,nh_white
5846252,Scott,Jacquelyn,nh_white
5959131,Parton,Douglas,nh_white


##  Preprocessing the input data

In [4]:
# last name only (the column keeps its historical name, but holds just name_last)
sdf['name_last_name_first'] = sdf['name_last']

# build n-gram list: character n-grams of exactly NGRAMS characters, case-sensitive
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False)
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_  # n-gram -> column index in `a`

# Total occurrences of every n-gram, from a single column-sum over the sparse
# matrix. Slicing one sparse column per vocabulary entry inside a Python loop
# gives the same numbers but repeats the slicing work for every n-gram.
ngram_counts = np.asarray(a.sum(axis=0)).ravel()

# sort n-gram by freq (highest -> lowest); ties fall back to the n-gram text,
# also descending, because the (count, ngram) tuples are compared as a whole
words = sorted(((ngram_counts[col], ngram) for ngram, col in vocab.items()), reverse=True)
words_list = [ngram for _, ngram in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n, vocab=None):
    """Convert ``text`` into the vocabulary indices of its character n-grams.

    Parameters
    ----------
    text : str
        String to split into overlapping n-grams of length ``n``.
    n : int
        N-gram length. Values below 1 yield an empty list.
    vocab : sequence of str, optional
        Ranked n-gram vocabulary; an n-gram's index is its position in this
        sequence (first occurrence wins). Defaults to the notebook-level
        ``words_list``.

    Returns
    -------
    list of int
        One index per n-gram, in order of appearance. N-grams that are not
        in the vocabulary map to 0. A ``text`` shorter than ``n`` gives ``[]``.

    Notes
    -----
    Index 0 is also the position of the first vocabulary entry, so unknown
    n-grams cannot be told apart from it. That mapping is kept as-is so the
    encoded sequences stay identical to what this notebook produced before.

    The vocabulary is turned into a dict once and reused for later calls with
    the same list object. Replacing ``words_list`` with a new list refreshes
    the cache; editing the same list in place without changing its length
    does not.
    """
    if vocab is None:
        vocab = words_list
    if n < 1:
        return []

    # A dict lookup replaces list.index(), which rescanned the vocabulary for
    # every single n-gram, and the bare `except:` that went with it.
    cached = getattr(find_ngrams, '_lookup_cache', None)
    if cached is None or cached[0] is not vocab or cached[1] != len(vocab):
        lookup = {}
        for position, gram in enumerate(vocab):
            # setdefault keeps the first position, matching list.index()
            lookup.setdefault(gram, position)
        cached = (vocab, len(vocab), lookup)
        find_ngrams._lookup_cache = cached
    lookup = cached[2]

    return [lookup.get(text[start:start + n], 0)
            for start in range(len(text) - n + 1)]

# Encode every name as the list of vocabulary indices of its n-grams
X = np.array(
    sdf['name_last_name_first'].apply(lambda name: find_ngrams(name, NGRAMS))
)

# Sequence-length statistics (longest and truncated mean)
X_len = [len(seq) for seq in X]
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

# Target: integer category code of the race label
y = np.array(sdf['race'].astype('category').cat.codes)

# Stratified splits: first hold out 20% for testing, then take 40% of the
# remainder for calibration (overall 48% train / 32% calibration / 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)
X_train, X_calib, y_train, y_calib = train_test_split(
    X_train, y_train, test_size=.4, random_state=10, stratify=y_train
)

num_words = 1166
Max feature len = 26, Avg. feature len = 5


## Setting up the data

In [5]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

# Model input configuration
max_features = num_words  # vocabulary size (number of ranked n-grams)
feature_len = 20          # every sequence is padded or cut to this many n-gram indices
batch_size = 32

for caption, split in (('train sequences', X_train),
                       ('calibration sequences', X_calib),
                       ('test sequences', X_test)):
    print(len(split), caption)

print('Pad sequences (samples x time)')
X_train, X_calib, X_test = [
    sequence.pad_sequences(split, maxlen=feature_len)
    for split in (X_train, X_calib, X_test)
]
for caption, split in (('X_train shape:', X_train),
                       ('X_calib shape:', X_calib),
                       ('X_test shape:', X_test)):
    print(caption, split.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

# The labels deliberately stay as integer class codes: they are NOT converted
# to one-hot matrices (tf.keras.utils.to_categorical) because nonconformist
# needs a single value per sample.
for caption, labels in (('y_train shape:', y_train),
                        ('y_calib shape:', y_calib),
                        ('y_test shape:', y_test)):
    print(caption, labels.shape)

480000 train sequences
320000 calibration sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (480000, 20)
X_calib shape: (320000, 20)
X_test shape: (200000, 20)
4 classes
y_train shape: (480000,)
y_calib shape: (320000,)
y_test shape: (200000,)


In [6]:
from nonconformist.base import ClassifierAdapter
from nonconformist.nc import ClassifierNc

class MyClassifierAdapter(ClassifierAdapter):
    """nonconformist adapter that trains and serves a Keras LSTM classifier.

    ``fit`` builds the network itself from notebook-level settings
    (``num_words``, ``feature_len``, ``num_classes``, ``batch_size``,
    ``EPOCHS``), so the ``model`` constructor argument is not used for
    training or prediction and may be ``None``.
    """

    # Single place that defines where the trained network is persisted.
    MODEL_PATH = '../models/fl_voter_reg/lstm/fl_all_name_lstm_nonconform.h5'

    def __init__(self, model, fit_params=None):
        super(MyClassifierAdapter, self).__init__(model, fit_params)
        # Trained Keras network. Stored apart from the `model` constructor
        # argument so the adapter's constructor parameters stay untouched.
        self._keras_model = None

    def fit(self, x, y):
        '''
            Train the LSTM on the supplied data and persist it.

            x is a numpy.array of shape (n_train, n_features)
            y is a numpy.array of shape (n_train) holding integer class codes
              (the loss is sparse_categorical_crossentropy, so no one-hot
              encoding is expected)

            The trained network is saved to MODEL_PATH and also kept on the
            instance so predict() does not have to read it back from disk.
        '''
        import os

        # Create the output directory up front: discovering that it is
        # missing only at save time would throw away the whole training run.
        os.makedirs(os.path.dirname(self.MODEL_PATH), exist_ok=True)

        model = Sequential()
        model.add(Embedding(num_words, 32, input_length=feature_len))
        model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(num_classes, activation='softmax'))

        # try using different optimizers and different optimizer configs
        model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
        model.fit(x, y, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=1)

        model.save(self.MODEL_PATH)
        self._keras_model = model

    def predict(self, x):
        '''
            Return class probability estimates for x as a numpy.array of
            shape (n_test, n_classes), the format the default nonconformity
            functions expect.

            Uses the network trained by fit() when available. Otherwise the
            saved model is loaded from MODEL_PATH once and reused, instead of
            being re-read from disk on every call.
        '''
        keras_model = getattr(self, '_keras_model', None)
        if keras_model is None:
            keras_model = keras.models.load_model(self.MODEL_PATH)
            self._keras_model = keras_model
        return keras_model.predict(x)
    
# MyClassifierAdapter.fit() builds the Keras network itself, so there is no
# pre-built estimator to wrap and None is passed as the adapter's `model`.
my_classifier = None
model = MyClassifierAdapter(my_classifier)
# Nonconformity scorer built on top of the adapter; it is handed to
# IcpClassifier in a later cell.
nc = ClassifierNc(model)

In [7]:
# Cast the training inputs and labels to float32 before fitting.
# NOTE(review): only the training split is cast; X_calib / X_test keep the
# dtype produced by pad_sequences. The reason for the cast is not evident
# from this notebook - confirm it is still needed.
X_train, y_train = (
    np.asarray(arr).astype(np.float32) for arr in (X_train, y_train)
)

In [8]:
# Inductive conformal classifier driven by the nonconformity scorer `nc`
icp = IcpClassifier(nc)
# Train on the proper training split (this is the step that runs the Keras
# epochs shown in the output)
icp.fit(X_train, y_train)
# Calibrate on the split that was carved out of the training data earlier and
# is not passed to fit()
icp.calibrate(X_calib, y_calib)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [11]:
# No significance level is passed here.
# NOTE(review): per the nonconformist docs this makes predict() return one
# p-value per class instead of a boolean prediction set - confirm against the
# installed version.
results = icp.predict(X_test)

In [12]:
# Show only the first ten rows (one column per class) rather than the full array
results[:10]

array([[9.21821741e-02, 9.46809983e-02, 2.33383715e-01, 1.78622816e-01],
       [1.89687672e-02, 1.93639573e-02, 5.60322126e-01, 5.55208362e-02],
       [8.90947955e-02, 1.64423749e-01, 9.71109186e-02, 2.54510311e-01],
       [3.64824782e-02, 4.08745054e-02, 8.25648927e-02, 4.47478258e-01],
       [2.56222997e-02, 3.60127163e-02, 6.11281612e-02, 5.32809516e-01],
       [9.12607212e-02, 2.53478954e-01, 9.30939957e-02, 1.65131907e-01],
       [5.27466821e-02, 4.87196738e-02, 8.72436808e-02, 4.31135538e-01],
       [5.95494263e-03, 6.58230402e-03, 8.28853238e-03, 8.67547118e-01],
       [8.77181928e-02, 2.84729062e-01, 8.24941564e-02, 1.46698355e-01],
       [8.28211240e-04, 1.69237244e-03, 3.60688461e-04, 9.73239259e-01]])