### Conformal Prediction Set

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
%%time
NGRAMS = 2
SAMPLE = 2000000

# Florida voter
df = pd.read_csv('./data/fl_reg_name_race_2022.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)
df['race'] = df.race.map({'native_indian': 'other', 'asian': 'asian', 'nh_black': 'nh_black', 'hispanic': 'hispanic', 'nh_white': 'nh_white', 'other': 'other', 'multi_racial': 'other', 'unknown': 'unknown'})
df

CPU times: user 14.4 s, sys: 3.43 s, total: 17.8 s
Wall time: 18.1 s


Unnamed: 0,name_last,name_first,race
0,Hessler-Smith,Jason,nh_white
1,Rogers,Renee,nh_white
2,Bartolome,Crystal,nh_white
3,Bailey,Donna,nh_white
4,Carlson,Greggory,nh_white
...,...,...,...
15455105,Ballew,Christina,nh_white
15455106,Watts,Mark,nh_white
15455107,McRae,Evelyn,nh_white
15455108,Ward,Stephanie,nh_white


In [3]:
sdf = df[df.race.isin(['unknown']) == False].groupby(['race']).sample(int(SAMPLE/5), random_state=21)
del df

# Additional features
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()
sdf

Unnamed: 0,name_last,name_first,race
8442799,Nguyen,Chat Thi,asian
14389117,Olinger,Yan,asian
11881266,Klinshaw,Catherine,asian
13434626,Webb,Vilma,asian
12819255,Pilapil,Michelle,asian
...,...,...,...
152411,Moorer,Patrick,other
6311028,Toll,David,other
11681567,Forrest,Jacobie,other
5030078,Virupaksha,Bharathi,other


In [4]:
rdf = sdf.groupby('race').agg({'name_last': 'count'})
rdf.to_csv('fl_name_race_2022.csv', columns=[])
rdf

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,200000
hispanic,200000
nh_black,200000
nh_white,200000
other,200000


In [5]:
sdf.groupby('race').agg({'name_last': 'nunique'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,50592
hispanic,55412
nh_black,29993
nh_white,66038
other,68407


In [6]:
### Preprocessing Inputs
# concat last name and first name
sdf['name'] = sdf['name_last'] + ' ' + sdf['name_first']
sdf['race_id'] = sdf.race.factorize()[0]
X = sdf.name
y = sdf.race_id

In [7]:
race_id_df = sdf[['race', 'race_id']].drop_duplicates().sort_values('race_id')
race_to_id = dict(race_id_df.values)
id_to_race = dict(race_id_df[['race_id', 'race']].values)

In [8]:
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=5, ngram_range=(1, NGRAMS), lowercase=False) 

features = vect.fit_transform(sdf.name).toarray()
labels = sdf.race_id
features.shape

(1000000, 1377)

In [9]:
# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [10]:
X_combined_transformed = vect.fit_transform(X)

# Transform the training data
X_train = X_combined_transformed[:X_train.shape[0]]

# Transform the test data
X_test = X_combined_transformed[X_train.shape[0]:]

In [11]:
%%time
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
import numpy as np
from sklearn.svm import SVC
from nonconformist.cp import IcpClassifier
from nonconformist.nc import NcFactory

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=10, criterion = 'entropy', random_state=21)
nc = NcFactory.create_nc(clf)# Create a default nonconformity function
icp = IcpClassifier(nc)

# Fit the conformal predictor using the training data
icp.fit(X_train, y_train)

CPU times: user 59min 39s, sys: 4.62 s, total: 59min 43s
Wall time: 59min 44s


In [12]:
icp.calibrate(X_test, y_test)

In [13]:
# Generate conformal predictions for the test data
predictions = icp.predict(X_test, significance=0.1)



In [14]:
predictions.sum(axis = 1).mean()

4.499035

In [12]:
import torch
import numpy as np
import pandas

In [11]:
lstm_mod = torch.load('/home/jupyter/notebooks/ethnicolr/v2/models/lstm_FullName_pytorch_81_acc_2layers.pt',
          map_location=torch.device('cpu'))