In [6]:
import pandas as pd
import re
# Maximum number of CSV rows that preprocess_dataset reads (passed to pd.read_csv as nrows).
sample_size = 50_000

def preprocess_name(word):
    """
    Normalise a raw name into a lowercase, space-separated string of a-z letters.

    Steps, in order:
    - Missing values (None or float NaN) yield '' instead of the literal
      text 'none' / 'nan' that str() would otherwise produce.
    - The value is converted with str() and lowercased.
    - On each line, everything from the first character that is not a-z, '.'
      or whitespace is discarded. For example "o'brien pat" becomes "o" and
      "smith-jones mary" becomes "smith". Accented letters also count as
      disallowed characters.
    - Every run of whitespace (spaces, tabs, newlines) becomes a single space,
      so tokens separated by a tab or newline are not glued together.
    - Any remaining character other than a-z and space (in practice the
      periods kept by the truncation step) is removed.
    - Repeated spaces are collapsed and leading/trailing spaces are trimmed.

    Args:
        word: The raw name; any object accepted by str().

    Returns:
        str: The cleaned name, which may be the empty string.
    """
    # NaN is the only float that is not equal to itself; without this guard a
    # missing value would be turned into the bogus name "nan".
    if word is None or (isinstance(word, float) and word != word):
        return ''

    processed_name = str(word).lower()
    # Truncate at the first character outside a-z, '.', whitespace ('.*' stops at a newline).
    processed_name = re.sub(r'[^a-z.\s].*', '', processed_name)
    # Treat every kind of whitespace as a word separator before stripping symbols.
    processed_name = re.sub(r'\s+', ' ', processed_name)
    # Drop whatever is left that is not a letter or a space (the periods).
    processed_name = re.sub(r'[^a-z ]+', '', processed_name)
    # Removing a period can leave two adjacent spaces, so collapse once more.
    return re.sub(r' +', ' ', processed_name).strip()

def preprocess_dataset(filename, nrows=None, output_path='clean_data_before_same_length.csv'):
    """
    Build a cleaned name/race table from a CSV file and save it.

    Steps:
    - Read at most `nrows` rows of `filename`.
    - Discard rows whose race_code is 'U', ethnic_code is 'UN' or
      gender_code is 'U'.
    - Title-case last_name / first_name and derive `name` by running
      preprocess_name on "<last_name> <first_name>".
    - Treat a name that cleans to the empty string as missing: it carries no
      information and would be read back as NaN from the saved CSV anyway.
    - Print the row count, duplicate count and per-column null counts, then
      drop duplicate rows and rows containing any missing value.
    - Write the result to `output_path` without the index column.

    Args:
        filename: Path of the input CSV. It must provide the columns
            last_name, first_name, race_code, ethnic_code and gender_code.
        nrows: Maximum number of rows to read. When None (the default) the
            module-level `sample_size` is used.
        output_path: Destination of the cleaned CSV.

    Returns:
        pandas.DataFrame: Columns last_name, first_name, name and race. The
        index keeps the row labels of the rows that survived filtering.
    """
    if nrows is None:
        nrows = sample_size

    raw = pd.read_csv(filename, nrows=nrows)

    # Rows where any of the three codes is the "unspecified" marker.
    unspecified = (
        (raw['race_code'] == 'U')
        | (raw['ethnic_code'] == 'UN')
        | (raw['gender_code'] == 'U')
    )
    data = raw.loc[~unspecified]

    last_name = data['last_name'].str.title()
    first_name = data['first_name'].str.title()
    name = (last_name + ' ' + first_name).apply(preprocess_name)

    cleaned_data = pd.DataFrame({
        'last_name': last_name,
        'first_name': first_name,
        # Empty cleaned names become NaN so that dropna() below removes them.
        'name': name.mask(name == ''),
        'race': data['race_code'],
    })

    print(filename, '\n')
    print('Before:', len(cleaned_data))
    print('Duplicates:', cleaned_data.duplicated().sum())
    print('Null:', cleaned_data.isna().sum())

    cleaned_data = cleaned_data.drop_duplicates().dropna()

    print('After:', len(cleaned_data))
    cleaned_data.to_csv(output_path, index=False)

    return cleaned_data


In [7]:
# Clean the voter sample. The returned frame is shown as this cell's output,
# and the function also writes it to a CSV file as a side effect.
preprocess_dataset(filename='nc_voter_name_gender_race_ethnic_age.csv')

nc_voter_name_gender_race_ethnic_age.csv 

Before: 35297
Duplicates: 2324
Null: last_name     0
first_name    0
name          0
race          0
dtype: int64
After: 32973


Unnamed: 0,last_name,first_name,name,race
0,Aabel,Ruth,aabel ruth,W
4,Aaron,Claudia,aaron claudia,W
6,Aaron,Kimberly,aaron kimberly,B
9,Aaron,Sandra,aaron sandra,W
10,Aaron,Willie,aaron willie,W
...,...,...,...,...
49992,Helms,Steve,helms steve,W
49993,Helms,Tyson,helms tyson,W
49994,Helms,Vonda,helms vonda,W
49997,Helsley,Kyla,helsley kyla,W


In [8]:
# Reload the cleaned sample from the CSV file written by preprocess_dataset.
df = pd.read_csv('clean_data_before_same_length.csv')

# Number of non-null names per race code, before any balancing.
df.groupby('race')[['name']].count()

Unnamed: 0_level_0,name
race,Unnamed: 1_level_1
A,213
B,6896
I,78
M,214
O,1600
W,23972


In [9]:
# Rows per race code, ordered from the largest class to the smallest.
race_counts = df['race'].value_counts()
race_counts = race_counts.sort_values(ascending=False)

# Position 1 holds the size of the second-largest class; it is the per-class
# target used when the classes are resampled.
second_max_size = race_counts.iloc[1]
print(second_max_size)


6896


In [10]:
def _resample_group(group, target_size, seed=1):
    """
    Return `group` resized to exactly `target_size` rows.

    - Larger groups are down-sampled without replacement.
    - Smaller groups keep every original row and are topped up with rows
      drawn with replacement. Drawing the whole group with replacement
      instead would leave some original rows out purely by chance.
    - Groups that already have the target size are returned unchanged.

    Args:
        group: DataFrame holding the rows of one class.
        target_size: Number of rows the result must have.
        seed: random_state passed to DataFrame.sample for reproducibility.

    Returns:
        pandas.DataFrame: `target_size` rows taken from `group`.
    """
    n_rows = len(group)
    if n_rows > target_size:
        return group.sample(target_size, random_state=seed)
    if n_rows < target_size:
        extra_rows = group.sample(target_size - n_rows, replace=True, random_state=seed)
        return pd.concat([group, extra_rows])
    return group


# Bring every race class to second_max_size rows and stack the results.
# NOTE(review): the up-sampled classes contain repeated rows, so any later
# train/test split of this frame can place copies of one row on both sides.
balanced_df = pd.concat(
    [_resample_group(group, second_max_size) for _, group in df.groupby('race')]
)

In [11]:
# Number of non-null names per race code after resampling.
balanced_df.groupby('race')[['name']].count()


Unnamed: 0_level_0,name
race,Unnamed: 1_level_1
A,6896
B,6896
I,6896
M,6896
O,6896
W,6896


In [12]:
# Save the balanced table; index=False keeps the row labels out of the file.
balanced_df.to_csv(path_or_buf='clean_data.csv', index=False)

In [13]:
# Display (rows, columns) of the balanced table as a quick size check.
balanced_df.shape

(41376, 4)