#### This notebook is the shared data-cleaning step for the project, so that everyone works from the same data. It imports the original facial recognition label files (training and validation) and cleans them for use in a classification model.

#### Author: Aidan Kleinman
#### Date: Feb. 7th, 2026

In [1]:
#Code Block 1
## import libraries

import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Code Block 2
## load the data

# Read the training and validation label files (paths are relative to the
# notebook's working directory) into two separate DataFrames.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Data/fairface_label_train.csv',
        '../../Data/fairface_label_val.csv',
    )
)

In [3]:
#Code Block 3
## view the training data

# Rich preview of the first five rows, followed by the printed summary of
# columns, non-null counts and dtypes.
display(train_data.head(n=5))
train_data.info()

Unnamed: 0,file,age,gender,race,service_test
0,train/1.jpg,50-59,Male,East Asian,True
1,train/2.jpg,30-39,Female,Indian,False
2,train/3.jpg,3-9,Female,Black,False
3,train/4.jpg,20-29,Female,Indian,True
4,train/5.jpg,20-29,Female,Indian,True


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86744 entries, 0 to 86743
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   file          86744 non-null  object
 1   age           86744 non-null  object
 2   gender        86744 non-null  object
 3   race          86744 non-null  object
 4   service_test  86744 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 2.7+ MB


In [4]:
#Code Block 4
## view the validation data

# Rich preview of the first five rows, followed by the printed summary of
# columns, non-null counts and dtypes.
display(val_data.head(n=5))
val_data.info()

Unnamed: 0,file,age,gender,race,service_test
0,val/1.jpg,3-9,Male,East Asian,False
1,val/2.jpg,50-59,Female,East Asian,True
2,val/3.jpg,30-39,Male,White,True
3,val/4.jpg,20-29,Female,Latino_Hispanic,True
4,val/5.jpg,20-29,Male,Southeast Asian,False


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10954 entries, 0 to 10953
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   file          10954 non-null  object
 1   age           10954 non-null  object
 2   gender        10954 non-null  object
 3   race          10954 non-null  object
 4   service_test  10954 non-null  bool  
dtypes: bool(1), object(4)
memory usage: 353.1+ KB


In [5]:
#Code Block 5
## remove the service_test column

# Reassign instead of dropping in place, and ignore an already-missing column,
# so that re-running this cell does not raise a KeyError.
train_data = train_data.drop(columns=['service_test'], errors='ignore')
val_data = val_data.drop(columns=['service_test'], errors='ignore')

# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
train_data.info()
print()  # blank line between the two summaries
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86744 entries, 0 to 86743
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    86744 non-null  object
 1   age     86744 non-null  object
 2   gender  86744 non-null  object
 3   race    86744 non-null  object
dtypes: object(4)
memory usage: 2.6+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10954 entries, 0 to 10953
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   file    10954 non-null  object
 1   age     10954 non-null  object
 2   gender  10954 non-null  object
 3   race    10954 non-null  object
dtypes: object(4)
memory usage: 342.4+ KB


None

In [6]:
#Code Block 6
## check for missing values

# Per-column count of missing entries for each split; the heading and the
# counts are emitted on separate lines.
print("Missing values in training data:", train_data.isna().sum(), sep="\n")

print("\nMissing values in validation data:", val_data.isna().sum(), sep="\n")

Missing values in training data:
file      0
age       0
gender    0
race      0
dtype: int64

Missing values in validation data:
file      0
age       0
gender    0
race      0
dtype: int64


In [7]:
#Code Block 7
## check for duplicates

# Rows that are identical in every column (same file and same labels).
print("Duplicate rows in training data:", train_data.duplicated().sum())
print("Duplicate rows in validation data:", val_data.duplicated().sum())

# The row-level check above cannot flag the same image listed more than once
# with different labels, so also count repeated values in the `file` column.
print("Duplicate file paths in training data:", train_data.duplicated(subset=['file']).sum())
print("Duplicate file paths in validation data:", val_data.duplicated(subset=['file']).sum())

Duplicate rows in training data: 0
Duplicate rows in validation data: 0


In [9]:
#Code Block 8
## standardize the gender column (race follows in the next cell)

# Before: peek at the raw labels in both splits.
display(train_data["gender"].head(), val_data["gender"].head())

# Lower-case the labels, then trim surrounding whitespace.
train_data["gender"] = (
    train_data["gender"]
    .str.lower()
    .str.strip()
)
val_data["gender"] = (
    val_data["gender"]
    .str.lower()
    .str.strip()
)

# After: the same preview, now showing the standardized labels.
display(train_data["gender"].head())
val_data["gender"].head()

0      Male
1    Female
2    Female
3    Female
4    Female
Name: gender, dtype: object

0      Male
1    Female
2      Male
3    Female
4      Male
Name: gender, dtype: object

0      male
1    female
2    female
3    female
4    female
Name: gender, dtype: object

0      male
1    female
2      male
3    female
4      male
Name: gender, dtype: object

In [10]:
#Code Block 9
## standardize the race column

# Lower-case the labels, then trim surrounding whitespace.
train_data["race"] = (
    train_data["race"]
    .str.lower()
    .str.strip()
)
val_data["race"] = (
    val_data["race"]
    .str.lower()
    .str.strip()
)

# Preview the standardized labels for both splits.
display(train_data["race"].head())
val_data["race"].head()

0    east asian
1        indian
2         black
3        indian
4        indian
Name: race, dtype: object

0         east asian
1         east asian
2              white
3    latino_hispanic
4    southeast asian
Name: race, dtype: object

In [11]:
#Code Block 10
## check raw and normalized class imbalance for gender

# Absolute label counts: training split first, then validation.
print("Raw class imbalance for gender:\n")
display(
    train_data["gender"].value_counts(),
    val_data["gender"].value_counts(),
)

# The same counts expressed as proportions of each split.
print("\nNormalized class imbalance for gender:\n")
display(
    train_data["gender"].value_counts(normalize=True),
    val_data["gender"].value_counts(normalize=True),
)

Raw class imbalance for gender:



gender
male      45986
female    40758
Name: count, dtype: int64

gender
male      5792
female    5162
Name: count, dtype: int64


Normalized class imbalance for gender:



gender
male      0.530135
female    0.469865
Name: proportion, dtype: float64

gender
male      0.528757
female    0.471243
Name: proportion, dtype: float64

In [12]:
#Code Block 11
## check raw and normalized class imbalance for race

# Absolute label counts: training split first, then validation.
print("Raw class imbalance for race:\n")
display(
    train_data["race"].value_counts(),
    val_data["race"].value_counts(),
)

# The same counts expressed as proportions of each split.
print("\nNormalized class imbalance for race:\n")
display(
    train_data["race"].value_counts(normalize=True),
    val_data["race"].value_counts(normalize=True),
)

Raw class imbalance for race:



race
white              16527
latino_hispanic    13367
indian             12319
east asian         12287
black              12233
southeast asian    10795
middle eastern      9216
Name: count, dtype: int64

race
white              2085
latino_hispanic    1623
black              1556
east asian         1550
indian             1516
southeast asian    1415
middle eastern     1209
Name: count, dtype: int64


Normalized class imbalance for race:



race
white              0.190526
latino_hispanic    0.154097
indian             0.142016
east asian         0.141647
black              0.141024
southeast asian    0.124447
middle eastern     0.106244
Name: proportion, dtype: float64

race
white              0.190341
latino_hispanic    0.148165
black              0.142049
east asian         0.141501
indian             0.138397
southeast asian    0.129177
middle eastern     0.110371
Name: proportion, dtype: float64