In [15]:
# data wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib

# other
import sklearn
import fairlearn

In [16]:
# import functionality
import matplotlib.pyplot as plt
from fairlearn.metrics import MetricFrame, make_derived_metric
from sklearn.metrics import precision_score
from fairlearn.metrics import selection_rate, false_positive_rate, false_negative_rate
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

In [17]:
# description
description = pd.read_csv('data/WiDS_Datathon_2020_Dictionary.csv')
description_dict = description.set_index('Variable Name').to_dict(orient='index')
# data
df = pd.read_csv('data/training_v2.csv')

df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


Looking at demographics

In [18]:
display(df['gender'].value_counts())
print()
display(df['ethnicity'].value_counts())

M    49469
F    42219
Name: gender, dtype: int64




Caucasian           70684
African American     9547
Other/Unknown        4374
Hispanic             3796
Asian                1129
Native American       788
Name: ethnicity, dtype: int64

In [19]:
df['gender'].unique()

array(['M', 'F', nan], dtype=object)

In [36]:
df['gender'].value_counts()

M    49469
F    42219
Name: gender, dtype: int64

In [34]:
df['gender'].loc[df['gender'] != 'M'].loc[df['gender'] != 'F'].count()

0

In [20]:
df['ethnicity'].unique()

array(['Caucasian', nan, 'Hispanic', 'African American', 'Asian',
       'Native American', 'Other/Unknown'], dtype=object)

In [22]:
df['hospital_death'].unique()

array([0, 1], dtype=int64)

Demographic parity

In [21]:
# by choosing y_pred to be ground truth instead of predictions, we can easily compute the base rate in the data
mf = MetricFrame(metrics={'base rate' : selection_rate}, 
                 y_true=df['hospital_death'], 
                 y_pred=df['hospital_death'], 
                 sensitive_features=df['gender'])
display(mf.by_group)

# # summarize demographic parity as the max difference between groups
# print("base rate diff: %.2f" % mf.difference(method='between_groups'))

TypeError: argument must be a string or number

In [13]:
df['hospital_death'].dtypes

dtype('int64')

see difference in death reate male female

In [61]:
num_F = df['gender'].value_counts()['F']
# print(num_F)
num_M = df['gender'].value_counts()['M']
# num_tot = df.count()

num_F_dead = df.loc[(df['hospital_death'] == 1) & (df['gender'] == 'F')].count()['patient_id']
# print(num_F_dead)
num_M_dead = df.loc[(df['hospital_death'] == 1) & (df['gender'] == 'M')].count()['patient_id']

selection_rate_F = num_F_dead / num_F
print("female death rate")
print(selection_rate_F)

selection_rate_M = num_M_dead / num_M
print("male death rate")
print(selection_rate_M)

female death rate
0.0883725336933608
male death rate
0.0844165032646708


See difference in death rate per ethnicity

In [68]:
ethniciies = df['ethnicity'].unique()

ethniciies = np.delete(ethniciies, 1)

for ethnicity in ethniciies :
    print(ethnicity)
    num_ethnicity = df['ethnicity'].value_counts()[ethnicity]
    num_ethnicity_dead = df.loc[(df['hospital_death'] == 1) & (df['ethnicity'] == ethnicity)].count()['patient_id']
    selection_rate_ethnicity = num_ethnicity_dead / num_ethnicity
    print(selection_rate_ethnicity)

Caucasian
0.08726161507554751
Hispanic
0.09905163329820864
African American
0.07855870954226458
Asian
0.08237378210806023
Native American
0.08883248730964467
Other/Unknown
0.08070416095107454
