In [1]:
# data wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib

# other
import sklearn
import fairlearn
import math

In [2]:
# import functionality
import matplotlib.pyplot as plt
from fairlearn.metrics import MetricFrame, make_derived_metric
from sklearn.metrics import precision_score
from fairlearn.metrics import selection_rate, false_positive_rate, false_negative_rate
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference

## Load data 
I used the raw data set. It still has a lot of NaN values; I only removed the rows where ethnicity is NaN.

In [3]:
# description
# Variable dictionary CSV, re-keyed by its 'Variable Name' column so that
# description_dict[<variable name>] is a dict of that row's remaining fields.
description = pd.read_csv('data/WiDS_Datathon_2020_Dictionary.csv')
description_dict = description.set_index('Variable Name').to_dict(orient='index')
# data
# Raw training table; both paths are relative to the notebook's working directory.
df = pd.read_csv('data/training_v2.csv')

df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [4]:
# Keep every row that has an ethnicity value; NaNs in all other columns are left in place.
# The index is not reset, so new_df keeps df's original row labels.
new_df = df.dropna(subset=['ethnicity'])

### Check base rates

In [5]:
# Base rate per ethnicity, i.e. the observed share of hospital_death == 1 in each group.
# by choosing y_pred to be ground truth instead of predictions, we can easily compute
# the base rate in the data
mf = MetricFrame(metrics={'base rate': selection_rate},
                 y_true=new_df['hospital_death'],
                 y_pred=new_df['hospital_death'],
                 sensitive_features=new_df['ethnicity'])
display(mf.by_group)

# summarize demographic parity as the max difference between groups
# With a dict of metrics, difference() returns a Series keyed by metric name, so the
# scalar is selected explicitly instead of relying on implicit float() conversion of
# a one-element Series inside the %-format.
print("base rate diff: %.2f" % mf.difference(method='between_groups')['base rate'])

Unnamed: 0_level_0,base rate
ethnicity,Unnamed: 1_level_1
African American,0.0785587
Asian,0.0823738
Caucasian,0.0872616
Hispanic,0.0990516
Native American,0.0888325
Other/Unknown,0.0807042


base rate diff: 0.02


In [6]:
# Unweighted mean of the per-ethnicity base rates (each group counts once, regardless of its size).
mean_selection = mf.by_group.mean()

In [7]:
# Overall base rate: share of rows in the filtered frame with hospital_death == 1.
num_tot = new_df.shape[0]
num_dead = int(new_df['hospital_death'].eq(1).sum())
selection_rate_tot = num_dead / num_tot
print(selection_rate_tot)

0.0864722425208707


In [8]:
# Number of rows per ethnicity in the filtered frame (value_counts sorts by frequency by default).
ethcounts = new_df['ethnicity'].value_counts() #ethcounts is used later
display(ethcounts)
# Spot check: look up a single group's count by its label.
ethcounts["Caucasian"]

Caucasian           70684
African American     9547
Other/Unknown        4374
Hispanic             3796
Asian                1129
Native American       788
Name: ethnicity, dtype: int64

70684

In [9]:
# Largest and smallest per-group base rate, shown one after the other.
display(mf.group_max(), mf.group_min())

base rate    0.0990516
dtype: object

base rate    0.0785587
dtype: object

In [10]:
# Row count of the filtered frame; later cells use it as the denominator for probabilities.
num_rows = len(new_df) #used later
num_rows

90318

### Compute the probability of dying or not in general: P(Y=1) or P(Y=0)

In [11]:
# Marginal outcome probabilities over the filtered frame:
#   PY1 = P(hospital_death = 1), PY0 = P(hospital_death = 0)
# The masks are built from new_df itself (not the unfiltered df), so the boolean indexer
# carries exactly the same index as the frame it selects from. A mask taken from df only
# works for as long as df's index remains a superset of new_df's.
PY1 = len(new_df.loc[new_df['hospital_death'] == 1]) / num_rows
print(PY1)
PY0 = len(new_df.loc[new_df['hospital_death'] == 0]) / num_rows
print(PY0)
# sanity check: should print 1.0 when hospital_death only takes the values 0 and 1
print(PY1 + PY0)

0.0864722425208707
0.9135277574791293
1.0


### compute observed probabilities

In [12]:
# Observed joint probabilities P(ethnicity = e, Y = y): the number of rows in each
# (ethnicity, outcome) cell divided by the total number of rows.
ethniciies = new_df['ethnicity'].unique()  # group labels, in order of first appearance

num_tot = num_rows
prob_obv_y1 = []
prob_obv_y0 = []

for ethnicity in ethniciies:
    in_group = new_df['ethnicity'] == ethnicity
    num_eth_dead = len(new_df.loc[in_group & (new_df['hospital_death'] == 1)])
    num_eth_alive = len(new_df.loc[in_group & (new_df['hospital_death'] == 0)])
    prob_obv_y1.append(num_eth_dead / num_tot)
    prob_obv_y0.append(num_eth_alive / num_tot)

probs = {
    "observed y=1": prob_obv_y1,
    "observed y=0": prob_obv_y0,
}

# one row per ethnicity, one column per outcome
df_probs = pd.DataFrame(probs, index=ethniciies)
display(df_probs)
df_probs.loc['Caucasian', "observed y=1"]

Unnamed: 0,observed y=1,observed y=0
Caucasian,0.068292,0.714321
Hispanic,0.004163,0.037866
African American,0.008304,0.0974
Asian,0.00103,0.011471
Native American,0.000775,0.00795
Other/Unknown,0.003908,0.04452


0.06829203481033681

### compute weights

In [13]:
# One weight per (ethnicity, class) cell:
#     weight = P_exp / P_obs
# P_exp = P(Y = class) * P(ethnicity) is the joint probability the cell would have if
# outcome and ethnicity were independent; P_obs is the joint probability observed in
# the data (df_probs). A weight above 1 means the cell has fewer rows than independence
# would give, a weight below 1 means it has more.

# The (ethnicity, class) grid is derived from the groups actually present in the data,
# so it cannot drift out of sync with `ethniciies`. The column starts out as float
# because the values written below are floats.
Information = {'ethnicity': [eth for eth in ethniciies for _ in range(2)],
               'class': [0, 1] * len(ethniciies),
               'weights': [0.0] * (2 * len(ethniciies))}

weights_df = pd.DataFrame(Information).set_index(["ethnicity", "class"])

# per-class lookups: marginal outcome probability and the matching df_probs column
class_prior = {0: PY0, 1: PY1}
class_column = {0: "observed y=0", 1: "observed y=1"}

for ethnicity in ethniciies:
    for i in range(2):
        P_exp = class_prior[i] * (ethcounts[ethnicity] / num_rows)
        P_obs = df_probs.loc[ethnicity, class_column[i]]
        weight = P_exp / P_obs
        # address the single (row, column) cell explicitly instead of assigning to the whole row
        weights_df.loc[(ethnicity, i), 'weights'] = weight

display(weights_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,weights
ethnicity,class,Unnamed: 2_level_1
Caucasian,0,1.000865
Caucasian,1,0.990954
Hispanic,0,1.013962
Hispanic,1,0.873002
African American,0,0.991412
African American,1,1.100734
Asian,0,0.995534
Asian,1,1.049754
Native American,0,1.00259
Native American,1,0.97343


### Compute number to sample

In [14]:
# Target size of every (ethnicity, class) cell after resampling:
#     number to be sampled = round(weight * current number of rows in the cell)
# The grid is derived from `ethniciies` so it always matches the groups in the data.
Information_sampledf = {'ethnicity': [eth for eth in ethniciies for _ in range(2)],
                        'class': [0, 1] * len(ethniciies),
                        'number to be sampled': [0] * (2 * len(ethniciies))}

sample_numbers_df = pd.DataFrame(Information_sampledf).set_index(["ethnicity", "class"])

for ethnicity in ethniciies:
    for i in range(2):
        # rows currently in this (ethnicity, class) cell
        cell_size = len(new_df.loc[(new_df['ethnicity'] == ethnicity) & (new_df['hospital_death'] == i)])
        # int(...) keeps the column integer-typed whatever type round() hands back
        sample_num = int(round(weights_df.loc[(ethnicity, i), 'weights'] * cell_size))
        sample_numbers_df.loc[(ethnicity, i), 'number to be sampled'] = sample_num

display(sample_numbers_df)
sample_numbers_df.loc[('Caucasian', 0), 'number to be sampled']

Unnamed: 0_level_0,Unnamed: 1_level_0,number to be sampled
ethnicity,class,Unnamed: 2_level_1
Caucasian,0,64572
Caucasian,1,6112
Hispanic,0,3468
Hispanic,1,328
African American,0,8721
African American,1,826
Asian,0,1031
Asian,1,98
Native American,0,720
Native American,1,68


64572

### Now the actual sampling

In [15]:
# Resample every (ethnicity, class) cell of new_df to its target size from sample_numbers_df.
# Cells that have to grow are drawn with replacement (oversampling); cells that shrink or
# keep their size are drawn without replacement (undersampling).
# The pieces are collected in a list and concatenated once, rather than re-copying a
# growing frame with pd.concat on every iteration.
sampled_parts = []

for ethnicity in ethniciies:
    for i in range(2):
        cell_df = new_df.loc[(new_df['ethnicity'] == ethnicity) & (new_df['hospital_death'] == i)]
        # DataFrame.sample expects an integer n
        sample_num = int(sample_numbers_df.loc[(ethnicity, i), 'number to be sampled'])
        # the fixed random_state makes the draw repeatable from run to run
        temp_df = cell_df.sample(n=sample_num, replace=sample_num > len(cell_df), random_state=1)
        sampled_parts.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no groups
final_df = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

final_df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
89248,35599,39339,35,0,34.0,20.173155,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
47499,24362,82815,21,0,66.0,22.855304,0,Caucasian,F,167.6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
18868,126611,64860,161,0,40.0,27.871313,0,Caucasian,F,157.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
6599,21292,30734,118,0,51.0,16.304874,0,Caucasian,F,167.6,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,Neurological,Neurologic
46834,115657,25821,99,0,52.0,24.30823,0,Caucasian,M,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory


### Check base rates again

In [16]:
# Base rate per ethnicity in the resampled frame.
# by choosing y_pred to be ground truth instead of predictions, we can easily compute
# the base rate in the data
mf = MetricFrame(metrics={'base rate': selection_rate},
                 y_true=final_df['hospital_death'],
                 y_pred=final_df['hospital_death'],
                 sensitive_features=final_df['ethnicity'])
display(mf.by_group)

# summarize demographic parity as the max difference between groups
# With a dict of metrics, difference() returns a Series keyed by metric name, so the
# scalar is selected explicitly instead of relying on implicit float() conversion of
# a one-element Series inside the %-format.
print("base rate diff: %.2f" % mf.difference(method='between_groups')['base rate'])

Unnamed: 0_level_0,base rate
ethnicity,Unnamed: 1_level_1
African American,0.0865193
Asian,0.0868025
Caucasian,0.0864694
Hispanic,0.0864067
Native American,0.0862944
Other/Unknown,0.0864198


base rate diff: 0.00


In [17]:
# Row counts, one per line: raw frame, frame without missing ethnicity, resampled frame.
print(len(df), len(new_df), len(final_df), sep='\n')

91713
90318
90318
