In [2]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [13]:
# Input file locations, relative to the notebook's working directory.
IDS_MAPPING_FN = "./data/IDS_mapping.csv"  # id -> description lookup table, loaded into `mapping`
DIABETIC_FN = "./data/diabetic_data.csv"  # main dataset, loaded into `df`
# Presumably the ICD-9 category code for diabetes (TODO confirm); the EDA below
# matches diag_1/diag_2 against its string form, '250'.
DIABETES_ICD = 250

In [14]:
# read files
# header=None keeps every row of the mapping file as data, so its columns are
# labelled 0 and 1 and the feature-name rows remain available for parsing.
mapping = pd.read_csv(IDS_MAPPING_FN, header=None)
# Main dataset; the first line of the file supplies the column names (read_csv default).
df = pd.read_csv(DIABETIC_FN)
print(df.shape)  # (rows, columns) sanity check

(101766, 50)


## EDA

In [8]:
def _parse_id_mappings(mapping_df, feature_names):
  """Split the stacked id-mapping table into one ``{id: description}`` dict per feature.

  Column 0 of ``mapping_df`` is walked top to bottom. A non-numeric entry is
  treated as a section header naming the feature that the following rows
  belong to; a numeric entry is an id of the current section, and the value
  in column 1 of the same row is its description. Rows whose key is missing
  (NaN) are skipped.

  Args:
    mapping_df: DataFrame read with ``header=None`` (columns labelled 0 and 1).
    feature_names: Section headers to collect; rows of any other section are ignored.

  Returns:
    dict mapping each name in ``feature_names`` to its ``{int id: description}``
    dict. Descriptions are stored exactly as read, so a missing one stays NaN.
  """
  lookups = {name: {} for name in feature_names}
  current_feature = None  # id rows seen before the first header are ignored
  for key, description in zip(mapping_df[0].values, mapping_df[1].values):
    if pd.isna(key):
      continue  # empty key: nothing to record
    key_text = str(key).strip()
    try:
      code = int(key_text)
    except ValueError:
      # Not an integer id, so this row starts a new section.
      current_feature = key_text
      continue
    if current_feature in lookups:
      lookups[current_feature][code] = description
  return lookups


ID_FEATURE_NAMES = ('admission_type_id', 'discharge_disposition_id', 'admission_source_id')
_id_lookups = _parse_id_mappings(mapping, ID_FEATURE_NAMES)

# One lookup per *_id feature, keyed by integer id.
admission_type_dict = _id_lookups['admission_type_id']
discharge_disposition_dict = _id_lookups['discharge_disposition_id']
admission_source_dict = _id_lookups['admission_source_id']

In [25]:
# Display the parsed admission-type lookup (id -> description) to check the parse.
admission_type_dict

{1: 'Emergency',
 2: 'Urgent',
 3: 'Elective',
 4: 'Newborn',
 5: 'Not Available',
 6: nan,
 7: 'Trauma Center',
 8: 'Not Mapped'}

In [22]:
# Add a description column next to each *_id column by translating the ids
# through the matching lookup; ids absent from a lookup become NaN.
for _id_col, _label_col, _lookup in (
  ('admission_type_id', 'admission_type', admission_type_dict),
  ('discharge_disposition_id', 'discharge_disposition', discharge_disposition_dict),
  ('admission_source_id', 'admission_source', admission_source_dict),
):
  _labels = df[_id_col].map(_lookup)
  df[_label_col] = _labels.astype("O")  # force object dtype

In [23]:
# Frequency table (NaN included) for every object-dtype column.
diabetes_code = str(DIABETES_ICD)  # diag columns are matched against the string form, '250'
for column in df.select_dtypes(include = ['O']).columns:
  if column in ("diag_1", "diag_2"):
    # Shape of the rows whose diagnosis equals the diabetes code exactly.
    # NOTE(review): values with a decimal suffix (e.g. '250.01'), if any exist,
    # are not matched by this comparison — confirm that is intended.
    print(df[df[column] == diabetes_code].shape) # diabetic as diagnosis 1 or 2
  print('===========================')
  print(f'Column name: {column}')
  print('===========================')
  print(df[column].value_counts(dropna=False))
  print()

Column name: race
race
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

Column name: gender
gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

Column name: age
age
[70-80)     26068
[60-70)     22483
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: count, dtype: int64

Column name: weight
weight
?            98569
[75-100)      1336
[50-75)        897
[100-125)      625
[125-150)      145
[25-50)         97
[0-25)          48
[150-175)       35
[175-200)       11
>200             3
Name: count, dtype: int64

Column name: payer_code
payer_code
?     40256
MC    32439
HM     6274
SP     5007
BC     4655
MD     3532
CP     2533
UN     2448
CM     1937
OG     1033
PO      592
DM      549
CH      146
WC      

In [32]:
# Share of rows (one row per encounter_id) in each readmission class.
df["readmitted"].value_counts(normalize=True)

readmitted
NO     0.539119
>30    0.349282
<30    0.111599
Name: proportion, dtype: float64

In [33]:
# Readmission class shares with each patient counted once: take the first
# non-null `readmitted` value per patient_nbr, in row order. Selecting the
# column before aggregating avoids computing `first` for every other column.
df.groupby("patient_nbr")["readmitted"].first().value_counts(normalize=True)

readmitted
NO     0.601038
>30    0.310971
<30    0.087992
Name: proportion, dtype: float64

In [20]:
# Frequency table (NaN included) for every int64 column.
# No diag_1/diag_2 special case here: those columns are covered by the
# object-dtype summary, and comparing an integer column with the string '250'
# could never match.
for column in df.select_dtypes(include = ['int64']).columns:
  print('===========================')
  print(f'Column name: {column}')
  print('===========================')
  print(df[column].value_counts(dropna=False))
  print()

Column name: encounter_id
encounter_id
2278392      1
190792044    1
190790070    1
190789722    1
190786806    1
            ..
106665324    1
106657776    1
106644876    1
106644474    1
443867222    1
Name: count, Length: 101766, dtype: int64

Column name: patient_nbr
patient_nbr
88785891     40
43140906     28
1660293      23
88227540     23
23199021     23
             ..
11005362      1
98252496      1
1019673       1
13396320      1
175429310     1
Name: count, Length: 71518, dtype: int64

Column name: admission_type_id
admission_type_id
1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: count, dtype: int64

Column name: discharge_disposition_id
discharge_disposition_id
1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10     

In [24]:
# NOTE(review): bare list literal — it is only displayed, never bound to a name.
# Presumably these are identifier columns to leave out of modelling; TODO confirm
# and assign it to a variable if later cells need it.
["encounter_id", "patient_nbr"]

## Predictor Variables

Keep only emergency, urgent and elective admissions, i.e. admission_type_id 1, 2 and 3; filter out every other admission type.


{1: 'Emergency',
 2: 'Urgent',
 3: 'Elective',
 4: 'Newborn',
 5: 'Not Available',
 6: nan,
 7: 'Trauma Center',
 8: 'Not Mapped'}

In [38]:
# Candidate model inputs, one per line so additions and removals diff cleanly.
# NOTE(review): "insurance" does not appear among the columns in the (truncated)
# EDA output shown in this notebook — confirm it is created elsewhere.
predictor_columns = [
  "time_in_hospital",
  "admission_type",
  "num_lab_procedures",
  "num_procedures",
  "num_medications",
  "number_outpatient",
  "number_inpatient",
  "number_emergency",
  "number_diagnoses",
  "diabetesMed",
  "insurance",
]

# Binary Variables : "max_glu_serum", "A1Cresult"
# change_in_dosage: if dosage was updated
# change_in_medicine: if column "change"=1 and change_in_medicine=0 -> implies the generic name has changed


## Target
Binary target: whether the patient was readmitted (`readmitted`) within 30 days or not. Roughly a 90/10 class split.

In [34]:
# NOTE(review): as loaded, `readmitted` has three levels (NO, >30, <30); it has to be
# collapsed to "<30" vs. everything else before it is the binary target described here.
outcome = ["readmitted"] # binary whether the patient was readmitted within 30 days or not

## Protected Features

In [36]:
# Sensitive attributes to audit for fairness. The names are lowercase to match
# the DataFrame's actual column labels ("age", "gender", "race"); the
# capitalised spellings would raise KeyError when used to index `df`.
protected_features = ["age", "gender", "race"]