In [138]:
import pandas as pd
import numpy as np

## Load Data

In [139]:
IDS_MAPPING_FN = "./data/IDS_mapping.csv"
DIABETIC_FN = "./data/diabetic_data.csv"

In [140]:
# read files
mapping = pd.read_csv(IDS_MAPPING_FN, header=None)
df = pd.read_csv(DIABETIC_FN)
print(df.shape)

(101766, 50)


Create dictionary of code-value mappings of `admission_type_id`, `discharge_disposition_id`, and `admission_source_id` using the mapping provided in the data-folder.

In [141]:
admission_type_dict = {}
discharge_disposition_dict = {}
admission_source_dict = {}

list1 = []
for i, j in zip(mapping[0].values, mapping[1].values):
  if len(str(i))>3:
    feature_name = i
  elif len(str(i))!=3:
    if feature_name == 'admission_type_id':
      admission_type_dict[int(i)] = j
    elif feature_name == 'discharge_disposition_id':
      discharge_disposition_dict[int(i)] = j
    elif feature_name == 'admission_source_id':
      admission_source_dict[int(i)] = j

Create columns in dataframe with where the codes are mapped to the values

In [142]:
df['admission_type'] = df['admission_type_id'].map(admission_type_dict).astype("O")
df['discharge_disposition'] = df['discharge_disposition_id'].map(discharge_disposition_dict).astype("O")
df['admission_source'] = df['admission_source_id'].map(admission_source_dict).astype("O")

## Explore Dataset

In [143]:
print(f"Number of unique encounters: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of unique encounters: 101766
Number of columns: 53


In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [145]:
for column in df.select_dtypes(include = ['O']).columns:
  if column == "diag_1" or column=="diag_2":
    print(df[df[column]=='250'].shape) # diabetic as diagnosis 1 or 2
  print('===========================')
  print(f'Column name: {column}')
  print('===========================')
  print(df[column].value_counts(dropna=False))
  print()

Column name: race
race
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

Column name: gender
gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

Column name: age
age
[70-80)     26068
[60-70)     22483
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: count, dtype: int64

Column name: weight
weight
?            98569
[75-100)      1336
[50-75)        897
[100-125)      625
[125-150)      145
[25-50)         97
[0-25)          48
[150-175)       35
[175-200)       11
>200             3
Name: count, dtype: int64

Column name: payer_code
payer_code
?     40256
MC    32439
HM     6274
SP     5007
BC     4655
MD     3532
CP     2533
UN     2448
CM     1937
OG     1033
PO      592
DM      549
CH      146
WC      

In [146]:
for column in df.select_dtypes(include = ['int64']).columns:
  if column == "diag_1" or column=="diag_2":
    print(df[df[column]=='250'].shape) # diabetic as diagnosis 1 or 2
  print('===========================')
  print(f'Column name: {column}')
  print('===========================')
  print(df[column].value_counts(dropna=False))
  print()

Column name: encounter_id
encounter_id
2278392      1
190792044    1
190790070    1
190789722    1
190786806    1
            ..
106665324    1
106657776    1
106644876    1
106644474    1
443867222    1
Name: count, Length: 101766, dtype: int64

Column name: patient_nbr
patient_nbr
88785891     40
43140906     28
1660293      23
88227540     23
23199021     23
             ..
11005362      1
98252496      1
1019673       1
13396320      1
175429310     1
Name: count, Length: 71518, dtype: int64

Column name: admission_type_id
admission_type_id
1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: count, dtype: int64

Column name: discharge_disposition_id
discharge_disposition_id
1     60234
3     13954
6     12902
18     3691
2      2128
22     1993
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10     

## Preprocessing

As seen above some patients have many encounters (up to 40).

We only keep the first observation for each unique patient.

In [147]:
df = df.groupby("patient_nbr").agg('first')
print(f"Number of unique encounters after only keeping first encounter for each patient: {df.shape[0]}")

Number of unique encounters after only keeping first encounter for each patient: 71518


We only keep observations with `admission_type` Emergency, Urgent or Elective.

In [148]:
df = df[df['admission_type'].isin(['Emergency', 'Urgent', 'Elective'])]
print(f"Number of unique encounters: {df.shape[0]}")

Number of unique encounters: 63757


Make flags variables indicating whether a `max_glu_serum`, and a `AC1result` test was done.

In [150]:
df['max_glu_serum_flag'] = df['max_glu_serum'].notnull().astype(int)
df['A1C_flag'] = df['A1Cresult'].notnull().astype(int)

Make a binary variable for readmission, which indicates whether the patient was readmitted before 30 days. 

In [151]:
df['readmitted_flag'] = np.where(df['readmitted']=='<30', 1, 0)

Make a binary flag variable indicating whether any of the medicines had the value *Up* or *Down*.

In [152]:
medicine_columns = df.columns[23:46].tolist()
df['change_dosage'] = df[medicine_columns].isin(['Up', 'Down']).any(axis=1).astype(int)

The column `change` indicates if there was a change in diabetic medications (either dosage or generic name). So if `change` is marked as changed but `change_dosage` is none, then there must have been a change in the generic name, i.e. the chemical name of a medicine.

In [161]:
df['change_medicine'] = np.where((df['change'] == 'Ch') & (df['change_dosage'] == 0), 1, 0)

In [179]:
#Problem here
print(f"{df['change_medicine'].value_counts()[1] + df['change_medicine'].value_counts()[1]}") 
print(f"{df['change'].value_counts()[1]}")

26030
28660


  print(f"{df['change'].value_counts()[1]}")


## Define variables

In [186]:
feature_names = ["time_in_hospital", "admission_type", "num_lab_procedures", "num_procedures", "num_medications", 
                     "number_outpatient", "number_inpatient", "number_emergency", "number_diagnoses", "diabetesMed", "insurance", 'max_glu_serum_flag', 'A1C_flag', 'change_dosage', 'change_medicine', 'diabetesMed']

In [185]:
target_name = ["readmitted_flag"] 

In [187]:
protected_names = ['age', 'race', 'gender']

## EDA