In [None]:
import pandas as pd
import numpy as np

## Load Data

In [None]:
# Relative paths of the two input CSV files.
# Code -> description lookup table for the *_id columns.
IDS_MAPPING_FN = "./data/IDS_mapping.csv"
# Main table loaded into `df`.
DIABETIC_FN = "./data/diabetic_data.csv"

In [None]:
# Main table: its first line is used as the column header.
df = pd.read_csv(DIABETIC_FN)
# Lookup table: header=None keeps every line of the file as data, so the
# columns are addressed positionally (0 and 1) further down.
mapping = pd.read_csv(IDS_MAPPING_FN, header=None)
print(df.shape)

Create dictionary of code-value mappings of `admission_type_id`, `discharge_disposition_id`, and `admission_source_id` using the mapping provided in the data-folder.

In [None]:
# One {integer code: description} lookup per id column, filled from `mapping`.
admission_type_dict = {}
discharge_disposition_dict = {}
admission_source_dict = {}

list1 = []  # NOTE(review): never used in this cell; kept only in case a later cell references it


def _fill_code_lookups(mapping_frame, lookups):
  """Populate `lookups` from a stacked code/description table.

  Column 0 of `mapping_frame` interleaves section titles (the name of an id
  column) with the codes belonging to that section; column 1 holds the
  description for each code.

  Rows are classified by content rather than by string length, so codes with
  any number of digits are accepted:
    * missing key  -> skipped (blank separator row)
    * integer key  -> stored as a code of the current section
    * anything else -> becomes the current section title

  Parameters
  ----------
  mapping_frame : DataFrame with (at least) positional columns 0 and 1.
  lookups : dict mapping a section title to the dict that receives its codes.
      Sections whose title is not a key of `lookups` are ignored, as are codes
      that appear before the first title.
  """
  current_section = None
  for raw_key, description in zip(mapping_frame[0].values, mapping_frame[1].values):
    if pd.isna(raw_key):
      continue
    token = str(raw_key).strip()
    try:
      code = int(token)
    except ValueError:
      current_section = token
      continue
    target = lookups.get(current_section)
    if target is not None:
      # the description is stored exactly as read (it may itself be NaN)
      target[code] = description


_fill_code_lookups(
    mapping,
    {
        'admission_type_id': admission_type_dict,
        'discharge_disposition_id': discharge_disposition_dict,
        'admission_source_id': admission_source_dict,
    },
)

Create new columns in the dataframe in which the codes are mapped to their descriptive values.

In [None]:
# Decode each "<name>_id" column into a companion column "<name>" using the
# matching lookup; codes without an entry in the lookup become NaN.
for _decoded, _lookup in (
    ('admission_type', admission_type_dict),
    ('discharge_disposition', discharge_disposition_dict),
    ('admission_source', admission_source_dict),
):
  df[_decoded] = df[f'{_decoded}_id'].map(_lookup).astype("O")

## Explore Dataset

In [None]:
# Report the size of the frame: rows first, then columns.
n_rows, n_cols = df.shape
print(f"Number of unique encounters: {n_rows}")
print(f"Number of columns: {n_cols}")

In [None]:
# Per-column dtype and non-null count overview of the loaded frame.
df.info()

In [None]:
# Frequency table (missing values included) for every object-dtype column.
_rule = '==========================='
for column in df.select_dtypes(include=['O']):
  if column in ("diag_1", "diag_2"):
    # Shape of the subset whose code is exactly '250' (diabetes as diagnosis 1 or 2).
    # NOTE(review): this is an exact string match; more specific codes that
    # merely start with '250' would not be counted -- confirm that is intended.
    print(df.loc[df[column] == '250'].shape)
  print(_rule, f'Column name: {column}', _rule, sep='\n')
  print(df[column].value_counts(dropna=False))
  print()

In [None]:
# Frequency table (missing values included) for every int64 column.
# The diag_1/diag_2 branch from the object-dtype loop is intentionally absent
# here: inside this loop every column is int64, and comparing an integer column
# with the string '250' can never match a row, so the branch was dead code.
for column in df.select_dtypes(include = ['int64']).columns:
  print('===========================')
  print(f'Column name: {column}')
  print('===========================')
  print(df[column].value_counts(dropna=False))
  print()

## Preprocessing

As seen above, some patients have many encounters (up to 40).

We only keep the first observation for each unique patient.

In [None]:
# Keep exactly one row per patient: that patient's first row in file order.
#
# groupby(...).agg('first') is NOT used because it returns the first *non-null*
# value of every column independently, so a patient whose first encounter has
# a missing value would get that field filled in from a later encounter and
# end up with a row that never existed in the data.
#
# The result keeps the layout the groupby produced (patient_nbr as a sorted
# index, all remaining columns in their original order), so code that relies
# on that layout is unaffected. Rows without a patient_nbr are dropped, as the
# groupby did.
# NOTE(review): "first" means first in file order; sort by an encounter date/id
# beforehand if the file is not guaranteed to be chronological.
df = (
    df.dropna(subset=["patient_nbr"])
      .drop_duplicates(subset="patient_nbr", keep="first")
      .set_index("patient_nbr")
      .sort_index()
)
print(f"Number of unique encounters after only keeping first encounter for each patient: {df.shape[0]}")

We only keep observations with `admission_type` Emergency, Urgent or Elective.

In [None]:
# Restrict the data to the three admission types of interest.
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments in the following cells operate on a real
# DataFrame and do not trigger pandas' SettingWithCopyWarning.
_kept_admission_types = ['Emergency', 'Urgent', 'Elective']
df = df[df['admission_type'].isin(_kept_admission_types)].copy()
print(f"Number of unique encounters: {df.shape[0]}")

Make flag variables indicating whether a `max_glu_serum` test and an `A1Cresult` test were done.

In [None]:
# 1 when a test result is recorded, 0 when no test was done.
# "No test" is recognised both as a missing value and as the literal string
# 'None'. NOTE(review): the raw file presumably writes 'None' for "not
# measured"; whether read_csv turns that string into NaN depends on the pandas
# version's default na_values, so checking only notnull() can yield a flag
# that is 1 for every row. Confirm against df['max_glu_serum'].value_counts().
for _result_col, _flag_col in (('max_glu_serum', 'max_glu_serum_flag'), ('A1Cresult', 'A1C_flag')):
  _measured = df[_result_col].notnull() & (df[_result_col] != 'None')
  df[_flag_col] = _measured.astype(int)

Make a binary variable for readmission, which indicates whether the patient was readmitted within 30 days.

In [None]:
# 1 for rows whose `readmitted` value is exactly '<30', 0 for every other value.
df['readmitted_flag'] = df['readmitted'].eq('<30').astype(int)

Make a binary flag variable indicating whether any of the medicines had the value *Up* or *Down*.

In [None]:
# Select the medicine columns by their neighbouring column labels instead of
# by hard-coded positions. The fixed slice 23:46 is only right as long as
# patient_nbr has been moved out of the columns by the earlier groupby; any
# change to the column layout would silently shift it onto the wrong columns.
# NOTE(review): assumes the medicine columns are exactly the contiguous run
# between 'A1Cresult' and 'change' (the same columns positions 23:46 pick in
# the current layout) -- confirm with print(medicine_columns).
_first_medicine = df.columns.get_loc('A1Cresult') + 1
_after_last_medicine = df.columns.get_loc('change')
medicine_columns = df.columns[_first_medicine:_after_last_medicine].tolist()
# 1 when at least one medicine column holds 'Up' or 'Down', otherwise 0.
df['change_dosage'] = df[medicine_columns].isin(['Up', 'Down']).any(axis=1).astype(int)

The column `change` indicates whether there was a change in diabetic medications (either in dosage or in generic name). So if `change` is marked as changed but `change_dosage` is 0, there must have been a change in the generic name, i.e. the chemical name of a medicine.

In [None]:
# 1 when `change` is 'Ch' while no dosage change was flagged, otherwise 0.
_marked_changed = df['change'].eq('Ch')
_dosage_unchanged = df['change_dosage'].eq(0)
df['change_medicine'] = (_marked_changed & _dosage_unchanged).astype(int)

In [None]:
# Sanity check of the two derived flags against the `change` column.
# First number: rows flagged as dosage change plus rows flagged as medicine
# change (the earlier version added change_medicine to itself, which is why
# the numbers could not agree).
# Second number: rows where `change` is 'Ch', counted by label rather than by
# the position of the value in value_counts().
# The two numbers agree when every dosage change is also marked 'Ch'.
print(f"{df['change_dosage'].sum() + df['change_medicine'].sum()}")
print(f"{(df['change'] == 'Ch').sum()}")

## Define variables

In [None]:
# Model input columns. Each name appears once: the list previously ended with
# a second "diabetesMed", which would select that column twice.
# NOTE(review): "insurance" is not created in any of the preceding cells --
# confirm that the column exists before using this list to index df.
feature_names = [
    "time_in_hospital", "admission_type", "num_lab_procedures", "num_procedures", "num_medications",
    "number_outpatient", "number_inpatient", "number_emergency", "number_diagnoses", "diabetesMed",
    "insurance", 'max_glu_serum_flag', 'A1C_flag', 'change_dosage', 'change_medicine',
]

In [None]:
# Name of the target column, kept as a one-element list.
target_name = ["readmitted_flag"] 

In [None]:
# Names of the columns treated as protected attributes.
protected_names = ['age', 'race', 'gender']

## EDA