In [None]:
# Google Cloud project ID; a later cell exports it as GOOGLE_CLOUD_PROJECT.
# The trailing `#@param` annotation is a Colab form-field marker and must stay on this line.
PROJECT_ID = "mlrh-330919" #@param {type:"string"}

In [None]:
import os
from google.colab import auth
from IPython.display import display
 
# Dataset identifiers (project, dataset version and location).
# NOTE(review): none of these three constants is referenced in the cells shown here — confirm they are still needed.
DATASET_PROJECT_ID = 'amsterdamumcdb'
DATASET_ID = 'version1_0_2'
LOCATION = 'eu'

# Export the project id through the environment so client libraries can pick it up.
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID

# Authenticate the current Colab user; the message is printed once the call returns.
auth.authenticate_user()
print('Authenticated')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

In [None]:
# Mount Google Drive so the CSV inputs read further down are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on the mounted drive; the relative CSV paths used
# by the read_csv calls below are resolved against this directory.
os.chdir('/content/drive/MyDrive/MLRFH')

In [None]:
#Some preprocessing functions 

def to_cols(data):
  """Pivot long-format measurements into one column per item.

  Rows of the result are keyed by (admissionid, time); every distinct `item`
  becomes a column filled from `value`. Repeated (admissionid, time, item)
  entries are combined by pivot_table's default aggregation.
  """
  row_keys = ['admissionid', 'time']
  return data.pivot_table(values='value', index=row_keys, columns=['item'])
  

def to_cols_action(data):
  """Pivot long-format administrations into one column per item.

  Rows of the result are keyed by (admissionid, time); every distinct `item`
  becomes a column filled from `administered`. Repeated
  (admissionid, time, item) entries are combined by pivot_table's default
  aggregation.
  """
  row_keys = ['admissionid', 'time']
  return data.pivot_table(values='administered', index=row_keys, columns=['item'])

def remove_outliers(grouped):
  """Replace out-of-range state measurements with NaN.

  Two rules are applied:
    * values above a fixed per-column upper limit (see `upper_limits`) become NaN;
    * negative values in any of the columns in `all_cols` become NaN.

  Args:
    grouped: wide DataFrame with one column per measured item. Every column
      named in `all_cols` must be present, otherwise a KeyError is raised.

  Returns:
    The same DataFrame object. Its columns are overwritten in place, as in the
    earlier implementation.
  """
  # per-column upper limits; anything above is discarded
  upper_limits = {
      'Kalium (bloed)': 8.,
      'ABP gemiddeld': 200.,
      'Kreatinine (bloed)': 220.,
      'Natrium (bloed)': 180.,
      'UrineCAD': 750.,
  }

  # columns in which negative values are discarded
  all_cols = ['Kalium (bloed)', 'ABP gemiddeld', 'Kreatinine (bloed)', 'Natrium (bloed)', 'UrineCAD', 'UrineSupraPubis', 'UrineSpontaan', 'UrineUP', 'Kreatinine', 'Nefrodrain re Uit', 'Nefrodrain li Uit', 'UrineIncontinentie']

  # Assign whole columns rather than using chained indexing
  # (`df[col][mask] = ...`): the chained form writes to a temporary and has no
  # effect when pandas Copy-on-Write is active.
  for col, limit in upper_limits.items():
    grouped[col] = grouped[col].mask(grouped[col] > limit)

  # NaN < 0 is False, so values that are already missing stay missing
  grouped[all_cols] = grouped[all_cols].mask(grouped[all_cols] < 0)
  return grouped

def remove_outliers_action(grouped):
  """Replace out-of-range administered amounts with NaN.

  For each of the two action columns, values above the column's upper limit
  and negative values become NaN.

  Args:
    grouped: wide DataFrame with one column per administered item. Both
      columns named in `upper_limits` must be present, otherwise a KeyError
      is raised.

  Returns:
    The same DataFrame object. Its columns are overwritten in place, as in the
    earlier implementation.
  """
  # per-column upper limits; anything above is discarded
  upper_limits = {
      'Noradrenaline (Norepinefrine)': 10.,
      'NaCl 0,45%/Glucose 2,5%': 500.,
  }
  cols = list(upper_limits)

  # Assign whole columns rather than using chained indexing
  # (`df[col][mask] = ...`): the chained form writes to a temporary and has no
  # effect when pandas Copy-on-Write is active.
  for col, limit in upper_limits.items():
    grouped[col] = grouped[col].mask(grouped[col] > limit)

  # NaN < 0 is False, so values that are already missing stay missing
  grouped[cols] = grouped[cols].mask(grouped[cols] < 0)

  return grouped

def aggregate(outliers_removed):
  """Average each admission's values over 4-hour time slots.

  Args:
    outliers_removed: DataFrame indexed by a MultiIndex that has an
      `admissionid` level and a datetime `time` level.

  Returns:
    DataFrame indexed by (admissionid, time), where `time` is the left edge of
    the 4-hour slot and each cell is the mean of the values in that slot.
  """
  outliers_removed = outliers_removed.sort_values('time')
  # '4h' rather than '4H': the upper-case hour alias is deprecated in recent
  # pandas releases, while the lower-case form is accepted by older ones too.
  data_agg = outliers_removed.groupby(
      [pd.Grouper(level='admissionid'), pd.Grouper(level='time', freq='4h')]
  ).mean()

  return data_agg

def interpolate(data_agg, by=None):
  """Fill missing values by linear interpolation in both directions.

  Args:
    data_agg: DataFrame whose NaNs should be filled.
    by: optional index level name (or list of names). When given, each group
      of that level is interpolated on its own, so values never flow from one
      group into another. When omitted, the whole frame is interpolated as a
      single sequence, which is the original behaviour.

  Returns:
    A new DataFrame with the same index and columns. With `by` set, a column
    that is entirely NaN within a group stays NaN for that group.

  NOTE(review): with the default `by=None`, a gap at the boundary between two
  admissions is filled from the neighbouring admission's rows. Pass
  `by='admissionid'` to keep admissions independent; be aware that this leaves
  more NaNs behind for any later `dropna()`.
  """
  if by is None:
    return data_agg.interpolate(limit_direction='both')

  return data_agg.groupby(level=by).transform(
      lambda col: col.interpolate(limit_direction='both'))


def _process_space(data, pivot, clean):
  """Shared pipeline: timestamp -> pivot -> outlier removal -> 4h mean -> fill."""
  # Convert on a copy so the caller's frame is left untouched; `time` is read
  # as milliseconds unless it already holds datetimes.
  timed = data.assign(time=pd.to_datetime(data['time'], unit='ms'))
  wide = clean(pivot(timed))
  data_agg = aggregate(wide)
  data_filled = interpolate(data_agg)

  return data_filled.reset_index()


def process_statespace(data):
  """Build the state table from long-format measurements.

  Args:
    data: DataFrame with `admissionid`, `time`, `item` and `value` columns.
      It is not modified.

  Returns:
    DataFrame with `admissionid` and `time` (4-hour slot) as ordinary columns
    and one column per measured item, after outlier removal, per-slot
    averaging and interpolation.
  """
  return _process_space(data, to_cols, remove_outliers)


def process_actionspace(data):
  """Build the action table from long-format administrations.

  Args:
    data: DataFrame with `admissionid`, `time`, `item` and `administered`
      columns. It is not modified.

  Returns:
    DataFrame with `admissionid` and `time` (4-hour slot) as ordinary columns
    and one column per administered item, after outlier removal, per-slot
    averaging and interpolation.
  """
  return _process_space(data, to_cols_action, remove_outliers_action)

In [None]:
#plot distribution of cols

def draw_histograms(df, variables, n_rows, n_cols):
    """Draw one 100-bin histogram per variable on an n_rows x n_cols grid.

    Args:
        df: DataFrame containing every column named in `variables`.
        variables: column names to plot, in grid order (row by row).
        n_rows, n_cols: grid shape; must provide at least len(variables) cells,
            otherwise `add_subplot` raises.
    """
    colors = ["pink", "orange", "yellow", "green", "blue", "purple", "black", "darkgreen", "darkblue", "grey", "lightblue", "red"]
    fig = plt.figure()
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        # wrap around the palette so more variables than colours does not raise IndexError
        df[var_name].hist(bins=100, ax=ax, color=colors[i % len(colors)])
        ax.set_title(var_name)
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

#normalization, nice for plotting

def minmax(df):
    """Rescale values linearly so each column's minimum maps to 0 and its maximum to 1."""
    lowest = df.min()
    span = df.max() - lowest
    return (df - lowest) / span

In [None]:
# Load the long-format state measurements (read relative to the current working directory).
# Earlier input file, kept for reference: pd.read_csv('new_new_dataset.csv')
data = pd.read_csv('state_v2.csv')

In [None]:
data.head()

In [None]:
#Distributions of the raw values (before preprocessing) for the first six items.
#cols2 and colors are defined here as well because the next cell reuses them.
cols1 = ['Kalium (bloed)', 'ABP gemiddeld', 'Kreatinine (bloed)', 'Natrium (bloed)', 'UrineCAD', 'UrineSupraPubis']
cols2 =  ['UrineSpontaan', 'UrineUP', 'Kreatinine', 'Nefrodrain re Uit', 'Nefrodrain li Uit', 'UrineIncontinentie']
colors = ["pink", "orange", "yellow", "green", "blue", "purple", "black", "darkgreen", "darkblue", "grey", "lightblue", "red"]

fig = plt.figure()
for slot, (var_name, bar_color) in enumerate(zip(cols1, colors), start=1):
    ax = fig.add_subplot(3, 3, slot)
    # all raw values recorded for this item
    raw_values = data.loc[data['item'] == var_name, 'value']
    raw_values.hist(bins=100, ax=ax, color=bar_color)
    ax.set_title(var_name)
fig.tight_layout()  # Improves appearance a bit.
plt.show()

In [None]:
#Same raw-value histograms for the remaining six items (cols2 and colors come from the previous cell).
fig = plt.figure()
for slot, (var_name, bar_color) in enumerate(zip(cols2, colors), start=1):
    ax = fig.add_subplot(3, 3, slot)
    # all raw values recorded for this item
    raw_values = data.loc[data['item'] == var_name, 'value']
    raw_values.hist(bins=100, ax=ax, color=bar_color)
    ax.set_title(var_name)
fig.tight_layout()  # Improves appearance a bit.
plt.show()

In [None]:
data['item'].value_counts()

In [None]:
# Inspect the raw rows of admission 0 before any preprocessing.
pd.set_option('display.max_rows', 100)
# Adds a datetime `time` column derived from `measuredat`, read as milliseconds.
# process_statespace reads data['time'], so this cell has to run before it.
data['time'] = pd.to_datetime(data['measuredat'], unit='ms')
data[data['admissionid'] == 0].sort_values(by = "measuredat")

In [None]:
statespace = process_statespace(data)

In [None]:
#check for one patient whether the aggregations are correct
statespace[statespace['admissionid'] == 0]

In [None]:
# NOTE(review): `histograms` is not used in any cell shown here, and
# `numpy.lib.histograms` may not be importable on newer NumPy releases —
# confirm and consider dropping this import.
from numpy.lib import histograms
# Histograms of the processed state table for the first six items.
variables = ['Kalium (bloed)', 'ABP gemiddeld', 'Kreatinine (bloed)', 'Natrium (bloed)', 'UrineCAD', 'UrineSupraPubis']

draw_histograms(statespace, variables, 3, 3)

In [None]:
variables2 = ['UrineSpontaan', 'UrineUP', 'Kreatinine', 'Nefrodrain re Uit', 'Nefrodrain li Uit', 'UrineIncontinentie']
draw_histograms(statespace, variables2, 3, 3)

In [None]:
statespace

**Action Space**

In [None]:
# Load the long-format action records (read relative to the current working directory).
action = pd.read_csv('action_space.csv')
# NOTE(review): `stop - start` is a duration, yet the result is stored as `time`
# and later used as the timestamp on which actions are merged with states.
# If `start` marks when the administration began, it is probably the column to
# convert here — confirm against the dataset definition.
action['time'] = pd.to_datetime(action['stop'] - action['start'], unit='ms')
# The raw bounds are dropped once `time` has been derived.
action = action.drop(columns = ['start', 'stop'])

In [None]:
action['administered'][action['item'] == 'Noradrenaline (Norepinefrine)'].mean()

In [None]:
action['administered'][action['item'] == 'NaCl 0,45%/Glucose 2,5%'].max()

In [None]:
#check for 1 patient
pd.set_option('display.max_rows', 135)
action[action['admissionid'] == 4251]

In [None]:
actionspace = process_actionspace(action)

In [None]:
actionspace['Noradrenaline (Norepinefrine)'].max()

In [None]:
actionspace[actionspace['admissionid'] == 4251]

**Combine two dataframes** 

In [None]:
#take first 48 hours per patient
statespace = statespace.sort_values(by=['admissionid', 'time'])

#Select by elapsed time rather than by row count: the aggregated table only
#holds 4h slots that contained data, so "the first 12 rows" can reach beyond
#48 hours when a patient has gaps. The window starts at each patient's first slot.
first_slot = statespace.groupby('admissionid')['time'].transform('min')
within_48h = (statespace['time'] - first_slot) < pd.Timedelta(hours=48)

#.copy() so that columns added to statespace48h later do not write into a slice of statespace
statespace48h = statespace[within_48h].copy()

In [None]:
action.isnull().sum()

In [None]:
#we need to know the gender, otherwise we cannot compute AKI --> nans are dropped
genders = action[['admissionid', 'gender']].dropna()

In [None]:
#look up the gender recorded for an admission
def check_gender(admissionid):
  """Return the first gender listed in `genders` for this admission.

  Falls back to "Unknown" when the admission has no row in `genders`
  (`.item()` raises ValueError on an empty selection).
  """
  matches = genders.loc[genders['admissionid'] == admissionid, 'gender']
  try:
    return matches.head(1).item()
  except ValueError:
    return "Unknown"

In [None]:
#look up the age group recorded for an admission
def check_age(admissionid):
  """Return the `agegroup` of the first `action` row for this admission.

  Raises ValueError (from `.item()`) when the admission has no row in `action`.
  """
  matches = action.loc[action['admissionid'] == admissionid, 'agegroup']
  return matches.head(1).item()

In [None]:
#Attach each admission's gender with one vectorised lookup instead of scanning
#`genders` once per row. The first recorded gender per admission is used, and
#admissions without one get "Unknown" (same result as check_gender).
gender_by_admission = genders.drop_duplicates(subset='admissionid').set_index('admissionid')['gender']
statespace48h = statespace48h.assign(
    gender=statespace48h['admissionid'].map(gender_by_admission).fillna('Unknown'))

In [None]:
#remove unknowns, as we need to know the gender to compute the AKI
statespace48h = statespace48h[statespace48h.gender != 'Unknown']

In [None]:
#Attach each admission's age group with one vectorised lookup instead of
#scanning `action` once per row. The first `action` row per admission is used
#(same row check_age picks). An admission missing from `action` gets NaN here,
#where check_age would raise.
agegroup_by_admission = action.drop_duplicates(subset='admissionid').set_index('admissionid')['agegroup']
statespace48h = statespace48h.assign(
    agegroup=statespace48h['admissionid'].map(agegroup_by_admission))

In [None]:
statespace48h['agegroup'].value_counts()

In [None]:
# Age group is categorical --> encode it as an ordinal number (1 = youngest band, 6 = oldest).
# Alternative that was tried and left disabled (one-hot encoding):
# agegroups = pd.get_dummies(statespace48h['agegroup'])
# statespace48h = pd.concat([statespace48h, agegroups], axis=1)

# Nested mapping: the outer key restricts the replacement to the `agegroup` column.
ages = {"agegroup": {"18-39": 1, "40-49": 2, "50-59": 3, "60-69":4, "70-79":5, "80+":6}}
statespace48h = statespace48h.replace(ages)
statespace48h

In [None]:
#df with 1 row per patient for demographic stats
demo = statespace48h.drop_duplicates(subset=['admissionid'], keep='first')

In [None]:
#add AKI feature

def AKI(kreatinine, gender):
  """Map a creatinine value to a stage 0-3 using gender-specific cut-offs.

  The stage is the number of cut-offs the value strictly exceeds:
    'Vrouw': 106 / 134 / 205
    'Man':   119 / 151 / 231
  A value that exceeds none of them (including NaN) gives stage 0.

  Returns None when `gender` is neither 'Vrouw' nor 'Man'.
  """
  # NOTE(review): presumably the cut-offs are in the same unit as the
  # 'Kreatinine (bloed)' column — confirm.
  if gender == 'Vrouw':
    cutoffs = (106, 134, 205)
  elif gender == 'Man':
    cutoffs = (119, 151, 231)
  else:
    return None

  # the cut-offs are ascending, so counting how many are exceeded yields the stage
  return int(sum(bool(kreatinine > cutoff) for cutoff in cutoffs))

In [None]:
#add AKI to every row in statespace
statespace48h['AKI'] = statespace48h.apply(lambda row: AKI(row['Kreatinine (bloed)'], row['gender']), axis=1)

In [None]:
statespace48h['AKI'].value_counts()

In [None]:
#merge datasets --> left merge on actionspace, as states without actions are not useful for our model
space = actionspace.merge(statespace48h, on=["admissionid", "time"], how="left")

In [None]:
#all the null values are patients that are not in the state space and only in the action space --> we cannot use them so they are dropped
#display() is needed here: a bare expression that is not the last line of a cell is never shown
display(space.isnull().sum())
space = space.dropna()

In [None]:
#one hot encode gender
space = pd.get_dummies(space, columns = ['gender'])


In [None]:
#save final space in the drive
#space.to_csv("space_correct_demo7.csv")

In [None]:
space[space['admissionid'] == 4251]