In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Load the raw extract; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/raw/AVF3_RL_Adi_T2DM_HBA1C_V220220928034233_TimeSeries.csv")

In [4]:
data.columns

Index(['PRACTICE_PATIENT_ID', 'YEAR_START', 'YEAR_END', 'STATUS', 'SEX',
       'ETHNICITY', 'INTERVAL_OF_FOLLOWUP', 'DATE_OF_BIRTH', 'DEATH_DATE',
       'INDEX_DATE', 'EXIT_DATE', 'AGE', 'INSULIN_GOLD_OPTIMAL',
       'THIAZOLIDINEDIONE_GOLD_OPTIMAL', 'SULPHONYLUREAS_GOLD_OPTIMAL',
       'MEGLITINIDES_GOLD_OPTIMAL', 'METFORMIN_GOLD_OPTIMAL',
       'DPP4INHIBITORS_GOLD_OPTIMAL', 'DPP4INHIBITORS_OPTIMAL',
       'GLP1_GOLD_OPTIMAL', 'ACARBOSE_GOLD_OPTIMAL',
       'SGLT2INHIBITORS_GOLD_OPTIMAL', 'SGLT2INHIBITORS_OPTIMAL',
       'SYSTOLIC_BLOOD_PRESSURE', 'BMI', 'HIGH_DENSITY_LIPOPROTEIN',
       'LOW_DENSITY_LIPOPROTEIN', 'SERUM_CHOLESTEROL',
       'HBA1C_-_DIABETIC_CONTROL', 'TRIGLYCERIDES', 'SERUM_CREATININE'],
      dtype='object')

In [None]:
# Keep only rows whose AGE lies in the closed interval [35, 80].
age = data['AGE']
data = data[(age >= 35) & (age <= 80)]

In [None]:
# Column groups used throughout the notebook.
# Identifier / demographic columns kept alongside the clinical features.
demographic_features = ['PRACTICE_PATIENT_ID', 'SEX', 'AGE', 'BASESMOKER', 'STATUS']
# Clinical measurements that are imputed, clipped and min-max normalised below.
interest_feature_list = ['EGFR', 'SYSTOLIC_BP', 'DIASTOLIC_BP', 'HBA1C', 
                         'SERUM_CHOLESTEROL', 'HDL', 'LDL', 'TRIGLYCERIDES', 'BMI']
# Treatment indicator columns; their order defines the order of names in ACTION_DESC.
treatment_list = ['METFORMIN', 'SULPHO', 'GLINIDES', 'GLITAZONE', 'DPP4I', 'GLP1', 'SGLT2', 'INSULIN', 'ACARBOSE']
# NOTE(review): baseline_features, baseline_feature_map and extra_features are not
# referenced by any other cell visible in this notebook — confirm they are still needed.
baseline_features = ['B_ACR', 'B_TRIGLYCERIDES', 'B_SERUM_CHOL', 'B_HDL', 'B_LDL', 'B_DIA', 'B_SYS', 
                     'B_BMI', 'B_HBA1C']
# Maps each baseline ("B_") column name to a feature name.
baseline_feature_map = {
    'B_ACR': 'ALBUMIN_CREATININE_RATIO', 
    'B_TRIGLYCERIDES': 'TRIGLYCERIDES', 
    'B_SERUM_CHOL': 'SERUM_CHOLESTEROL', 
    'B_HDL': 'HDL', 
    'B_LDL': 'LDL', 
    'B_DIA': 'DIASTOLIC_BP', 
    'B_SYS': 'SYSTOLIC_BP', 
    'B_BMI': 'BMI', 
    'B_HBA1C': 'HBA1C'
}

extra_features = ['ACTION', 'REWARD_BP', 'REWARD_A1C', 'REWARD']

# Every column retained when the frame is narrowed to the working set.
all_features = demographic_features+interest_feature_list+treatment_list

In [None]:
# Narrow the frame to the working columns (raises KeyError if any is missing).
# NOTE(review): the column listing displayed earlier in this notebook does not
# contain several of these names (e.g. BASESMOKER, EGFR, SYSTOLIC_BP, METFORMIN)
# — confirm the input file and the feature lists agree.
data = data[all_features]

In [None]:
# Per-patient count of non-null values in every column; the SEX count is used
# as the number of rows recorded for the patient.
freq_data = data.groupby(['PRACTICE_PATIENT_ID']).count()
has_too_few_rows = freq_data['SEX'] < 5
pat_to_drop = freq_data.index[has_too_few_rows].values
# Remove every row belonging to a patient with fewer than 5 counted rows.
rows_to_drop = data.index[data.PRACTICE_PATIENT_ID.isin(pat_to_drop)]
data = data.drop(index=rows_to_drop)

In [None]:
# Number of distinct patient ids remaining (a missing id counts as one value).
data['PRACTICE_PATIENT_ID'].nunique(dropna=False)

In [None]:
# Fill gaps inside each patient's own series: carry the last observation
# forward first, then back-fill whatever is still missing at the start.
for ftr in interest_feature_list:
    data[ftr] = (
        data.groupby('PRACTICE_PATIENT_ID')[ftr]
        .transform(lambda v: v.ffill().bfill())
    )

In [None]:
# Percentage of missing values per clinical feature.
for ftr in interest_feature_list:
    column = data[ftr]
    missing_pct = (column.isna().sum() / len(column)) * 100
    print(ftr, missing_pct)

In [None]:
def find_notnull(data):
    """ Finds the indices of all values that are NOT missing (not NaN).
    Parameters
    ----------
    data: numpy.ndarray
        Numeric array to inspect.
    Returns
    -------
    numpy.ndarray
        Result of ``np.argwhere``: one row per non-NaN entry, holding that
        entry's index along each axis of ``data``.
    """
    is_present = np.logical_not(np.isnan(data))
    return np.argwhere(is_present)


# Ids of patients that fail an eligibility rule; they are removed from `data`
# in a later cell via `isin(invalid_id)`.
invalid_id = []

# Group on the bare column name rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuple keys, and tuples never match the scalar ids in
# PRACTICE_PATIENT_ID, so no patient would actually be dropped.
for patient, d in tqdm(data.groupby('PRACTICE_PATIENT_ID')):
    total_num_visits = d.shape[0]
    # Rule 1: no treatment flag set on the patient's first row.
    no_initial_treatment = sum(d[treatment_list].iloc[0].values) == 0
    # Rule 2: some feature is observed on fewer than half of the rows
    # (this also covers the "never observed" case, since groups are non-empty).
    sparse_feature = any(
        len(find_notnull(d[ftr].values)) < (total_num_visits / 2)
        for ftr in interest_feature_list
    )
    # Record each patient at most once.
    if no_initial_treatment or sparse_feature:
        invalid_id.append(patient)
            

In [None]:
# Remove every row that belongs to a patient flagged as invalid.
invalid_rows = data.index[data.PRACTICE_PATIENT_ID.isin(invalid_id)]
data = data.drop(index=invalid_rows)

In [None]:
# Re-check the percentage of missing values per feature after dropping patients.
for ftr in interest_feature_list:
    n_missing = data[ftr].isna().sum()
    print(ftr, (n_missing / len(data[ftr])) * 100 )

In [None]:
def foo(group):
    """Carry treatments forward over rows that record no treatment at all.

    Walks one patient's rows in order. Whenever a row (other than the first)
    has all `treatment_list` flags summing to 0, its treatment columns are
    overwritten with the values of the previous row — which may itself have
    just been filled, so values propagate across consecutive empty rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single patient, containing the `treatment_list` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` with the treatment columns filled as described.
    """
    group = group.copy()  # never mutate the slice handed over by groupby
    treatment_pos = [group.columns.get_loc(col) for col in treatment_list]
    # STATIN is carried forward too, but only when the frame still has it:
    # it is not part of `all_features`, and an unconditional lookup raised
    # KeyError on the first row that needed filling.
    carried_cols = treatment_list + [c for c in ['STATIN'] if c in group.columns]
    carried_pos = [group.columns.get_loc(col) for col in carried_cols]
    for i in range(1, len(group)):
        if sum(group.iloc[i, treatment_pos].values) == 0:
            for pos in carried_pos:
                group.iloc[i, pos] = group.iloc[i - 1, pos]
    return group
# group_keys=False keeps the original row index; pandas >= 2.0 would otherwise
# add PRACTICE_PATIENT_ID as an index level and make later groupbys ambiguous.
data = data.groupby('PRACTICE_PATIENT_ID', group_keys=False).apply(foo)

In [None]:
def get_action(row):
    """Build the textual action label for one row of treatment flags.

    `row` holds the treatment values in `treatment_list` order. The names of
    all entries equal to 1 are joined with "+"; if none is 1 the label is
    "DO_NOTHING".
    """
    active_positions = np.where(row == 1)[0]
    active_names = [treatment_list[pos] for pos in active_positions]
    if not active_names:
        return "DO_NOTHING"
    return "+".join(active_names)


# Label every row with its combination of active treatments.
data['ACTION_DESC'] = data[treatment_list].apply(get_action, axis=1)

In [None]:
# Row-wise sum of the treatment flags (NaN propagates, as with np.sum).
data['N_DRUGS'] = data[treatment_list].sum(axis=1, skipna=False)

In [None]:
# Integer code per distinct ACTION_DESC, numbered in order of first appearance.
action_codes, _action_labels = pd.factorize(data['ACTION_DESC'])
data['ACTION'] = action_codes

In [None]:
# Shift the codes up by one. Not idempotent: re-running this cell shifts again.
data['ACTION'] = data['ACTION'].add(1)

In [None]:
# Preview the first 20 rows after action encoding.
data.head(20)

In [None]:
# Second pass of the per-patient forward-then-backward fill.
for ftr in interest_feature_list:
    data[ftr] = (
        data.groupby('PRACTICE_PATIENT_ID')[ftr]
        .transform(lambda v: v.ffill().bfill())
    )

In [None]:
# MEDICATION = the patient's ACTION on the previous row; 0 on their first row.
actions_by_patient = data.groupby('PRACTICE_PATIENT_ID')['ACTION']
data["MEDICATION"] = actions_by_patient.shift(periods=1, fill_value=0)

In [None]:
# Final check of the percentage of missing values per feature.
for ftr in interest_feature_list:
    n_rows = len(data[ftr])
    print(ftr, (data[ftr].isna().sum() / n_rows) * 100 )

In [None]:
# Summary statistics, inspected before the value ranges are clipped.
data.describe()

In [None]:
# Standardise ranges: clip each measurement to [0, upper bound].
clip_upper_bounds = {
    "EGFR": 90.0,
    "SYSTOLIC_BP": 240.0,
    "DIASTOLIC_BP": 160.0,
    "HBA1C": 300.0,
    "SERUM_CHOLESTEROL": 20.0,
    "HDL": 30.0,
    "LDL": 30.0,
    "TRIGLYCERIDES": 50.0,
    "BMI": 120.0,
}
for column, upper in clip_upper_bounds.items():
    data[column] = data[column].clip(0.0, upper)

In [None]:
def improvement(x, lower):
    """Score the change between two consecutive readings.

    `x` is a two-element window exposing `.values` (earlier reading first).
    Returns 0.0 when both readings are at or below `lower`; otherwise the
    drop from the first to the second reading, minus a fixed 0.01.
    """
    window = x.values
    before, after = window[0], window[1]
    both_within_target = before <= lower and after <= lower
    if both_within_target:
        return 0.0
    return before - after - 0.01

def bmi_improvement(x):
    """Score the change between two consecutive BMI readings.

    `x` is a two-element window exposing `.values` (earlier reading first).
    * both readings inside [18.5, 24.9]  -> 0.0
    * either reading below 18.5          -> increase (second - first) minus 0.01
    * otherwise                          -> decrease (first - second) minus 0.01
    """
    window = x.values
    before, after = window[0], window[1]
    if 18.5 <= before <= 24.9 and 18.5 <= after <= 24.9:
        return 0.0
    if before < 18.5 or after < 18.5:
        return after - before - 0.01
    return before - after - 0.01

In [None]:
# Each helper scores consecutive pairs of a patient's readings with a rolling
# window of 2, then shifts the score back one step so it sits on the earlier
# row of the pair; a patient's last row has no successor and gets 0.
f_s = lambda x: x.rolling(2).apply(lambda w: improvement(w, 120)).shift(-1, fill_value=0)
f_a1c = lambda x: x.rolling(2).apply(lambda w: improvement(w, 42)).shift(-1, fill_value=0)
f_bmi = lambda x: x.rolling(2).apply(lambda w: bmi_improvement(w)).shift(-1, fill_value=0)

# group_keys=False keeps the results indexed exactly like `data`. Without it,
# pandas >= 2.0 prepends PRACTICE_PATIENT_ID as an index level, and the later
# `data['REWARD_*'] = ...` assignments no longer align with the frame.
visits_by_patient = data.groupby('PRACTICE_PATIENT_ID', group_keys=False)
data_sys = visits_by_patient.SYSTOLIC_BP.apply(f_s)
data_a1c = visits_by_patient.HBA1C.apply(f_a1c)
data_bmi = visits_by_patient.BMI.apply(f_bmi)
# NOTE(review): despite the name, this is the negated STATUS column, not a drug
# count — confirm that is the intended signal.
data_n_drugs = data.STATUS.apply(lambda x: -x)

In [None]:
def _rescale_to_pm1(values):
    """Linearly map a Series so its minimum becomes -1 and its maximum +1.

    Uses Series.min()/max(), which are vectorised and skip NaN, instead of the
    Python builtins (element-by-element scans whose result depends on element
    order when NaN is present). A constant Series has zero range and yields
    NaN, as before.
    """
    lowest, highest = values.min(), values.max()
    return (2 * ((values - lowest) / (highest - lowest))) - 1


data_a1c_scaled = _rescale_to_pm1(data_a1c)
data_sys_scaled = _rescale_to_pm1(data_sys)
data_bmi_scaled = _rescale_to_pm1(data_bmi)

In [None]:
# Show the value range of every reward component.
reward_ranges = (
    ("a1c: ", data_a1c_scaled),
    ("sys: ", data_sys_scaled),
    ("bmi: ", data_bmi_scaled),
    ("drugs: ", data_n_drugs),
)
for label, series in reward_ranges:
    print(label, min(series), max(series))

In [None]:
# Attach each reward component as its own column.
reward_components = {
    'REWARD_BP': data_sys_scaled,
    'REWARD_A1C': data_a1c_scaled,
    'REWARD_BMI': data_bmi_scaled,
    'REWARD_CVD': data_n_drugs,
}
for column, values in reward_components.items():
    data[column] = values
# Total reward = blood pressure + HbA1c + BMI components (REWARD_CVD is not included).
data['REWARD'] = data_sys_scaled + data_a1c_scaled + data_bmi_scaled

In [None]:
# Range of the combined reward.
total_reward = data['REWARD']
print(min(total_reward), max(total_reward))

In [None]:
def normalise(df, features=None):
    """Min-max scale selected columns to [0, 1]; all other columns are copied.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str, optional
        Columns to scale. Defaults to ``interest_feature_list + ['AGE']``.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with each selected column replaced by
        ``(x - min) / (max - min)``. A column whose max equals its min has no
        range to scale over and is set to 0.0 (missing entries stay missing)
        instead of becoming all-NaN through a division by zero.

    Prints ``feature max min`` for every scaled column.
    """
    if features is None:
        features = interest_feature_list + ['AGE']
    result = df.copy()
    for feature_name in features:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        print(feature_name, max_value, min_value)
        value_range = max_value - min_value
        if value_range == 0:
            # Constant column: avoid 0/0 while keeping NaN entries as NaN.
            result[feature_name] = (df[feature_name] - min_value) * 0.0
        else:
            result[feature_name] = (df[feature_name] - min_value) / value_range
    return result

In [None]:
# Min-max scaling applied by normalise(): (Xi - Xmin) / (Xmax - Xmin)

In [None]:
# Min-max scale the clinical features and AGE into a new frame.
scaled_dataframe = normalise(data)
# Values printed by normalise() on an earlier run, in the order: feature, max, min.
#EGFR 90.0 3.97
#SYSTOLIC_BP 240.0 60.0
#DIASTOLIC_BP 144.0 20.0
#HBA1C 300.0 0.0
#SERUM_CHOLESTEROL 17.55 0.97
#HDL 24.0 0.2
#LDL 15.8 0.2
#TRIGLYCERIDES 50.0 0.1
#BMI 79.8 12.0
#AGE 80.0 35.0

In [None]:
# Replace SEX with integer codes. pd.factorize numbers values in order of first
# appearance (missing values get -1), so the mapping noted below depends on
# which value occurs first in the frame.
# NOTE(review): confirm the mapping on the actual data, or use an explicit map.
scaled_dataframe['SEX'] = pd.factorize(scaled_dataframe['SEX'])[0]
# M = 0, F = 1

In [None]:
# Display the normalised frame.
scaled_dataframe

In [None]:
def discount_cumsum(x, gamma):
    """Discounted reverse cumulative sum along the first axis.

    Computes ``out[t] = x[t] + gamma * out[t + 1]`` with ``out[-1] = x[-1]``.

    Parameters
    ----------
    x : array-like
        Reward sequence.
    gamma : float
        Discount factor.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `x`. Floating-point inputs keep their
        dtype; any other dtype (e.g. integers) is accumulated in float64 so
        that fractional discounted values are not truncated. An empty input
        yields an empty array.
    """
    x = np.asarray(x)
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    disc_cumsum = np.zeros(x.shape, dtype=out_dtype)
    if x.shape[0] == 0:
        return disc_cumsum  # nothing to accumulate
    disc_cumsum[-1] = x[-1]
    for t in reversed(range(x.shape[0] - 1)):
        disc_cumsum[t] = x[t] + gamma * disc_cumsum[t + 1]
    return disc_cumsum


In [None]:
# Create the column up front; rtg() overwrites it patient by patient.
scaled_dataframe['REWARDS_TO_GOAL'] = scaled_dataframe['REWARD'].values
def rtg(group):
    """Fill REWARDS_TO_GOAL for one patient.

    For every row, the value is the undiscounted (gamma = 1.0) sum of REWARD
    from that row to the patient's last row. Returns a modified copy of
    `group`; the input slice is left untouched.
    """
    group = group.copy()
    group['REWARDS_TO_GOAL'] = discount_cumsum(group['REWARD'].values, 1.0)
    return group
# group_keys=False keeps the original row index; pandas >= 2.0 would otherwise
# add PRACTICE_PATIENT_ID as an index level next to the column of the same name.
scaled_dataframe = scaled_dataframe.groupby('PRACTICE_PATIENT_ID', group_keys=False).apply(rtg)

In [None]:
# Rescale REWARDS_TO_GOAL linearly to [-1, 1]. The extremes are computed once
# with the vectorised, NaN-skipping Series methods rather than three passes of
# the Python builtins over the whole array.
rtg_min = scaled_dataframe['REWARDS_TO_GOAL'].min()
rtg_max = scaled_dataframe['REWARDS_TO_GOAL'].max()
rtg_values = scaled_dataframe['REWARDS_TO_GOAL'].values
scaled_dataframe['REWARDS_TO_GOAL'] = (2 * ((rtg_values - rtg_min) / (rtg_max - rtg_min))) - 1

In [None]:
# Range of the rescaled rewards-to-goal.
rtg_column = scaled_dataframe['REWARDS_TO_GOAL']
print(min(rtg_column), max(rtg_column))

In [None]:
# Persist the full processed frame without the index column.
# NOTE(review): this writes under ../../data, while the raw input is read from
# ../data/raw — confirm the intended output directory.
scaled_dataframe.to_csv("../../data/processed.csv", index=False)

In [None]:
# Number of distinct patients in the processed frame (a missing id counts as one value).
scaled_dataframe['PRACTICE_PATIENT_ID'].nunique(dropna=False)

In [None]:
# Distinct patient ids, in order of first appearance.
ids = pd.unique(scaled_dataframe['PRACTICE_PATIENT_ID'])

In [None]:
# Patient-level train/test split. A seeded generator makes the split identical
# on every Restart & Run All; the unseeded global shuffle produced a different
# split on each run.
SPLIT_SEED = 42
# Number of patients assigned to the training set; the remainder form the test
# set (which is empty if there are no more than this many patients).
N_TRAIN_PATIENTS = 55535
split_rng = np.random.default_rng(SPLIT_SEED)
split_rng.shuffle(ids)  # in place, like the original
training, test = ids[:N_TRAIN_PATIENTS], ids[N_TRAIN_PATIENTS:]

In [None]:
# Largest action code in the processed frame.
scaled_dataframe['ACTION'].max()

In [None]:
# Route every row to the split that owns its patient id.
row_patient_ids = scaled_dataframe['PRACTICE_PATIENT_ID']
train_df = scaled_dataframe.loc[row_patient_ids.isin(training)]
test_df = scaled_dataframe.loc[row_patient_ids.isin(test)]

In [None]:
# Write both splits without the index column.
train_df.to_csv("../../data/train.csv", index=False)
test_df.to_csv("../../data/test.csv", index=False)

In [None]:
# Reload the processed frame from the location this notebook wrote it to
# ("../../data/processed.csv"). The previous path, "../data/processed.csv",
# pointed at a different directory, so a clean run either failed or silently
# picked up a stale file.
# NOTE(review): if "../data" is the intended data directory, change the
# to_csv targets instead.
data = pd.read_csv("../../data/processed.csv")

In [None]:
# Smallest action code in the reloaded frame.
data['ACTION'].values.min()

In [None]:
# Largest action code in the reloaded frame.
data['ACTION'].values.max()

In [None]:
# Distinct action labels, in order of first appearance.
desc = pd.unique(data['ACTION_DESC'])

In [None]:
# Display the distinct action labels.
desc

In [None]:
# Distinct action codes, shown in ascending order.
act = pd.unique(data['ACTION'])
sorted(act)

In [None]:
# Fraction of rows whose action code is 0 or 6.
rows_with_selected_action = data[data['ACTION'].isin([0, 6])]
len(rows_with_selected_action) / len(data)

In [None]:
# Number of distinct action labels.
len(desc)

In [None]:
# For every code 0..243 print its action label, or the empty match plus the
# code itself when no row uses that code.
for i in range(244):
    d = data.loc[data['ACTION'] == i, 'ACTION_DESC'].values
    if len(d) == 0:
        print(d, i)
    else:
        print(d[0])
        