# Task 2

In [24]:
# Import required packages
import csv
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional #Embedding, Attention
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.impute import SimpleImputer

In [2]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

path="/content/drive/MyDrive/project/(1999250)_EHR"

Mounted at /content/drive


In [3]:
path = path + '/data/preprocessed'

In [None]:
#trajectories = pd.read_csv(path + '/trajectories.csv')

In [4]:
prescriptions_of_non_diabetes_drugs = pd.read_csv(path + '/prescriptions_of_non_diabetes_drugs.csv')
prescriptions_of_diabetes_drugs = pd.read_csv(path + '/prescriptions_of_diabetes_drugs.csv')
patients_diets_and_blood_glucose_controls = pd.read_csv(path + '/patients_diets_and_blood_glucose_controls.csv')
medical_tests = pd.read_csv(path + '/medical_tests.csv')
laboratory_tests = pd.read_csv(path + '/laboratory_tests.csv')
laboratory_tests_calculated_parameters = pd.read_csv(path + '/laboratory_tests_calculated_parameters.csv')
diagnosis_tests = pd.read_csv(path + '/diagnosis_tests.csv')
active_patients_info = pd.read_csv(path + '/active_patients_info.csv')

In [5]:
laboratory_tests_calculated_parameters.head()

Unnamed: 0,id_center,id_ana,date,amd_code,value,stitch_code,birth_year,death_year
0,149,260,2005-09-07,AMD927,26.64,STITCH001,1947,
1,149,260,2006-12-05,AMD927,25.61,STITCH001,1947,
2,149,260,2007-06-08,AMD927,26.99,STITCH001,1947,
3,149,260,2008-02-12,AMD927,26.99,STITCH001,1947,
4,149,260,2008-08-01,AMD927,27.16,STITCH001,1947,


## Task 2.1 - Class Imbalance 1

Not all patients will have a cardiovascular event within the stabilised six-month period. Thus, we would expect that the class distribution is highly imbalanced. For each patient $p_i$ such that $y(p_i) = 0$, eliminate the last six months of history to avoid giving the model prediction
hints into the future. For each patient $p_i$ such that $y(p_i) = 1$, create $m$ copies ${{p^1_j,...,p^m_j}}$g such that all the cardiovascular events in the last six months of $p^j_i \forall{i} \in [1, P] \forall{j} \in [1, m]$ are eliminated, and the other events are shu ed and cancelled at random. In this way, you have a sort of balancing criterion (i.e., up-sampling the minority class).

In [None]:
active_patients_info.head()

Unnamed: 0,id_center,id_ana,sex,diagnosis_of_diabetes_year,type_of_diabetes,education,marital_status,profession,origin,birth_year,first_access_year,death_year,label
0,100,3080,M,1988.0,5,,,,,1936,1991.0,,1
1,160,2537,F,2002.0,5,,,,,1938,2002.0,,0
2,41,97,M,2002.0,5,,,9.0,,1937,2003.0,2017.0,1
3,89,19992,M,2001.0,5,,,,,1951,2017.0,,0
4,100,20342,M,2011.0,5,,,,,1933,2011.0,,0


The guidelines demand us to treat the patients with label = 1 differently than those with label = 0. For the first ones, we should create copies in order to up-sample this class.

In [6]:
# Create a DF that encompasses all the possible events, to identify the latest event for each patient and clear its 6 last months of data
compendium_events = pd.concat([prescriptions_of_non_diabetes_drugs[['id_center','id_ana','date']],
                               prescriptions_of_diabetes_drugs[['id_center','id_ana','date']],
                               patients_diets_and_blood_glucose_controls[['id_center','id_ana','date']],
                               medical_tests[['id_center','id_ana','date']],
                               laboratory_tests[['id_center','id_ana','date']],
                               laboratory_tests_calculated_parameters[['id_center','id_ana','date']],
                               diagnosis_tests[['id_center','id_ana','date']]])

In [7]:
# Identify the date of the latest event, by patient
latest_event = compendium_events.groupby(['id_center','id_ana'])['date'].max().reset_index()
latest_event.head()

Unnamed: 0,id_center,id_ana,date
0,1,5,2013-12-31
1,1,36,2010-06-25
2,1,38,2015-12-18
3,1,61,2019-05-15
4,1,65,2017-11-09


In [8]:
del compendium_events

In [9]:
def remove_last_six_months(df_to_filter, df_last_event):
  '''
  Function used to remove last six months of data, given a dataframe to filter
  and another df containing, for each patient, its corresponding latest day
  when an event took place.
  '''
  # Attaching the latest date of an event for each patient, regardless of the events contained in df_to_filter
  df_to_filter_latest_date = df_to_filter.merge(df_last_event, on = ['id_center','id_ana'] ,how = 'left')

  # Making sure the dtypes are correct
  df_to_filter_latest_date['date_x'] = pd.to_datetime(df_to_filter_latest_date['date_x'])
  df_to_filter_latest_date['date_y'] = pd.to_datetime(df_to_filter_latest_date['date_y'])

  # Creating a mask, 1 for events that took place within the latest 6 months of history, thus they should be dropped. 0 otherwise.
  within_dropping_mask = df_to_filter_latest_date['date_x'] > (df_to_filter_latest_date['date_y'] - pd.DateOffset(months=6))

  # Drop events that took place within the latest 6 months of history of each patient
  df_to_filter_latest_date = df_to_filter_latest_date[~within_dropping_mask] #~

  # Drop the column corresponding to the latest day for the patient.
  df_to_filter_latest_date = df_to_filter_latest_date.drop('date_y', axis = 1)

  return df_to_filter_latest_date

We proceed to remove all events that took place within the last 6 months of history of each patient, by filtering all the tables with the remove_last_six_months. This is done regardless of the patient's label, as the removal of the last six months of data isn't required for just patients with a certain label but for all of them.

In [10]:
names_tables = ["prescriptions_of_non_diabetes_drugs",
                "prescriptions_of_diabetes_drugs",
                "patients_diets_and_blood_glucose_controls",
                "medical_tests",
                "laboratory_tests",
                "laboratory_tests_calculated_parameters",
                "diagnosis_tests"]

tables = [prescriptions_of_non_diabetes_drugs,
    prescriptions_of_diabetes_drugs,
    patients_diets_and_blood_glucose_controls,
    medical_tests,
    laboratory_tests,
    laboratory_tests_calculated_parameters,
    diagnosis_tests,
    active_patients_info
]
shapes_before = [file.shape for file in tables]

In [11]:
# Removing the last six months of history
prescriptions_of_non_diabetes_drugs = remove_last_six_months(prescriptions_of_non_diabetes_drugs, latest_event)
prescriptions_of_diabetes_drugs = remove_last_six_months(prescriptions_of_diabetes_drugs, latest_event)
patients_diets_and_blood_glucose_controls = remove_last_six_months(patients_diets_and_blood_glucose_controls, latest_event)
medical_tests = remove_last_six_months(medical_tests, latest_event)
laboratory_tests = remove_last_six_months(laboratory_tests, latest_event)
laboratory_tests_calculated_parameters = remove_last_six_months(laboratory_tests_calculated_parameters, latest_event)
diagnosis_tests = remove_last_six_months(diagnosis_tests, latest_event)

In [12]:
tables = [prescriptions_of_non_diabetes_drugs,
    prescriptions_of_diabetes_drugs,
    patients_diets_and_blood_glucose_controls,
    medical_tests,
    laboratory_tests,
    laboratory_tests_calculated_parameters,
    diagnosis_tests,
    active_patients_info
]
shapes_after = [file.shape for file in tables]

In [13]:
table_shapes = list(zip(names_tables,shapes_before,shapes_after))
table = tabulate(table_shapes, headers=["Table Name", "Shape before", "Shape after"], tablefmt="grid")
print(table)

+-------------------------------------------+----------------+---------------+
| Table Name                                | Shape before   | Shape after   |
| prescriptions_of_non_diabetes_drugs       | (149422, 7)    | (131316, 7)   |
+-------------------------------------------+----------------+---------------+
| prescriptions_of_diabetes_drugs           | (1980368, 9)   | (1783856, 9)  |
+-------------------------------------------+----------------+---------------+
| patients_diets_and_blood_glucose_controls | (1984358, 7)   | (1753899, 7)  |
+-------------------------------------------+----------------+---------------+
| medical_tests                             | (289176, 7)    | (261702, 7)   |
+-------------------------------------------+----------------+---------------+
| laboratory_tests                          | (7337343, 7)   | (6488795, 7)  |
+-------------------------------------------+----------------+---------------+
| laboratory_tests_calculated_parameters    | (22684

Now, we need to up-sample the patients with label = 1 (i.e those who had a cardiovascular event in the last 6 months).

In [14]:
# Filter only those patients with label = 1
pos_patients = active_patients_info[active_patients_info['label'] == 1]

In [15]:
def replicating_pos_patients(positive_patients_df, m = 1):
  '''
  We change the id_center to 500 and consecutive numbers of it, in order to make the patients unique.
  The number 500 is taken as it is the highest number of id_center encountered in the active patients table
  We take this particular number because we might want to keep track of which patients are copies and
  which are the original patients with a positive label.
  '''
  # m is the number of copies made for each patient
  # We define the list that is going to contain the copies of the patients
  replicated_pos_patients = pd.DataFrame()

  # make m copies
  for i in range(m):
    # copy the patients with label = 1
    replica_patients = positive_patients_df.copy()
    # modify the id_center of all the copies to make them unique patients
    replica_patients['id_replica'] = 501 + i # 500 as it is the max id_center in the table
    # concatenate to the dataframe of copies
    replicated_pos_patients = pd.concat([replicated_pos_patients, replica_patients])


  return replicated_pos_patients

In [16]:
replicas_pos_patients = replicating_pos_patients(pos_patients, 1)

In [17]:
def filter_by_replicas(table_to_filter, replicas_df, perc_removal = 0.05):
  # Set seed
  seed_replica = 50
  #
  merged_df = table_to_filter.merge(replicas_df[['id_center','id_ana', 'id_replica']], how='inner', on=['id_center','id_ana'])
  # Set id_center as id_replica
  merged_df['id_center'] = merged_df['id_replica']
  # Drop the id_replica column
  merged_df = merged_df.drop('id_replica', axis = 1)
  # Renaming with the original names
  merged_df.columns = table_to_filter.columns
  # Shuffling the rows
  merged_df = merged_df.sample(frac=1, random_state=seed_replica).reset_index(drop=True)
  # Dropping some rows
  merged_df = merged_df.drop(merged_df.sample(None, perc_removal, False, random_state=seed_replica).index).reset_index(drop=True)

  return merged_df

See that these data corresponds to those patients with label = 1 that were replicated. This means that this DF DOESN'T contain the **original** patients with label = 1 (those from whom the replicates were made) and also not the patients with label = 0. The data of these patients should be concatenated in the end.

In [18]:
prescriptions_of_non_diabetes_drugs.head()

Unnamed: 0,id_center,id_ana,date_x,amd_code,value,birth_year,death_year
0,1,5,2008-06-20,AMD152,,1942,2014.0
4,1,36,2005-01-17,AMD090,S,1924,2011.0
5,1,36,2006-02-24,AMD086,S,1924,2011.0
6,1,36,2006-02-24,AMD090,S,1924,2011.0
7,1,36,2006-09-04,AMD090,S,1924,2011.0


In [19]:
prescriptions_of_non_diabetes_drugs = pd.concat([prescriptions_of_non_diabetes_drugs,filter_by_replicas(prescriptions_of_non_diabetes_drugs, replicas_pos_patients)])
prescriptions_of_diabetes_drugs = pd.concat([prescriptions_of_diabetes_drugs,filter_by_replicas(prescriptions_of_diabetes_drugs, replicas_pos_patients)])
patients_diets_and_blood_glucose_controls = pd.concat([patients_diets_and_blood_glucose_controls,filter_by_replicas(patients_diets_and_blood_glucose_controls, replicas_pos_patients)])
medical_tests = pd.concat([medical_tests,filter_by_replicas(medical_tests, replicas_pos_patients)])
laboratory_tests = pd.concat([laboratory_tests,filter_by_replicas(laboratory_tests, replicas_pos_patients)])
laboratory_tests_calculated_parameters = pd.concat([laboratory_tests_calculated_parameters,filter_by_replicas(laboratory_tests_calculated_parameters, replicas_pos_patients)])
diagnosis_tests = pd.concat([diagnosis_tests,filter_by_replicas(diagnosis_tests, replicas_pos_patients)])


In [20]:
replicas_pos_patients['id_center'] = replicas_pos_patients['id_replica']
# Drop the id_replica column
replicas_pos_patients = replicas_pos_patients.drop('id_replica', axis = 1)

In [21]:
active_patients_info['label'].value_counts()

0    32370
1    15427
Name: label, dtype: int64

In [22]:
active_patients_info = pd.concat([active_patients_info,replicas_pos_patients])

In [23]:
active_patients_info['label'].value_counts()

0    32370
1    30854
Name: label, dtype: int64

# Task 2.2

## Balancing Strategy #2 (not completely required)

In [None]:
active_patients_info.label.value_counts()

0    32370
1    30854
Name: label, dtype: int64

Balanced but not quite, let's undersample the 0 class.

In [None]:
np.random.seed(346)
# number of label 0 patients to drop
num_to_drop = active_patients_info.label.value_counts()[0] - active_patients_info.label.value_counts()[1]
# randomly select patients
uninteresting_patients = np.random.choice(active_patients_info.loc[active_patients_info.label == 0][['id_center','id_ana']].index, num_to_drop, replace = False)
uninteresting_patients = active_patients_info.loc[uninteresting_patients][['id_center','id_ana']]
# convert to list of tuples
uninteresting_patients = list(zip(*map(uninteresting_patients.get, uninteresting_patients)))

In [None]:
def filter_dataframe(target, uninteresting_patients):
  return target.set_index(['id_center','id_ana']).drop(uninteresting_patients, axis = 0, errors = 'ignore').reset_index()

active_patients_info = filter_dataframe(active_patients_info, uninteresting_patients)
diagnosis_tests = filter_dataframe(diagnosis_tests, uninteresting_patients)
laboratory_tests_calculated_parameters = filter_dataframe(laboratory_tests_calculated_parameters, uninteresting_patients)
laboratory_tests = filter_dataframe(laboratory_tests, uninteresting_patients)
medical_tests = filter_dataframe(medical_tests, uninteresting_patients)
prescriptions_of_diabetes_drugs = filter_dataframe(prescriptions_of_diabetes_drugs, uninteresting_patients)
prescriptions_of_non_diabetes_drugs = filter_dataframe(prescriptions_of_non_diabetes_drugs, uninteresting_patients)
patients_diets_and_blood_glucose_controls = filter_dataframe(patients_diets_and_blood_glucose_controls, uninteresting_patients)

del uninteresting_patients

In [None]:
active_patients_info.label.value_counts()

1    30854
0    30854
Name: label, dtype: int64

That's better! Let's go on.

In [None]:
trajectories = pd.concat(
    [diagnosis_tests[["id_center", "id_ana", "date_x", "amd_code"]].rename({'date_x': 'date', 'amd_code': 'code'}, axis=1),
     laboratory_tests[["id_center", "id_ana", "date_x", "amd_code"]].rename({'date_x': 'date', 'amd_code': 'code'}, axis=1),
     laboratory_tests_calculated_parameters[["id_center", "id_ana", "date_x", "stitch_code"]].rename({'date_x': 'date', 'stitch_code': 'code'}, axis=1),
     medical_tests[["id_center", "id_ana", "date_x", "amd_code"]].rename({'date_x': 'date', 'amd_code': 'code'}, axis=1),
     prescriptions_of_non_diabetes_drugs[["id_center", "id_ana", "date_x", "amd_code"]].rename({'date_x': 'date', 'amd_code': 'code'}, axis=1),
     patients_diets_and_blood_glucose_controls[["id_center", "id_ana", "date_x", "amd_code"]].rename({'date_x': 'date', 'amd_code': 'code'}, axis=1),
     prescriptions_of_diabetes_drugs[["id_center", "id_ana", "date_x", "atc_code"]].rename({'date_x': 'date', 'atc_code': 'code'}, axis=1)])

In [None]:
trajectories.head()

Unnamed: 0,id_center,id_ana,date,code
0,1,5,1980-01-01,AMD247
1,1,5,1986-01-01,AMD247
2,1,5,1987-01-01,AMD083
3,1,5,1987-01-01,AMD247
4,1,5,1997-12-01,AMD247


In [None]:
del diagnosis_tests, laboratory_tests

In [None]:
del laboratory_tests_calculated_parameters, medical_tests, prescriptions_of_non_diabetes_drugs, patients_diets_and_blood_glucose_controls, prescriptions_of_diabetes_drugs

In [None]:
del replicating_pos_patients

#### Adding the age of the patient at the moment of the event.

In [None]:
trajectories = pd.merge(trajectories, active_patients_info[['id_center','id_ana','birth_year','label']], on = ['id_center','id_ana'], how = 'left')

In [None]:
del active_patients_info

In [None]:
trajectories

Unnamed: 0,id_center,id_ana,date,code,birth_year,label
0,1,5,1980-01-01,AMD247,1942,1
1,1,5,1986-01-01,AMD247,1942,1
2,1,5,1987-01-01,AMD083,1942,1
3,1,5,1987-01-01,AMD247,1942,1
4,1,5,1997-12-01,AMD247,1942,1
...,...,...,...,...,...,...
23503856,501,1871,2015-03-16,A10BX02,1935,1
23503857,501,1871,2015-03-16,A10BX02,1946,1
23503858,501,1871,2015-03-16,A10BX02,1947,1
23503859,501,1871,2015-03-16,A10BX02,1928,1


In [None]:
trajectories['birth_year'] = pd.to_datetime(trajectories['birth_year'], format='%Y').dt.year

In [None]:
trajectories['age_at_event'] = trajectories['date'].dt.year - trajectories['birth_year']
trajectories.head()

Unnamed: 0,id_center,id_ana,date,code,birth_year,label,age_at_event
0,1,5,1980-01-01,AMD247,1942,1,38
1,1,5,1986-01-01,AMD247,1942,1,44
2,1,5,1987-01-01,AMD083,1942,1,45
3,1,5,1987-01-01,AMD247,1942,1,45
4,1,5,1997-12-01,AMD247,1942,1,55


In [None]:
trajectories = trajectories.drop('birth_year', axis = 1)
trajectories.head()

Unnamed: 0,id_center,id_ana,date,code,label,age_at_event
0,1,5,1980-01-01,AMD247,1,38
1,1,5,1986-01-01,AMD247,1,44
2,1,5,1987-01-01,AMD083,1,45
3,1,5,1987-01-01,AMD247,1,45
4,1,5,1997-12-01,AMD247,1,55


# LSTM

In [None]:
def easy_LSTM(trajectories_data, epochs_i = 5, batch_size_i = 128):
  # Encoding the code variable, which should be treated as categorical
  label_encoder = LabelEncoder()
  trajectories_data['code'] = label_encoder.fit_transform(trajectories_data['code'])

  # Splitting the trajectories_data in features and label to predict
  X = trajectories_data[['id_center', 'id_ana', 'code']]
  y = trajectories_data['label']

  # Splitting trajectories_data into training and testing trajectories_datasets. 20% of trajectories_data to test.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

  # Defining the very simple LSTM model
  model = Sequential()
  model.add(LSTM(units=64, input_shape=(X_train.shape[1], 1)))
  model.add(Dense(units=1, activation='sigmoid')) # as it is a dummy

  # Compiling the model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  # Training the model
  model.fit(X_train, y_train, epochs = epochs_i, batch_size= batch_size_i)

  # Evaluating the model
  loss, accuracy = model.evaluate(X_test, y_test)
  print(f'Test Loss: {loss:.4f}')
  print(f'Test Accuracy: {accuracy:.4f}')
  return loss,accuracy

In [None]:
def medium_LSTM(trajectories_data, epochs_i = 5, batch_size_i = 128):
  # Encoding the code variable, which should be treated as categorical
  label_encoder = LabelEncoder()
  trajectories_data['code'] = label_encoder.fit_transform(trajectories_data['code'])

  # Splitting the trajectories_data in features and label to predict
  X = trajectories_data[['id_center', 'id_ana', 'code']]
  y = trajectories_data['label']

  # Splitting trajectories_data into training and testing trajectories_datasets. 20% of trajectories_data to test.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

  # Scaling the features (gradient convergence)
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  # Reshaping to avoid errors after the scaling
  X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
  X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

  # Defining the medium-level complexity model
  model = Sequential()
  model.add(LSTM(units=128, return_sequences=True, input_shape=(X_train.shape[1], 1)))
  model.add(Dropout(0.2))
  model.add(LSTM(units=64))
  model.add(Dropout(0.2))
  model.add(Dense(units=1, activation='sigmoid'))

  # Compiling the model
  optimizer = Adam(learning_rate=0.01) #adjusting learning rate for faster but bouncier convergence
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

  # Defining the early stop phase with a patience of 3 epochs
  early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

  # Training the model
  model.fit(X_train, y_train, epochs=epochs_i, batch_size=batch_size_i, validation_data=(X_test, y_test), callbacks=[early_stopping])

  # Evaluating the model
  loss, accuracy = model.evaluate(X_test, y_test)
  print(f'Test Loss: {loss:.4f}')
  print(f'Test Accuracy: {accuracy:.4f}')
  return loss, accuracy

In [None]:
def complex_LSTM(trajectories_data, epochs_i=5, batch_size_i=128):
    label_encoder = LabelEncoder()
    trajectories_data['code'] = label_encoder.fit_transform(trajectories_data['code'])

    # Splitting the trajectories_data into features and label to predict
    X = trajectories_data[['id_center', 'id_ana', 'code']]
    y = trajectories_data['label']

    # Splitting the trajectories_data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

    del X
    del y
    # Feature scaling (better convergence!)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Padding the input sequences
    max_sequence_length = max(X_train.shape[0], X_test.shape[0])
    X_train_padded = np.zeros((max_sequence_length, X_train.shape[1]))
    X_test_padded = np.zeros((max_sequence_length, X_test.shape[1]))

    X_train_padded[:X_train.shape[0], :] = X_train
    X_test_padded[:X_test.shape[0], :] = X_test

    # Reshaping the input trajectories_data
    X_train_padded = X_train_padded[:, :, np.newaxis]
    X_test_padded = X_test_padded[:, :, np.newaxis]

    # Defining the LSTM model (using bidirectional, without attention)
    model = Sequential()
    model.add(Bidirectional(LSTM(units=128, return_sequences=True), input_shape=(X_train_padded.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Dense(units=1, activation='sigmoid'))

    # Compiling
    optimizer = Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Defining the early stop phase with a patience of 3 epochs (not gonna be used)
    early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

    # Training
    model.fit(X_train_padded[:X_train.shape[0]], y_train, epochs=epochs_i, batch_size=batch_size_i, validation_split=0.2, callbacks=[early_stopping])

    # Evaluating...
    loss, accuracy = model.evaluate(X_test_padded[:X_test.shape[0]], y_test)
    print(f'Test Loss: {loss:.4f}')
    print(f'Test Accuracy: {accuracy:.4f}')

    return loss, accuracy

In [None]:
def LSTM_f(trajectories_data, version_to_run = 'simple', n_epochs = 5):
  if version_to_run == 'simple':
    loss, accuracy = easy_LSTM(trajectories_data, epochs_i = n_epochs)
  elif version_to_run == 'medium':
    loss, accuracy = medium_LSTM(trajectories_data, epochs_i = n_epochs)
  else:
    loss, accuracy = complex_LSTM(trajectories_data, epochs_i = n_epochs)

  return loss,accuracy

In [None]:
x, y = LSTM_f(trajectories_data = trajectories, version_to_run = 'complex', n_epochs = 1)

Test Loss: 0.3547
Test Accuracy: 0.8201


# T-LSTM

In [None]:
def t_lstm(trajectories_data, epochs_i = 5, batch_size_i = 128):
  label_encoder = LabelEncoder()
  trajectories_data['code'] = label_encoder.fit_transform(trajectories_data['code'])

  # need to give T-LSTM the days that have passed for each event since the mionimum date of each patient
  # this as it expects only numerical.

  # guarantee variable is treated as date
  # Preprocess the trajectories_x
  trajectories_data['date'] = pd.to_datetime(trajectories_data['date'])

  # computing for each patient the number of days that have passed for each event from its minimum date
  trajectories_data['days_since_min_date'] = trajectories_data.groupby(['id_center', 'id_ana'])['date'].transform(lambda x: (x - x.min()).dt.days)

  # splitting data into features and variable to predict
  X = trajectories_data.drop(['label', 'date'], axis=1).values.astype(float)
  y = trajectories_data['label'].values

  # splitting into train and test data (20% for testing)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

  # need to reshape. set to 1 as each sample must be treated as single time step-
  # this is the key to capture the temporal dependencies between events. Requirement of T-LSTM
  X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
  X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

  # replicating the architecture of the simplest LSTM we implemented
  model = Sequential()
  model.add(LSTM(units=64, input_shape=(1, X_train.shape[2])))
  model.add(Dense(units=1, activation='sigmoid')) # as it is a dummy

  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # Training
  model.fit(X_train, y_train, epochs=epochs_i, batch_size=batch_size_i)

  # Evaluating...
  loss, accuracy = model.evaluate(X_test, y_test)
  print(f'Test Loss: {loss:.4f}')
  print(f'Test Accuracy: {accuracy:.4f}')

  return loss, accuracy

In [None]:
x, y = t_lstm(trajectories_data = trajectories,epochs_i = 1)

Test Loss: 0.3677
Test Accuracy: 0.8128


# PubMedBERT

In [None]:
# Filtering just some events as it is incredibly time consuming
trajectories_x = trajectories[1:10000]

In [None]:
#!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
def embeddings_BERT(input_data):
  # Loading the pre-trained PubMedBERT model
  model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
  #Initializing the tokenizer that we're gonna use
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Defining the model based on the pretrained model
  model = AutoModel.from_pretrained(model_name)

  # Creating the string representation of each event to be used in the upcoming processing
  patient_records = input_data["id_center"].astype(str) + " " + \
    input_data["id_ana"].astype(str) + " " + \
    input_data["date"].astype(str) + " " + \
    input_data["code"].astype(str)

  # Create the embeddings!
  embeddings_l = []
  for idx, record in enumerate(patient_records):
    # tokeninzing the records and adding special tokens at beginning and end
    inputs = tokenizer.encode_plus(record, add_special_tokens=True, return_tensors="pt")
    # unpacks and passes input to the model 1 at a time
    outputs = model(**inputs)
    # reducing to a single vector the info captured from an aggregation of the input tokens (average pooling)
    record_embedding = outputs.last_hidden_state.squeeze(dim=0).mean(dim=0)
    # storing the embeddings in a list so we can have access to them to use them as input to train a model
    embeddings_l.append(record_embedding.detach().numpy())
    print(f"Patient {idx+1}")
  return embeddings_l

In [None]:
embeddings = embeddings_BERT(trajectories_x)

In [None]:
# Combinining embeddings with labels
data_with_embeddings = pd.concat([trajectories_x, pd.DataFrame(embeddings)], axis=1)

In [None]:
# We write the embeddings
#data_with_embeddings.to_csv("patient_data_with_embeddings.csv", index=False)

# Training Models based on the embeddings

In [None]:
data_with_embeddings = pd.read_csv("patient_data_with_embeddings.csv")

## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

In [None]:
def logistic_regression_with_embeddings(data_plus_embeddings):
  # isolating the embeddings
  embeddings = data_plus_embeddings.iloc[:, 7:].values
  # isolating the label
  labels = data_plus_embeddings["label"].values
  # imputing missing values both in the target variable as in the late pipeline
  nan_indices = np.isnan(labels)
  if np.any(nan_indices):
    imputer = SimpleImputer(strategy="most_frequent")
    labels = imputer.fit_transform(labels.reshape(-1, 1)).flatten()
  # 20% for test
  X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=50)
  # creating the pipeline and imputing here. Defining the pipeline as a logistic regression
  pipeline = make_pipeline(SimpleImputer(strategy="mean"), LogisticRegression())
  # training the just defined pipeline
  pipeline.fit(X_train, y_train)
  # using the trained pipeline to make predictions
  y_pred = pipeline.predict(X_test)

  return y_test, y_pred

In [None]:
y_test, y_pred = logistic_regression_with_embeddings(data_with_embeddings)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.942


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Using the Embeddings but running a more complex model to predict

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.impute import SimpleImputer

data_with_embeddings = pd.read_csv("patient_data_with_embeddings.csv")

embeddings = data_with_embeddings.iloc[:, 7:].values
labels = data_with_embeddings["label"].values

# Impute missing values both in the embeddings and in the variable to predict
imputer = SimpleImputer(strategy="mean")
embeddings = imputer.fit_transform(embeddings)
labels = imputer.fit_transform(labels.reshape(-1, 1)).flatten()

X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

# Very simple NN
model = Sequential()
model.add(Dense(64, activation="relu", input_shape=(embeddings.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1, activation="sigmoid"))


model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])


model.fit(X_train, y_train, batch_size=128, epochs=20, validation_data=(X_test, y_test))


_, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.9649999737739563
