In [1]:
# Import the necessary libraries
import numpy as np
from pandas import read_csv
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import pickle
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Helper functions

In [2]:
# Function to determine unique values in a dataframe's column based on the column's name.
def unique_value_function(df, feature_name):
    """Return the sorted unique values of one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    feature_name : hashable
        Name of the column whose distinct values are wanted.

    Returns
    -------
    numpy.ndarray or int
        1-D array of unique values (as produced by ``np.unique``), or ``-1``
        after printing a message when the column does not exist.
    """
    # Guard clause: unknown column names are reported, not raised.
    if feature_name not in df.columns:
        print(str(feature_name)+' is not a feature name in this dataframe.')
        return -1

    # np.unique flattens the (n, 1) array and returns sorted distinct values.
    return np.unique(df[[feature_name]].values)

In [3]:
# Function to check that the longitudinal dataset has all necessary features and has been preprocessed.
def check_longitudinal_dataset(df):
    """Validate the longitudinal dataframe before any processing.

    The checks run in order and the first failure prints a message and
    returns ``-1``:
    non-empty, no missing values, columns RID/VISCODE/DX present, numeric RID,
    DX containing exactly the two values 'Dementia' and 'MCI', at least one
    extra feature column, and every extra feature column numeric.

    Returns
    -------
    int
        ``1`` when every check passes, ``-1`` otherwise.
    """
    required = ('RID', 'VISCODE', 'DX')

    # An empty frame cannot be processed at all.
    if df.empty:
        print('Dataset is empty.')
        return -1

    # Any NaN anywhere means the data still needs preprocessing.
    if df.isnull().any().any():
        print('Dataset has NAN values and needs to preprocess.')
        return -1

    # The three key columns must all exist.
    column_names = df.columns
    if not all(name in column_names for name in required):
        print('Dataset does not have necessary feature/s')
        return -1

    # Patient identifiers have to be numeric.
    if not is_numeric_dtype(df['RID']):
        print('Patient ID should be numeric.')
        return -1

    # Exactly two diagnoses are accepted: Dementia and MCI.
    diagnoses = unique_value_function(df, 'DX')
    if not (len(diagnoses) == 2 and 'Dementia' in diagnoses and 'MCI' in diagnoses):
        print('Dataset does not have correct diagnosis or unique number of diagnosis.')
        return -1

    # There must be something besides RID, VISCODE and DX.
    if df.shape[1] <= 3:
        print('Dataset does not have enough features')
        return -1

    # Every remaining (longitudinal) column must be numeric.
    has_non_numeric = any(
        not is_numeric_dtype(df[name])
        for name in column_names
        if name not in required
    )
    if has_non_numeric:
        print('Data should be numeric.')
        return -1

    # All checks passed: the dataset is ready.
    return 1

In [4]:
# Function to check that the demographic dataset has the necessary features and has been preprocessed.
def check_demographic_dataset(df):
    """Validate the demographic dataframe before any processing.

    Checks, in order: non-empty, no missing values, an 'RID' column that is
    numeric, and at least one column besides 'RID'. The first failing check
    prints a message and returns ``-1``.

    Returns
    -------
    int
        ``1`` when the frame is usable, ``-1`` otherwise.
    """
    # Nothing to validate in an empty frame.
    if df.empty:
        print('Dataset is empty.')
        return -1

    # Missing values must be handled before this pipeline runs.
    if df.isnull().any().any():
        print('Dataset has NAN values and needs to preprocess.')
        return -1

    # The patient identifier column is mandatory.
    if 'RID' not in df.columns:
        print('Dataset does not have necessary feature/s')
        return -1

    # ... and it has to be numeric.
    if not is_numeric_dtype(df['RID']):
        print('Patient ID should be numeric.')
        return -1

    # At least one demographic feature is needed next to RID.
    if df.shape[1] <= 1:
        print('Dataset does not have enough features')
        return -1

    # All checks passed: the dataset is ready.
    return 1

In [5]:
# Function to prepare longitudinal data (VISCODE)
def visit_code_preperation(df):
    """Renumber every patient's visits as 0, 6, 12, ... in row order.

    Rows are grouped by 'RID' (patients in ascending RID order, each patient's
    rows in their original relative order) and the 'VISCODE' of the j-th row
    of a patient is replaced by ``j * 6``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Longitudinal data with at least the columns 'RID' and 'VISCODE'.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, a fresh 0..n-1 index and the
        rewritten 'VISCODE' column.
    """
    # A stable sort keeps each patient's visits in their original order while
    # bringing patients into ascending RID order.
    new_df = df.sort_values('RID', kind='mergesort').reset_index(drop=True)

    # cumcount() numbers the rows of each patient 0, 1, 2, ...; one vectorised
    # assignment replaces the former per-row DataFrame growth.
    new_df['VISCODE'] = new_df.groupby('RID').cumcount() * 6

    # Cast to object dtype so that later cell-by-cell writes of floats (the
    # min-max normalisation writes one cell at a time) never land in an
    # integer-typed column.
    return new_df.astype(object)

In [6]:
def num_patients_visits_function(df, _list):
    """Print how many of the given patients have 1, 2, ..., k visits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per visit and an 'RID' column.
    _list : sequence
        Patient ids to summarise; an id listed twice is counted twice.

    Notes
    -----
    Ids that have no rows in ``df`` are skipped. Previously such an id had
    size 0 and was counted at index ``-1``, i.e. in the largest visit bucket.
    Nothing is printed when no listed patient has any visit.
    """
    # One pass over the frame instead of re-filtering it twice per patient.
    visits_per_patient = df['RID'].value_counts().to_dict()

    # Keep only patients that actually appear in the frame.
    sizes = [visits_per_patient.get(rid, 0) for rid in _list]
    sizes = [int(size) for size in sizes if size > 0]
    if not sizes:
        return

    # histogram[k-1] = number of patients with exactly k visits.
    histogram = np.zeros(max(sizes), int)
    for size in sizes:
        histogram[size - 1] += 1

    for i in range(len(histogram)):
        print (i+1,'_Visit = ', histogram[i])

In [7]:
# Encoding Diagnosis DX (MCI = 0 and Dementia = 1)
def encode_diagnosis(df):
    """Encode the 'DX' column in place: 'MCI' -> 0, anything else -> 1.

    The whole column is rewritten in one step, so the frame may carry any
    index. The previous per-row ``df.loc[i, 'DX']`` loop only worked when the
    index labels were exactly 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'DX' column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    Every value different from the string 'MCI' becomes 1, so calling this on
    an already encoded column turns all entries into 1.
    """
    df['DX'] = np.where(df['DX'] == 'MCI', 0, 1)
    return df

In [8]:
# Normalize longitudinal data using min-max normalization
def min_max_normalization(df):
    """Min-max scale every feature column and reorder the columns.

    The returned frame has the column order
    ``['RID', 'VISCODE', <features in original order>, 'DX']``; each feature
    column is rescaled to ``(x - min) / (max - min)``. The input frame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'RID', 'VISCODE', 'DX' and the numeric features.

    Returns
    -------
    pandas.DataFrame
        Reordered copy with float feature columns.

    Notes
    -----
    A constant column (max == min) is mapped to 0.0 instead of dividing by
    zero.
    """
    key_columns = ('RID', 'VISCODE', 'DX')

    # Feature columns in their original order, each name only once.
    feature_columns = []
    for col in df.columns:
        if col not in key_columns and col not in feature_columns:
            feature_columns.append(col)

    # Explicit copy: the scaled values must not be written into a view of the
    # caller's frame.
    df = df[['RID', 'VISCODE'] + feature_columns + ['DX']].copy()

    for col in feature_columns:
        values = df[col].astype(float)
        min_value = values.min()
        value_range = values.max() - min_value
        if value_range == 0:
            # Constant feature: nothing to scale.
            df[col] = 0.0
        else:
            df[col] = (values - min_value) / value_range

    return df

In [9]:
# Function to group patients together based on number of visits they have
def group_patients_according_number_of_visits(df):
    """Split the frame into one sub-frame per "number of visits".

    Parameters
    ----------
    df : pandas.DataFrame
        Longitudinal data with one row per visit and an 'RID' column.

    Returns
    -------
    dict[int, pandas.DataFrame]
        Keys are visit counts in ascending order. Each value holds all rows of
        the patients with exactly that many rows, patients in ascending RID
        order, each patient's rows in their original order, with a fresh
        0..n-1 index.
    """
    if df.empty:
        return {}

    # Number of rows of the owning patient, repeated on every row.
    visits_per_row = df.groupby('RID')['RID'].transform('size')

    grouped = {}
    for size in sorted(visits_per_row.unique()):
        members = df[visits_per_row == size]
        # A stable sort groups each patient's rows together without changing
        # their relative order.
        grouped[int(size)] = (
            members.sort_values('RID', kind='mergesort').reset_index(drop=True)
        )
    return grouped

In [10]:
# Function to transpose the longitudinal dataset
def transpose_longitudinal_data(group_longitudinal_data_dic, features_to_be_in_columns):
    """Turn each long-format group into one row per patient (wide format).

    Parameters
    ----------
    group_longitudinal_data_dic : dict[int, pandas.DataFrame]
        Maps a number of visits to the long-format rows ('RID', 'VISCODE',
        features...) of the patients having that many visits.
    features_to_be_in_columns : list
        Column names to spread over the visits (passed as ``values`` to
        ``DataFrame.pivot``).

    Returns
    -------
    dict[int, pandas.DataFrame]
        Same keys. Each frame has the column 'RID' followed by
        ``<feature>_<viscode>`` columns ordered visit by visit (all features
        of the first visit, then all features of the second, ...), one row
        per patient in the pivot's RID order, with a 0..n-1 index.
    """
    final_longitudinal_data_dic = {}

    for key, long_frame in group_longitudinal_data_dic.items():
        time_points = key
        wide = long_frame.pivot(index='RID', columns='VISCODE',
                                values=features_to_be_in_columns)

        # pivot lays the columns out feature-major: (f0,t0), (f0,t1), ...,
        # (f1,t0), ... Build the positions that read them visit-major.
        features_per_visit = len(wide.columns) // time_points
        visit_major_positions = [
            visit + time_points * feature
            for visit in range(time_points)
            for feature in range(features_per_visit)
        ]

        # Flatten the (feature, viscode) column labels to "feature_viscode".
        flat_names = [
            wide.columns[position][0] + '_' + str(wide.columns[position][1])
            for position in visit_major_positions
        ]

        # One positional reorder replaces the former row-by-row copying.
        flat = wide.iloc[:, visit_major_positions].copy()
        flat.columns = flat_names
        flat.insert(0, 'RID', flat.index)
        final_longitudinal_data_dic[key] = flat.reset_index(drop=True)

    return final_longitudinal_data_dic

In [11]:
# Apply one-hot-encoding only for categorical demographics features
def demographic_one_hot_encoding(demographic_df, numeric_columns=('PTEDUCAT',)):
    """One-hot encode the categorical demographic columns.

    Parameters
    ----------
    demographic_df : pandas.DataFrame
        Demographic data containing 'RID'.
    numeric_columns : iterable of str, optional
        Columns that are kept as they are (besides 'RID'). Defaults to
        ``('PTEDUCAT',)``, the only column treated as numeric before.

    Returns
    -------
    pandas.DataFrame
        'RID' and the numeric columns in their original order, followed by
        the dummy columns ``<column>_<value>`` of every other column, in the
        original column order.

    Notes
    -----
    The former trailing "keep" step selected columns by substring matching,
    which changed nothing in the usual case but duplicated a dummy column
    whenever another column name was contained in its name (for example
    'G_1' inside 'G_10'). It has been removed.
    """
    passthrough = {'RID'}
    passthrough.update(numeric_columns)

    kept_columns = [col for col in demographic_df.columns if col in passthrough]
    categorical_columns = [col for col in demographic_df.columns if col not in passthrough]

    # Untouched columns first, then one block of dummies per categorical column.
    pieces = [demographic_df[kept_columns]]
    for col in categorical_columns:
        pieces.append(pd.get_dummies(demographic_df[col], prefix=col))

    return pd.concat(pieces, axis=1)

In [12]:
# Function to split the longitudinal data into training and test data 70% and 30% respectively
def split_longitudinal_data(longitudinal_data_dic):
    """Split every visit-count group into 70% training and 30% test rows.

    Parameters
    ----------
    longitudinal_data_dic : dict[int, pandas.DataFrame]
        One wide-format frame (one row per patient) per number of visits.

    Returns
    -------
    (dict, dict)
        ``train_data`` and ``test_data`` with the same keys as the input.

    Notes
    -----
    ``train_test_split`` is called with ``random_state=42``, so the split is
    reproducible. A group with fewer than two rows cannot be split (sklearn
    raises because the training part would be empty); such a group goes
    entirely to the training data and gets an empty test frame with the same
    columns.
    """
    train_data = {}
    test_data = {}

    for key, frame in longitudinal_data_dic.items():
        if len(frame) < 2:
            # Too small to split: keep it for training, leave the test side empty.
            train_data[key] = frame
            test_data[key] = frame.iloc[0:0]
            continue

        train_data[key], test_data[key] = train_test_split(
            frame, test_size=0.3, random_state=42)

    return train_data, test_data

In [13]:
# Global variables (defaults; pkl_files_creator overwrites them at run time)
num_features_in_each_time_step, time_steps, demographic_features = 1, 2, 1

# Training data lists: longitudinal sequences, demographics, labels
dataset, demographic_train, target_1 = [], [], []

# Test data lists: longitudinal sequences, demographics, labels
Testset, demographic_test, target_2 = [], [], []

In [14]:
# Function to create training lists (longitudinal, demographics, label)
def create_train_lists(longitudinal_df, demographic_df, tp, ftp):
    """Append the training samples of one visit-count group to the global lists.

    For every row (patient) of ``longitudinal_df`` this appends
    * to ``dataset``: the feature values of the visits used as input,
    * to ``demographic_train``: that patient's demographic values,
    * to ``target_1``: the diagnoses of the ``ftp`` visits to predict.

    Parameters
    ----------
    longitudinal_df : pandas.DataFrame
        Wide-format frame: 'RID' first, then per visit the features and a
        'DX_<viscode>' column.
    demographic_df : pandas.DataFrame
        Demographic data; its first column is skipped (expected to be 'RID').
    tp : int
        Number of visits to use as model input.
    ftp : int
        Number of future visits to predict.

    Notes
    -----
    Groups with fewer than ``ftp + 1`` visits are skipped. Groups with at
    least ``tp + ftp`` visits use the first ``tp`` visits as input and the next
    ``ftp`` diagnoses as labels; shorter groups use everything except the last
    ``ftp`` visits as input and the last ``ftp`` diagnoses as labels.

    Demographics are looked up by each row's own RID. The previous version
    built them in sorted-RID order and read them by row position, which pairs
    samples with the wrong patient whenever the rows are not sorted by RID
    (for example after a shuffled train/test split).

    Raises
    ------
    KeyError
        If a patient has no row in ``demographic_df``.
    """
    global dataset
    global demographic_train
    global target_1

    num_feature_in_tp = num_features_in_each_time_step

    # Number of visits in this frame: every visit contributes its features
    # plus one DX column, and the first column is RID.
    visits_available = (longitudinal_df.shape[1] - 1) / (num_feature_in_tp + 1)
    if visits_available < ftp + 1:
        # Not even one input visit next to the visits to predict.
        return

    # First demographic record per patient, without the leading id column.
    first_records = demographic_df.drop_duplicates(subset='RID', keep='first')
    demographics_by_rid = dict(
        zip(first_records['RID'], first_records.iloc[:, 1:].values.tolist()))

    is_diagnosis = np.array(['DX_' in name for name in longitudinal_df.columns], dtype=bool)
    Features = longitudinal_df.loc[:, ~is_diagnosis]
    Labels = longitudinal_df.loc[:, is_diagnosis]

    if visits_available >= tp + ftp:
        # Enough visits: first tp visits in, the following ftp diagnoses out.
        feature_stop = tp * num_feature_in_tp + 1
        label_slice = slice(tp, tp + ftp)
    else:
        # Fewer visits: use all but the last ftp visits as input.
        feature_stop = Features.shape[1] - (ftp * num_feature_in_tp)
        label_slice = slice(Labels.shape[1] - ftp, None)

    patient_ids = longitudinal_df['RID'].tolist()
    for i in range(len(longitudinal_df)):
        rid = patient_ids[i]
        if rid not in demographics_by_rid:
            raise KeyError('No demographic record for RID ' + str(rid))

        # Column 0 of Features is RID, hence the slice starts at 1.
        dataset.append(list(Features.iloc[i, 1:feature_stop]))
        demographic_train.append(list(demographics_by_rid[rid]))
        target_1.append(list(Labels.iloc[i, label_slice]))

In [15]:
# Function to create test lists (longitudinal, demographics, label)
def create_test_lists(longitudinal_df, demographic_df, tp, ftp):
    """Append the test samples of one visit-count group to the global lists.

    For every row (patient) of ``longitudinal_df`` this appends
    * to ``Testset``: the feature values of the first ``tp`` visits,
    * to ``demographic_test``: that patient's demographic values,
    * to ``target_2``: the diagnoses of the following ``ftp`` visits.

    Parameters
    ----------
    longitudinal_df : pandas.DataFrame
        Wide-format frame: 'RID' first, then per visit the features and a
        'DX_<viscode>' column.
    demographic_df : pandas.DataFrame
        Demographic data; its first column is skipped (expected to be 'RID').
    tp : int
        Number of visits to use as model input.
    ftp : int
        Number of future visits to predict.

    Notes
    -----
    Only groups with at least ``tp + ftp`` visits contribute test samples.

    Demographics are looked up by each row's own RID. The previous version
    built them in sorted-RID order and read them by row position, which pairs
    samples with the wrong patient whenever the rows are not sorted by RID
    (for example after a shuffled train/test split).

    Raises
    ------
    KeyError
        If a patient has no row in ``demographic_df``.
    """
    global Testset
    global target_2
    global demographic_test

    num_feature_in_tp = num_features_in_each_time_step

    # Number of visits in this frame: every visit contributes its features
    # plus one DX column, and the first column is RID.
    visits_available = (longitudinal_df.shape[1] - 1) / (num_feature_in_tp + 1)
    if visits_available < tp + ftp:
        # The frame must hold tp input visits and ftp visits to predict.
        return

    # First demographic record per patient, without the leading id column.
    first_records = demographic_df.drop_duplicates(subset='RID', keep='first')
    demographics_by_rid = dict(
        zip(first_records['RID'], first_records.iloc[:, 1:].values.tolist()))

    is_diagnosis = np.array(['DX_' in name for name in longitudinal_df.columns], dtype=bool)
    Features = longitudinal_df.loc[:, ~is_diagnosis]
    Labels = longitudinal_df.loc[:, is_diagnosis]

    patient_ids = longitudinal_df['RID'].tolist()
    for i in range(len(longitudinal_df)):
        rid = patient_ids[i]
        if rid not in demographics_by_rid:
            raise KeyError('No demographic record for RID ' + str(rid))

        # Column 0 of Features is RID, hence the slice starts at 1.
        Testset.append(list(Features.iloc[i, 1:tp * num_feature_in_tp + 1]))
        demographic_test.append(list(demographics_by_rid[rid]))
        target_2.append(list(Labels.iloc[i, tp:tp + ftp]))
In [16]:
# Function to create the train dataset
def create_dataset_train(train_data_list, ts, fts, demographic_df):
    """Build the padded, LSTM-shaped training arrays.

    Resets the global training lists, fills them through
    ``create_train_lists`` for every frame in ``train_data_list``, pads the
    sequences and labels with -1 and reshapes them.

    Parameters
    ----------
    train_data_list : sequence of pandas.DataFrame
        Wide-format training frames, one per visit-count group.
    ts : int
        Number of visits used as model input.
    fts : int
        Number of future visits to predict.
    demographic_df : pandas.DataFrame
        Encoded demographic data.

    Returns
    -------
    tuple
        ``(dataset, target_1, demographic_train)`` where ``dataset`` is
        reshaped to (samples, time_steps, features per step), ``target_1`` to
        (samples, labels, 1), and ``demographic_train`` is a list of lists.

    Notes
    -----
    The global ``time_steps`` is first set to ``ts`` and afterwards replaced by
    the padded sequence length divided by the features per time step.
    """
    global time_steps
    global demographic_features
    global dataset
    global demographic_train
    global target_1

    # Start from clean global lists on every call.
    dataset, demographic_train, target_1 = [], [], []
    time_steps = ts

    # Collect the samples of every group into the global lists.
    for position in range(len(train_data_list)):
        create_train_lists(train_data_list[position], demographic_df, time_steps, fts)

    # Pad the input sequences at the end with -1.
    padded_inputs = pad_sequences(dataset, padding='post',dtype='float', value=-1)
    sample_count = len(padded_inputs)
    time_steps = int(padded_inputs.shape[1] / num_features_in_each_time_step)
    dataset = padded_inputs

    # Pad the label sequences the same way.
    target_1 = pad_sequences(target_1, padding='post',dtype='float', value=-1)
    label_count = target_1.shape[1]

    # Reshape into the 3D format expected by LSTMs: [samples, timesteps, features].
    dataset = np.reshape(dataset, (sample_count, time_steps, num_features_in_each_time_step))
    target_1 = np.reshape(target_1, (sample_count, label_count, 1))

    return dataset, target_1, demographic_train

In [17]:
# Function to create test dataset
def create_dataset_test(test_data_list, ts, fts, demographic_df):
    """Build the padded, LSTM-shaped test arrays.

    Resets the global test lists, fills them through ``create_test_lists`` for
    every frame in ``test_data_list``, pads the sequences with -1 and reshapes
    inputs and labels.

    Parameters
    ----------
    test_data_list : sequence of pandas.DataFrame
        Wide-format test frames, one per visit-count group.
    ts : int
        Number of visits used as model input.
    fts : int
        Number of future visits to predict.
    demographic_df : pandas.DataFrame
        Encoded demographic data.

    Returns
    -------
    tuple
        ``(Testset, target_2, demographic_test)`` where ``Testset`` is reshaped
        to (samples, ts, features per step), ``target_2`` to (samples, fts, 1),
        and ``demographic_test`` is a list of lists.
    """
    global time_steps
    global demographic_features
    global Testset
    global target_2
    global demographic_test
    global target_2_prev

    # Start from clean global lists on every call.
    Testset, target_2, demographic_test, target_2_prev = [], [], [], []
    time_steps = ts

    # Collect the samples of every group into the global lists.
    for position in range(len(test_data_list)):
        create_test_lists(test_data_list[position], demographic_df, time_steps, fts)

    # Pad the input sequences at the end with -1.
    Testset = pad_sequences(Testset, padding='post',dtype='float', value=-1)
    sample_count = len(Testset)
    target_2 = np.array(target_2)

    # Reshape into the 3D format expected by LSTMs: [samples, timesteps, features].
    Testset = np.reshape(Testset, (sample_count, time_steps, num_features_in_each_time_step))
    target_2 = np.reshape(target_2, (sample_count, fts, 1))

    return Testset, target_2, demographic_test

In [20]:
# Main function
def pkl_files_creator(longitudinal_file='longitudinal_data.csv',
                      demographic_file='demographic_data.csv'):
    """Run the whole preparation pipeline and write the six pkl files.

    Steps: read both CSV files, validate them, renumber the visits, encode the
    diagnosis, normalise the features, group patients by number of visits,
    transpose to wide format, split into train/test, one-hot encode the
    demographics, ask the user for the number of input and future visits, build
    the train/test arrays and pickle them.

    Parameters
    ----------
    longitudinal_file : str, optional
        CSV file with the longitudinal data.
    demographic_file : str, optional
        CSV file with the demographic data.

    Returns
    -------
    int
        ``0`` on success, ``-1`` when a validation step fails (a message is
        printed in that case).

    Notes
    -----
    Sets the globals ``num_features_in_each_time_step``, ``time_steps`` and
    ``demographic_features``. Each output file holds a one-element list
    wrapping the corresponding array/list. Output files are written to the
    current working directory.
    """
    global num_features_in_each_time_step
    global time_steps
    global demographic_features

    def _ask_for_count(prompt):
        # Re-prompt until the answer consists of decimal digits only.
        # isdecimal() is used because int() rejects some strings that
        # isdigit() accepts (e.g. superscript digits).
        answer = input(prompt)
        while not answer.isdecimal():
            answer = input("Please enter integer values:\n")
        return int(answer)

    # Read csv files for longitudinal and demographic data
    longitudinal_df = read_csv(longitudinal_file, header=0)
    demographic_df = read_csv(demographic_file, header=0)

    # Validate both inputs before doing any work
    if check_longitudinal_dataset(longitudinal_df) == -1:
        return -1
    if check_demographic_dataset(demographic_df) == -1:
        return -1

    # Working on longitudinal data
    longitudinal_df = visit_code_preperation(longitudinal_df)
    longitudinal_df = encode_diagnosis(longitudinal_df)
    longitudinal_df = min_max_normalization(longitudinal_df)
    longitudinal_df_dic = group_patients_according_number_of_visits(longitudinal_df)

    # Everything after RID and VISCODE is spread over the visits. The groups
    # are row subsets of longitudinal_df, so they share its columns.
    features_to_be_in_columns = list(longitudinal_df.columns)[2:]
    longitudinal_df_dic = transpose_longitudinal_data(longitudinal_df_dic, features_to_be_in_columns)
    longitudinal_train_data, longitudinal_test_data = split_longitudinal_data(longitudinal_df_dic)

    # Working on demographic data
    demographic_df = demographic_one_hot_encoding(demographic_df)

    # User choices
    number_of_training_visits = _ask_for_count(
        "Please enter number of visits that you want to use for training the model:\n")
    number_of_future_visits = _ask_for_count(
        "Please enter number of future visits that you want to predict:\n")

    # Smallest / largest number of visits any patient has; computed explicitly
    # instead of relying on the dictionary's key order.
    minimum_visit = min(longitudinal_df_dic)
    maximum_visit = max(longitudinal_df_dic)
    if (number_of_future_visits + number_of_training_visits) > maximum_visit:
        print('The Dataset does not have enough visits for this selection')
        return -1
    if number_of_training_visits < minimum_visit or number_of_future_visits < 1:
        print('Wrong selection')
        return -1

    # Publish the shapes the list/array builders read from the globals:
    # all columns except RID, VISCODE and DX are per-visit features.
    num_features_in_each_time_step = longitudinal_df.shape[1] - 3
    time_steps = 0
    demographic_features = demographic_df.shape[1] - 1

    train = [longitudinal_train_data[key] for key in longitudinal_train_data]
    test = [longitudinal_test_data[key] for key in longitudinal_test_data]

    X_train, y_train, demographic_train_data = create_dataset_train(
        train, number_of_training_visits, number_of_future_visits, demographic_df)
    X_test, y_test, demographic_test_data = create_dataset_test(
        test, number_of_training_visits, number_of_future_visits, demographic_df)

    # Each payload is wrapped in a one-element list, as before.
    outputs = [
        ('longitudinal_data_train.pkl', [X_train]),
        ('label_train.pkl', [y_train]),
        ('demographic_data_train.pkl', [demographic_train_data]),
        ('longitudinal_data_test.pkl', [X_test]),
        ('label_test.pkl', [y_test]),
        ('demographic_data_test.pkl', [demographic_test_data]),
    ]
    for output_name, payload in outputs:
        # The context manager closes the file even if pickling fails.
        with open(output_name, 'wb') as f:
            pickle.dump(payload, f)

    return 0
    

# Call the helper functions and generate the pkl files.

In [21]:
# Run the pipeline once and report whether the pkl files were produced.
print('There is an error! Please run it again.'
      if pkl_files_creator() == -1
      else 'Data is ready as pkl files.')
    

  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)


Please enter number of visits that you want to use for training the model:
2
Please enter number of future visits that you want to predict:
2
Data is ready as pkl files.
