In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
import pickle
import csv

In [2]:
#Original Data Extraction

DATASET_PATH = './Data-for-Project.csv'
dataset = pd.read_csv(DATASET_PATH, encoding="ISO-8859-1", dtype={"RID": float, "VISCODE": "string", "AGE": float, "PTGENDER": float, "PTEDUCAT": float, "APOE4": float, "ABETA": float, "TAU": float, "Ventricles": float, "Hippocampus": float,
  "WholeBrain": float, "Entorhinal": float, "Fusiform": float,  "MidTemp": float, "ICV": float,"ICV_bl": float,"ADAS11": float, "ADAS13": float,  "MMSE": float, "DX": float,})

features = ['RID', 'VISCODE', 'AGE','PTGENDER', 'PTEDUCAT', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV_bl']
labels = ['RID', 'VISCODE', 'MMSE', 'ADAS11', 'ADAS13']
cols = ['RID', 'VISCODE', 'AGE','PTGENDER', 'PTEDUCAT', 'MMSE', 'ADAS11', 'ADAS13','ICV', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV_bl']
df = pd.DataFrame(dataset, columns=cols)
vis_codes = sorted(df.VISCODE.unique(), key=lambda x: (len(x), x)) 
vis_codes_counts = df.VISCODE.value_counts()
patient_codes = df.RID.unique()

In [3]:
#Transform data to time steps with format: columns
def getDefaultMap():
    return [[vis_codes[i]] for i in range(len(vis_codes))]

timestepIndex = {}
for i in range(len(vis_codes)):
    timestepIndex[vis_codes[i]] = i

patientsDf = defaultdict(getDefaultMap)

for i, row in df.iterrows():
    patientsDf[row['RID']][timestepIndex[row['VISCODE']]] = list(row)

#populate data into a list of patients with all the available timesteps
patientsDataAll = []
for patient in patientsDf.keys():
    for i in range(len(vis_codes)):
        if len(patientsDf[patient][i]) == 1:
            patientsDf[patient][i] = [patient] + patientsDf[patient][i]
        patientsDataAll.append(patientsDf[patient][i])

In [4]:
#find number number of patients for each number of records
#conditionSet = ['bl', 'm12', 'm24', 'm36', 'm48','m60', 'm72','m84', 'm96', 'm108', 'm120', 'm132','m144', 'm156']
conditionSet = ['bl', 'm12', 'm24', 'm36', 'm48','m60', 'm72','m84', 'm96', 'm108', 'm120']
MIN_SEQUENCES = 3
patientsSequences = defaultdict(set)

def hasAllEntries(arr):
    return not pd.DataFrame(arr).isnull().any().any()

def getPatientsWithNTimeSteps(n):
    patientsData = {}
    for patient in patientsDf.keys():
        currPatient = []
        
        for condition in conditionSet:
            i = timestepIndex[condition]
            if len(patientsDf[patient][i]) == len(cols):
                currPatient.append(patientsDf[patient][i])
                if len(currPatient) == n:
                    break
            else:
                currPatient = []
        
        if len(currPatient) == n:
            patientsData[patient] = currPatient

    return patientsData

def getPatientsWithBL():
    patientsData = {}
    patientsWithBL = set()
    for patient in patientsDf.keys():
        currPatient = []
        visits = 0
        diagx = None
        for condition in conditionSet:
            i = timestepIndex[condition]
            if patientsDf[patient][i][1] == 'bl':
                patientsWithBL.add(patientsDf[patient][i][0])
                icvbl = patientsDf[patient][i][16]
                diagx = patientsDf[patient][i][9]
            if len(patientsDf[patient][i]) > 9:
                if diagx != patientsDf[patient][i][9] and patientsDf[patient][i][9] != float('nan'):
                    diagx = patientsDf[patient][i][9]
            if patientsDf[patient][i][0] not in patientsWithBL:
                icvbl = float('nan')
            if len(patientsDf[patient][i]) == len(cols) and patientsDf[patient][i][0] in patientsWithBL:
                currPatient.append(patientsDf[patient][i])
                visits = visits + 1
            else:
                emptyArray = [float('nan')] * len(cols)
                emptyArray[0] = patientsDf[patient][i][0]
                emptyArray[1] = patientsDf[patient][i][1]
                emptyArray[9] = diagx
                emptyArray[16] = icvbl
                currPatient.append(emptyArray)
        if patientsDf[patient][i][0] in patientsWithBL and visits >= 3:
            patientsData[patient] = currPatient
        currPatient = []
    return patientsData
test = getPatientsWithBL()
print(test[2.0])
print(len(test))

[[2.0, 'bl', 74.3, 1.0, 16.0, 28.0, 10.67, 18.67, 1984660.0, 1.0, 118233.0, 8336.0, 1229740.0, 4177.0, 16559.0, 27936.0, 1984660.0], [2.0, 'm12', nan, nan, nan, nan, nan, nan, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm24', nan, nan, nan, nan, nan, nan, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm36', 74.3, 1.0, 16.0, 29.0, 12.0, 20.0, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm48', nan, nan, nan, nan, nan, nan, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm60', 74.3, 1.0, 16.0, 28.0, 14.0, 23.0, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm72', 74.3, 1.0, 16.0, 23.0, 12.0, 21.0, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm84', 74.3, 1.0, 16.0, 24.0, 9.0, 14.0, nan, 2.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm96', 74.3, 1.0, 16.0, 25.0, 10.0, 18.0, nan, 1.0, nan, nan, nan, nan, nan, nan, 1984660.0], [2.0, 'm108', 74.3, 1.0, 16.0, 28.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, n

In [5]:
# choose how many timesteps will be used
total_steps = 11
sequence_key = total_steps-MIN_SEQUENCES

#Create time series - (samples, time steps, features)
N = len(test)
i = 0
n_cols = len(cols)-2

cleaned_data = np.zeros((N, total_steps, n_cols))

patientSet = test
print('patients #', len(patientSet))
skipped = 0

for patient in patientSet:
    currPatient = patientSet[patient]
    
    # filter patients with no ICV_bl or no DX
    currPatient = np.array(currPatient)
    if np.isnan(np.array(currPatient[:, 16]).astype(float)).any(axis=0) or np.isnan(np.array(currPatient[:, 9]).astype(float)).any():
        skipped+=1
        continue

    for timestep in range(total_steps):
        dfcols = len(currPatient[0])
        cleaned_data[i][timestep] =currPatient[timestep][2:dfcols]
    i+=1

# delete bottom N-i rows
cleaned_data = cleaned_data[:i]
print('skipped', skipped)
print('i', i)
print("resulting data", cleaned_data.shape)
print('ex: ', cleaned_data[0][4])

# delete patients with no ICV_bl
""" ICV_bl = cleaned_data[:,:,14]
cleaned_data = cleaned_data[~np.isnan(ICV_bl).any()]
print("elete patients with no ICV_bl", cleaned_data.shape)
print('ex: ', cleaned_data[0][4]) """

cleaned_data = np.where(np.isnan(cleaned_data), 0, cleaned_data)

patients # 1354
skipped 234
i 1120
resulting data (1120, 11, 15)
ex:  [        nan         nan         nan         nan         nan         nan
         nan 3.00000e+00         nan         nan         nan         nan
         nan         nan 1.92069e+06]


In [6]:
# check on data
import sys
np.set_printoptions(threshold=sys.maxsize)
print(cleaned_data)

[[[8.13000e+01 1.00000e+00 1.80000e+01 2.00000e+01 2.20000e+01
   3.10000e+01 1.92069e+06 3.00000e+00 8.45990e+04 5.31900e+03
   1.12983e+06 1.79100e+03 1.55060e+04 1.84220e+04 1.92069e+06]
  [8.13000e+01 1.00000e+00 1.80000e+01 1.70000e+01 2.40000e+01
   3.50000e+01 1.90382e+06 3.00000e+00 9.00990e+04 5.15700e+03
   1.09564e+06 1.59600e+03 1.46170e+04 1.73300e+04 1.92069e+06]
  [8.13000e+01 1.00000e+00 1.80000e+01 1.90000e+01 2.56700e+01
   3.76700e+01 1.90342e+06 3.00000e+00 9.74200e+04 5.13900e+03
   1.08856e+06 1.17500e+03 1.40330e+04 1.63980e+04 1.92069e+06]
  [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 3.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.92069e+06]
  [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 3.00000e+00 0.00000e+00 0.00000e+00
   0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.92069e+06]
  [0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00

In [7]:
# split into 80/10/10 train, validate, and test sets
RANDOM_STATE = 1
SHUFFLE = False
validate_and_test_size = int(N*0.3)
train, validate_and_test = train_test_split(cleaned_data, test_size=validate_and_test_size, shuffle=SHUFFLE, random_state=RANDOM_STATE)
test_size = int(validate_and_test.shape[0] * 0.5)
validate, test = train_test_split(validate_and_test, test_size=test_size, shuffle=SHUFFLE, random_state=RANDOM_STATE)
test_mask = []
for i in range(len(test)):
    test_mask.append([])
    for j in range(len(test[0])):
        if test[i][j][0] == 0:
            test_mask[i].append(0)
        else:
            test_mask[i].append(1)

test_mask = np.array(test_mask)
print(test_mask)
print("train", train.shape)
print("validate", validate.shape)
print("test", test.shape)


[[1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 0 1 0 0 0]
 [1 1 1 0 1 0 1 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 1 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0]
 [1 1 1 1 1 0 1 0 0 0 0]
 [1 1 1 0 1 0 1 1 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 1 1 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0]
 [1 1 1 1 1 1 0 0 0 0 0]
 [1 1 1 1 1 1 0 0 0 0 0]
 [1 1 1 1 1 0 1 1 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 1 0 0 0]
 [1 1 1 0 1 0 1 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 0 0 0 0 0]
 [1 1 1 1 1 1 0 1 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0]
 [1 1 1 0 1 0 1 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 1 0 1 1 0 0 0]
 [1 1 1 1 1 1 0 0 0 0 0]
 [1 1 1 0 0 0 0 1 0 0 0]


In [8]:
# format data into X, Y
train_X = train[:,:total_steps,:]
validate_X = validate[:,:total_steps,:]
test_X = test[:,:total_steps,:]

""" print("train_X", train_X.shape)
print("validate_X", validate_X.shape)
print("test_X", test_X.shape)
print('ex train_X:', train_X[0]) """


def get_one_hot_encoding(dx):
    '''
        Returns one-hot encoding of a diagnosis [CN, MCI, AD]
        CN = 1, MCI = 2, AD = 3
    '''
    if dx == 1:
        return np.array([1, 0, 0]).astype(float)
    elif dx == 2:
        return np.array([0, 1, 0]).astype(float)
    elif dx == 3:
        return np.array([0, 0, 1]).astype(float)
    else:
        return np.array([0, 0, 0]).astype(float)

def get_Y_encodings(train):
    '''
    input: NxT np array of DXs where N is # of patients and T is the timesteps
    output: NxTx3 np array of one-hot encoded DXs
    '''  
    N, T = train.shape
    Y = np.zeros((N, T, 3))

    for i in range(N):
        for j in range(T):
            Y[i][j] = get_one_hot_encoding(train[i][j])

    return Y


def get_Y(train):
    '''
    input: NxT np array of DXs where N is # of patients and T is the timesteps
    output: NxTx3 np array of DXs
    '''  
    N, T = train.shape
    Y = np.zeros((N, T-1))

    for i in range(N):
        for j in range(1,T):
            Y[i][j-1] = train[i][j]

    return Y
    

train_Y = get_Y(train[:,:,7])
validate_Y = get_Y(validate[:,:,7])
test_Y = get_Y(test[:,:,7])
train_encoding_Y = get_Y_encodings(train[:,:,7])
validate_encoding_Y = get_Y_encodings(validate[:,:,7])
test_encoding_Y = get_Y_encodings(test[:,:,7])

In [9]:
# check on data
import sys
np.set_printoptions(threshold=sys.maxsize)
print(train_Y)

[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [2. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [2. 2. 2. 2. 2. 3. 3. 3. 3. 3.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [1. 1. 1. 2. 2. 2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1. 1. 2. 1. 1. 1.]
 [2. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 2. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 2. 3. 3. 3.]
 [1. 1. 1. 2. 2. 2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1.

In [10]:
# export into a pickle file in Jung's format
FILENAME = 'ADNI_DATA_JUNG_FORMAT_11_0s_test_masked_final.pkl'
DATA = {'Train_data': train_X, 'Valid_data': validate_X, 'Test_data': test_X,
'Train_label': train_Y, 'Valid_label': validate_Y, 'Test_label': test_Y, 'Mask_label': test_mask
, 'Train_Encoding': train_encoding_Y, 'Valid_Encoding': validate_encoding_Y, 'Test_Encoding': test_encoding_Y
}
# DATA = {'Train_data': train_X, 'Valid_data': validate_X, 'Test_data': test_X,
# 'Train_label': train_Y, 'Valid_label': validate_Y, 'Test_label': test_Y
# }

with open(FILENAME, 'wb') as handle:
    pickle.dump(DATA, handle, protocol=pickle.HIGHEST_PROTOCOL)