# BUILDING A TFRECORDS DATABASE

In this notebook, a TFRecords database is built from the MRI dataset that has already been organized in folders, according to the possible labels:

* CN
* MCI
* AD

In [None]:
! pip install SimpleITK
! pip install dltk

Collecting SimpleITK
[?25l  Downloading https://files.pythonhosted.org/packages/9c/6b/85df5eb3a8059b23a53a9f224476e75473f9bcc0a8583ed1a9c34619f372/SimpleITK-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (47.4MB)
[K     |████████████████████████████████| 47.4MB 63kB/s 
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.0.2
Collecting dltk
[?25l  Downloading https://files.pythonhosted.org/packages/7f/55/bc7d72866c61f34f298e494d01c7c69eb26e8af711f5a2694c6bb9dceaca/dltk-0.2.1.tar.gz (294kB)
[K     |████████████████████████████████| 296kB 5.0MB/s 
Collecting argparse
  Downloading https://files.pythonhosted.org/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl
Building wheels for collected packages: dltk
  Building wheel for dltk (setup.py) ... [?25l[?25hdone
  Created wheel for dltk: filename=dltk-0.2.1-py2.py3-none-any.whl size=37271 sha256=9e0f77eba4910741c66c52ee27d9fa21c8ee5a8f5bbdb2559f16c0b

In [None]:
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
import numpy as np
import SimpleITK as sitk
from dltk.io import preprocessing
import os
import collections
import pandas as pd


In [None]:
def _int64_feature(value):
  ''' Wrap a single integer in a tf.train.Feature.

      Parameters:
        value -- The integer to store (it is placed in a one-element list)

      Returns:
        tf.train.Feature -- Feature holding an Int64List with that value
  '''
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def _float_feature(value):
  ''' Wrap a sequence of floats in a tf.train.Feature.

      Parameters:
        value -- The values to store; passed as-is to FloatList, so it
                 has to be a sequence already (it is not wrapped in a list)

      Returns:
        tf.train.Feature -- Feature holding a FloatList with those values
  '''
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)
  
def _bytes_feature(value):
  ''' Wrap a single bytes value in a tf.train.Feature.

      Parameters:
        value -- The bytes to store (they are placed in a one-element list)

      Returns:
        tf.train.Feature -- Feature holding a BytesList with that value
  '''
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

In [None]:
# --- input data -------------------------------------------------------------

# raw database with the skull stripped images
DB_SS_PATH = '/content/drive/MyDrive/SKULL_STRIPPED/'

# the data description file
DESCRIPTION_FILE = '/content/drive/MyDrive/ADNI1_Complete_1Yr_1.5T_6_11_2021.csv'

# data subfolders (one per label)
CLASS_SUBFOLDERS = ['MCI/', 'AD/', 'CN/']
BINARY_CLASS_SUBFOLDERS = ['AD/', 'CN/']

# --- 3D supervised TFRecords database ---------------------------------------

DB_TF_3D_PATH = '/content/drive/MyDrive/TF_RECORDS/TF_RECORDS_3D/'

# tfrecords files (train / test / validation)
TFREC_3D_SS_TRAIN, TFREC_3D_SS_TEST, TFREC_3D_SS_VAL = (
  'train.3D.skull_stripped.tfrecords',
  'test.3D.skull_stripped.tfrecords',
  'validation.3D.skull_stripped.tfrecords',
)

# --- 2D supervised TFRecords database ---------------------------------------

DB_TF_2D_PATH = '/content/drive/MyDrive/TF_RECORDS/TF_RECORDS_2D/'

# tfrecords files built from all three class folders (train / test / validation)
TFREC_2D_SS_TRAIN, TFREC_2D_SS_TEST, TFREC_2D_SS_VAL = (
  'train.2D.skull_stripped.tfrecords',
  'test.2D.skull_stripped.tfrecords',
  'validation.2D.skull_stripped.tfrecords',
)

# tfrecords files for the binary variant (train / test / validation)
TFREC_2D_BIN_TRAIN, TFREC_2D_BIN_TEST, TFREC_2D_BIN_VAL = (
  'train.2D.binary.tfrecords',
  'test.2D.binary.tfrecords',
  'validation.2D.binary.tfrecords',
)

Identifiers for the three different classes are needed. Also save the shape of the images, in case that information is needed.

In [None]:
# class name -> integer identifier, for the three-class and the binary problem
LABELS = dict(CN=0, MCI=1, AD=2)
BINARY_LABELS = dict(CN=0, AD=1)

# shape of the 3D images
IMG_SHAPE = (78, 110, 86)
# shape of the 2D images: the last two dimensions of the 3D shape, times 4
IMG_2D_SHAPE = tuple(4 * side for side in IMG_SHAPE[1:])


Define the fraction of the data that is going to be used for the test and validation sets. When using TFRecords, the data have to be separated into different files beforehand, because they cannot be split later, during training.

In [None]:
# fraction of the files held out for testing, and fraction held out
# for validation (both use the same value)
TEST_SPLIT = VALIDATION_SPLIT = 0.15

### Train/Test supervised data split

Load the path of every file in a list, and then split the list so the references of training, validation and test data are separated.

In [None]:
# array for saving the filenames
# list for collecting the path of every image file
collected_paths = []

# iterate all three class folders in the db
for subf in CLASS_SUBFOLDERS:
  # using the skull stripped data
  path = DB_SS_PATH + subf
  # os.listdir returns the entries in arbitrary order; sorting them makes
  # the order of the collected paths identical between runs
  for name in sorted(os.listdir(path)):
    complete_name = os.path.join(path, name)
    # skip anything that is not a regular file (e.g. nested folders)
    if os.path.isfile(complete_name):
      collected_paths.append(complete_name)

# build the array once, instead of re-allocating it for every file found
filenames = np.array(collected_paths)

In [None]:
# sanity check: number of image paths that were collected
filenames.shape

(2294,)

In [None]:
# fixed seed, so the train/validation/test partition is the same on every run
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# one permutation is already uniformly random, repeating it adds nothing;
# a shuffled copy is used so `filenames` itself is left untouched and
# re-running this cell gives the same result
shuffled = rng.permutation(filenames)

# the first TEST_SPLIT fraction of the shuffled files is the test set
test_margin = int(len(shuffled) * TEST_SPLIT)
training_set, test_set = shuffled[test_margin:], shuffled[:test_margin]

# the validation set is a VALIDATION_SPLIT fraction of the remaining files
# (not of the whole database)
validation_margin = int(len(training_set) * VALIDATION_SPLIT)
training_set, validation_set = training_set[validation_margin:], training_set[:validation_margin]

# every file has to end up in exactly one of the three subsets
assert len(training_set) + len(validation_set) + len(test_set) == len(filenames)

# NOTE(review): the split is made per image file. The description table lists
# several images for the same subject, so scans of one subject can land in
# different subsets -- consider splitting by subject ID if that matters.

print('Training set:', training_set.shape)
print('Validation set:', validation_set.shape)
print('Test set:', test_set.shape)

Training set: (1658,)
Validation set: (292,)
Test set: (344,)



### 3D TFRecords database for supervised learning

Let's build the 3D TFRecords database for supervised learning.

Load the data description file.

In [None]:
description = pd.read_csv(DESCRIPTION_FILE)

# fail early if the file lacks the columns that are read when the
# 3D TFRecords files are written
required_columns = {'Image Data ID', 'Age', 'Sex'}
missing_columns = required_columns - set(description.columns)
assert not missing_columns, f'description file is missing columns: {sorted(missing_columns)}'

description.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I97341,941_S_1311,MCI,M,70,3,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,9/27/2007,NiFTI,6/05/2021
1,I97327,941_S_1311,MCI,M,69,1,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/02/2007,NiFTI,6/05/2021
2,I112538,941_S_1311,MCI,M,70,4,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,6/01/2008,NiFTI,6/05/2021
3,I75150,941_S_1202,CN,M,78,3,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,8/24/2007,NiFTI,6/04/2021
4,I63874,941_S_1202,CN,M,78,1,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,1/30/2007,NiFTI,6/04/2021


Now, design a method that loads a 3D `.nii` image and some of its information. Taking the absolute path, split the name by directories to get the image name. With that, obtain the class label. Also, obtain the subject ID from the image file name. Finally, read the image using `SimpleITK`, transform into a `numpy` array and return the image, the label and the subject ID.

In [None]:
def load_image_3D(abs_path):
  ''' Read a .nii image from disk, together with its label and subject ID.

      Parameters:
        abs_path -- Absolute path, filename included

      Returns:
        img -- The .nii image, converted into a numpy array
        label -- The label of the image, looked up in LABELS with the
                 name of the directory that contains the file
        subject -- The subject ID, built from the 2nd to the 4th
                   underscore-separated tokens of the file name

  '''

  # the last two components of the path are the class folder and the file name
  path_parts = abs_path.split('/')
  class_dir = path_parts[-2]
  label = LABELS[class_dir]

  # e.g. 'ADNI_062_S_0578_MR_...' -> '062_S_0578'
  name_tokens = path_parts[-1].split('_')
  subject = '_'.join(name_tokens[1:4])

  # read the file with SimpleITK and convert it straight into a numpy array
  img = sitk.GetArrayFromImage(sitk.ReadImage(abs_path))

  return img, label, subject

Now, create a new method for creating `.tfrecords` files. It is necessary to specify the filenames of all the images that are going to be stored in the `.tfrecords` file, as well as the name of this file.
In the method, some extra data, besides the image and label, are stored for each example (subject, age, sex, preprocessing and image ID). They are stored just in case they are needed in later steps.

In [None]:
def create_tf_record(img_filenames, tf_rec_filename):
  ''' Create a TFRecord file, including the information
      of the specified images

      For every image, the example stores the label, the subject ID, the
      preprocessing description and the image ID (all taken from the path
      and file name), the age and sex of the subject (looked up in the
      global `description` table by image ID) and the flattened image.

      Parameters:
        img_filenames -- Array with the path to every
                         image that is going to be included
                         in the TFRecords file.
        tf_rec_filename -- Name of the TFRecords file.

      Raises:
        KeyError -- If the ID of an image is not found in `description`.
  '''

  # IDs of the description table as plain digit strings. The table may hold
  # them with an 'I' prefix (e.g. 'I97341'), while the ID parsed from the
  # file name is an integer, so both sides are compared without the prefix.
  description_ids = description['Image Data ID'].astype(str).str.lstrip('I')

  # the context manager closes the file even if an image fails half-way
  with tf.io.TFRecordWriter(tf_rec_filename) as writer:

    # iterate through all .nii files
    for meta_data in img_filenames:

      # load the image and label
      img, label, subject = load_image_3D(meta_data)

      # the remaining information is encoded in the file name
      filename_split = meta_data.split('/')[-1].split('_')

      # save the preprocessing technique used
      # (named so that it does not shadow the imported `preprocessing` module)
      preprocessing_name = '_'.join(filename_split[5:-3])

      # get the image ID: the last token is e.g. 'I50459.nii.gz' or
      # 'I50459.nii', so drop the leading 'I' and the extension
      id_token = filename_split[-1]
      if id_token.endswith('.gz'): image_ID = int(id_token[1:-7])
      else: image_ID = int(id_token[1:-4])

      # get the age and sex of the subject
      matches = description.loc[description_ids == str(image_ID), ['Age', 'Sex']]
      if matches.empty:
        raise KeyError('image ID %d (file %s) not found in the description table'
                       % (image_ID, meta_data))
      age_and_sex = matches.iloc[0]

      # create a feature
      feature = {'label': _int64_feature(label),
                 'subject': _bytes_feature(subject.encode('utf-8')),
                 'preprocessing': _bytes_feature(preprocessing_name.encode('utf-8')),
                 'subject_age': _int64_feature(int(age_and_sex['Age'])),
                 'subject_sex': _bytes_feature(age_and_sex['Sex'].encode('utf-8')),
                 'image_id': _int64_feature(image_ID),
                 'image': _float_feature(img.ravel())}

      # create an example protocol buffer
      example = tf.train.Example(features=tf.train.Features(feature=feature))

      # serialize to string and write on the file
      writer.write(example.SerializeToString())

Define the complete path names for the `.tfrecords` files.

In [None]:
# complete paths of the 3D train / test / validation TFRecords files
train_tfrec, test_tfrec, val_tfrec = (
  os.path.join(DB_TF_3D_PATH, tfrec_name)
  for tfrec_name in (TFREC_3D_SS_TRAIN, TFREC_3D_SS_TEST, TFREC_3D_SS_VAL)
)

In [None]:
# write one 3D TFRecords file per subset
create_tf_record(training_set, train_tfrec)
create_tf_record(test_set, test_tfrec)
create_tf_record(validation_set, val_tfrec)

In [None]:
# show the complete path of the 3D training TFRecords file
train_tfrec 

'/content/drive/MyDrive/TF_RECORDS/TF_RECORDS_3D/train.3D.skull_stripped.tfrecords'

In [None]:
# inspect the image paths that were assigned to the training subset
training_set

array(['/content/drive/MyDrive/SKULL_STRIPPED/CN/ADNI_062_S_0578_MR_MPR__GradWarp__B1_Correction__N3__Scaled_Br_20070424114540207_S15035_I50459.nii.gz',
       '/content/drive/MyDrive/SKULL_STRIPPED/MCI/ADNI_018_S_0080_MR_MPR____N3__Scaled_Br_20070821181307557_S24963_I69594.nii.gz',
       '/content/drive/MyDrive/SKULL_STRIPPED/AD/ADNI_023_S_0916_MR_MPR__GradWarp__B1_Correction__N3__Scaled_2_Br_20081001154612955_S30418_I118884.nii.gz',
       ...,
       '/content/drive/MyDrive/SKULL_STRIPPED/AD/ADNI_082_S_1377_MR_MPR__GradWarp__B1_Correction__N3__Scaled_Br_20071101192159389_S40021_I80388.nii.gz',
       '/content/drive/MyDrive/SKULL_STRIPPED/AD/ADNI_012_S_0689_MR_MPR____N3__Scaled_2_Br_20081001125017695_S24938_I118740.nii.gz',
       '/content/drive/MyDrive/SKULL_STRIPPED/AD/ADNI_029_S_0999_MR_MPR-R__GradWarp__B1_Correction__N3__Scaled_Br_20070805144703422_S23248_I64898.nii.gz'],
      dtype='<U148')

### 2D TFRecords database for supervised learning

Let's build the 2D TFRecords database for supervised learning.


In this case, images need to be transformed to 2D. The following method does exactly that, taking multiple horizontal slices and putting them in a 2D matrix. In the final version, 16 slices were used. Some considerations:

* The top slice was selected manually, after some tests. Higher cuts did not show any useful information.
* The same for the bottom slice. Below slices only showed some of the brainstem. 
* To obtain 16 cuts, every second slice from level 60 down to level 30 is selected.

In [None]:
def slices_matrix_2D(img):
  ''' Transform a 3D MRI image into a 2D image, by obtaining 16 slices
      and placing them in a 4x4 two-dimensional grid.

      All 16 cuts are from a horizontal/axial view. They are taken along
      the first axis of the image, every second level from the 60th down
      to the 30th. The grid is filled column by column: the first four
      cuts go top to bottom in the first column, and so on.

      Parameters:
        img -- np.ndarray with the 3D image; its first axis needs
               at least 61 levels

      Returns:
        np.ndarray -- The resulting 2D image, replicated on 3 channels
                      and transposed: for cuts of shape (h, w) the
                      result has shape (4 * w, 4 * h, 3)
  '''

  # set the top level, the step and the size of the grid
  TOP = 60
  STEP = 2
  GRID = 4
  N_CUTS = GRID * GRID

  # create the final 2D image, sized from the cuts of this image
  cut_rows, cut_cols = img.shape[1], img.shape[2]
  image_2D = np.empty((cut_rows * GRID, cut_cols * GRID))

  for cutting_time in range(N_CUTS):

    # position of this cut in the grid (a new column starts every GRID cuts)
    grid_col, grid_row = divmod(cutting_time, GRID)
    row_it = grid_row * cut_rows
    col_it = grid_col * cut_cols

    # copy the whole cut at once into its cell of the 2D image
    cut = img[TOP - cutting_time * STEP, :, :]
    image_2D[row_it:row_it + cut_rows, col_it:col_it + cut_cols] = cut

  # return the final 2D image, with 3 channels
  # this is necessary for working with most pre-trained nets
  return np.repeat(image_2D[None, ...], 3, axis=0).T

The following method loads a 3D image from disk, applies whitening and then the previous 2D transformation. It also returns the image label.


In [None]:
def load_image_2D(abs_path, labels):
  ''' Read a .nii image from disk and turn it into a 2D image, made of
      16 slices placed in a 4x4 two-dimensional grid. The label of the
      image is returned as well.

      Parameters:
        abs_path -- Absolute path, filename included
        labels -- Label mapper (directory name -> label)

      Returns:
        img -- The image after whitening and the 2D transformation,
               as a numpy array
        label -- The label of the image (from argument 'labels')

  '''

  # the directory that contains the file is named after the class
  class_dir = abs_path.split('/')[-2]
  label = labels[class_dir]

  # read the file with SimpleITK and convert it into a numpy array
  volume = sitk.GetArrayFromImage(sitk.ReadImage(abs_path))

  # whiten the volume, then arrange its slices in the 2D grid
  grid_image = slices_matrix_2D(preprocessing.whitening(volume))

  return grid_image, label

In [None]:
# complete paths of the 2D train / test / validation TFRecords files
train_tfrec2D, test_tfrec2D, val_tfrec2D = (
  os.path.join(DB_TF_2D_PATH, tfrec_name)
  for tfrec_name in (TFREC_2D_SS_TRAIN, TFREC_2D_SS_TEST, TFREC_2D_SS_VAL)
)

In [None]:
def create_tf_record_2D(img_filenames, tf_rec_filename, labels):
  ''' Create a TFRecord file, including the information
      of the specified images, after converting them into
      a 2D grid.

      Each example stores two features: 'label' (the integer label) and
      'image' (the flattened 2D image).

      Parameters:
        img_filenames -- Array with the path to every
                         image that is going to be included
                         in the TFRecords file.
        tf_rec_filename -- Name of the TFRecords file.
        labels -- Label mapper
  '''

  # the context manager closes the file even if an image fails half-way
  with tf.io.TFRecordWriter(tf_rec_filename) as writer:

    # iterate through all .nii files
    for meta_data in img_filenames:

      # load the image and label
      img, label = load_image_2D(meta_data, labels)

      # create a feature
      feature = {'label': _int64_feature(label),
                 'image': _float_feature(img.ravel())}

      # create an example protocol buffer
      example = tf.train.Example(features=tf.train.Features(feature=feature))

      # serialize to string and write on the file
      writer.write(example.SerializeToString())

In [None]:
# write one 2D TFRecords file per subset, all with the three-class labels
for subset, tfrec_path in ((training_set, train_tfrec2D),
                           (test_set, test_tfrec2D),
                           (validation_set, val_tfrec2D)):
  create_tf_record_2D(subset, tfrec_path, LABELS)