# Setup

In [1]:
import os
import shutil
import re
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import nibabel as nib
import matplotlib as mpl
import matplotlib.pyplot as plt

load_dotenv()

True

Helper functions for making it easier to visualize model training

In [None]:
def progressCount(current:int, total:int, title=""):
  """
  Prints a loading progress bar.

  Parameters:
    current (int): Current number out of total
    total (int): Total
    title (str) : (Optional) Title to appear alongside loading bar. Defaults to no title.
  """
  num_bars = 20
  pcent = 1.*current/total
  blocks = int((num_bars * pcent) - (num_bars * pcent) % 1)
  lines = num_bars - blocks
  print(f'{title} ┝\033[92m{blocks*"█"}\033[0m{lines*"━"}┥\t{current}/{total}\t({100*pcent:.1f}%)')


# Preprocessing

## Convert Raw Data to .nii

Original OASIS-2 values are provided as .nifti.img and .nifiti.hdr pairs. Each pair is converted to a single .nii file.

.env variables:

- `MOVE_RAW` -- if 1, original OASIS-2 files will be converted to .nii files. Otherwise, this function is ignored.
- `RAW` -- The source directory containing the original OASIS-2 files
- `NII` -- The destination directory to hold the converted .nii files


In [3]:
raw_dir = os.getenv("RAW")
nii_dir = os.getenv("NII")

def moveRaw(raw_dir, nii_dir):
    """
    Function for converting files in the raw OASIS-2 directory to .nii, then moving them to the specified 'nii_dir' directory

    Parameters:
        raw_dir (str): Source directory containing the raw OASIS-2 files
        nii_dir (str): Destination directory to copy the converted .nii files into

    Returns:
        None
    """


    i=0
    if not os.path.exists(nii_dir):
        os.makedirs(nii_dir)

    # Regex pattern to get the MRI id and visit number
    rawDirPat = r"(?:[\W\S]+?)OAS2_([0-9]{4})_MR([0-2]{1})/RAW"
    for root, dir, files in os.walk(raw_dir):
        r_match = re.findall(rawDirPat, root)
        if len(r_match) > 0:
            subID = r_match[0][0]
            session = r_match[0][1]
            new_name = f"{subID}_{session}"
            for f in files:
                # Get file extension
                fname, fext = os.path.splitext(f)
                # If the file is a .img, it should be converted to .nii
                if fext == ".img":
                    # Get mpr number from file name
                    f_match = re.findall(r"mpr-([0-2]{1}).nifti", fname)
                    if len(f_match) > 0:
                        f_num = f_match[0]

                        # Get full path of .img file
                        img_name = os.path.join(root, (fname + ".img"))

                        # Create new name for .nii file and put it in the right path
                        nii_name = os.path.join(
                            nii_dir, f"OAS2_{subID}_MR{session}_F{f_num}.nifti.nii"
                        )

                        # Load .img image using nibabel
                        img = nib.load(img_name)

                        # Save .nii image in nii directory with new name
                        nib.save(img, nii_name)
                        progressCount(i, 587, "Converted to .nii")

                        i+=1



if os.getenv("MOVERAW")=="1":
    moveRaw(raw_dir, nii_dir)

Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	0/587	(0.0%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	1/587	(0.2%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	2/587	(0.3%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	3/587	(0.5%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	4/587	(0.7%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	5/587	(0.9%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	6/587	(1.0%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	7/587	(1.2%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	8/587	(1.4%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	9/587	(1.5%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	10/587	(1.7%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	11/587	(1.9%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	12/587	(2.0%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	13/587	(2.2%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	14/587	(2.4%)
Converted to .nii ┝[92m[0m━━━━━━━━━━━━━━━━━━━━┥	

Move all files to one directory

## Convert .nii Files to .jpg

Functions for converting to .jpg

In [4]:
from sklearn.model_selection import train_test_split

def makeTVTSplit(df:pd.DataFrame, training=0.8, testing=0.12):
  """
  Makes a new training, testing, and validation split and returns a dataframe containing the split set for each sample. The portion of samples in the validation set is 1 - training - testing. Default size of validation set is 0.08 (8%)

  Parameters:
    df (pandas.DataFrame): DataFrame containing the samples to be split into different sets
    training (float): Portion of the samples to be put into the training set. Default is 0.8 (80%)
    testing (float): Portion of the samples to be put into the validation set. Default is 0.12 (12%)

  Returns:
    out (pandas.DataFrame) : Copy of the DataFrame provided, with the splitset for each sample added in the column "Split"
  """
  validation = (1.0-training-testing)/(1.0-training)

  x_train, x_temp, y_train, y_temp = train_test_split(df.drop(columns=["Group"]), df["Group"], test_size=1-training, stratify=df[['Group',"Sex_F"]])

  strat = pd.DataFrame(x_temp)
  strat["Group"] = y_temp

  x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, stratify=strat[['Group',"Sex_F"]],test_size=validation)
  train = x_train.copy(deep=True)
  train["Split"] = ["train"]*train.shape[0]
  train["Group"] = y_train.values

  validate = x_val.copy(deep=True)
  validate["Split"] = ["validate"]*validate.shape[0]
  validate["Group"] = y_val.values

  test = x_test.copy(deep=True)
  test["Split"] = ["test"]*test.shape[0]
  test["Group"] = y_test.values
  print(f"Split:")
  print(f'\tTest: {len(test)}')
  print(f'\tTrain: {len(train)}')
  print(f'\tValidate: {len(validate)}')

  df_new = pd.merge(train, test, how="outer")
  df_new = pd.merge(df_new, validate, how="outer")

  return df_new

def getImgGroup(filename, df):
  """
   Get the group (demented or nondemented) of the sample with the filename provided

   Parameters:
      filename (str): Name of the file
      df (pandas.DataFrame): DataFrame containing the list of MRI IDs and their groups

   Returns:
      out (int): 0 if nondemented, 1 if demented, -1 if no group matching the file could be found.
  """

  filePattern = r"(OAS2_[0-9]{4}_MR[0-9]{1})"
  r_match = re.match(filePattern, filename)
  split = df[df["MRI ID"]==r_match.groups()[0]]["Group"].values
  if len(split) == 0:
     return -1
  else:
     return split[0]


def toRGB(data):
  """
  Function for converting a numpy array to the proper format to be saved as an RGB image.

  Parameters
  ---------
  data : numpy.ndarray
      Array of data to be converted to image
  """

  x, y = data.shape[:2]
  data = (data-data.min())/(data.max()-data.min())
  img_arr = np.empty(shape=(x,y,4))
  img_arr[:, :, :3] = data
  img_arr[:, :, 3] = 1.
  return img_arr

def cropImg(img_array):
   """
   Crop the image array to a height of 240. All other dimensions are unchanged

   Parameters:
      img_array (np.ndarray) : numpy.ndarray of the original image

   Returns:
      out (np.ndarray) : Cropped image
   """
   img_array = img_array[:240, :, :]
   return img_array

def getImgSplit(filename, split_df):
  """
   Get the split (testing, training, or validation) of the sample with the filename provided

   Parameters:
      filename (str): Name of the file
      df (pandas.DataFrame): DataFrame containing the list of MRI IDs and their splits

   Returns:
      out: str of the split or -1 if no split matching the file could be found
  """
  filePattern = r"(OAS2_[0-9]{4}_MR[0-9]{1})"
  r_match = re.match(filePattern, filename)
  split = split_df[split_df["MRI ID"]==r_match.groups()[0]]["Split"].values
  if len(split) == 0:
     return -1
  else:
     return split[0]

def convertToJPG(nii_dir, jpg_dir, split_df, sliceStart, numSlices=1, spacing=2):
    """
    Convert .nii files from the nii_dir directory to jpg images of various slices in the jpg_dir, and save them according to their TVT split set and group.

    Parameters:
      nii_dir (str): Source directory containing .nii files
      jpg_dir (str): Destination directory to save .jpg images into
      split_df (pandas.DataFrame): DataFrame containing the MRI ID and Split set for each sample
      sliceStart (int): Transverse slice number to begin slicing at
      numSlices (int): (Optional) Number of slices to extract per file. Default is 2.
      spacing (int): (Optional) Number of transverse slices between each extracted slice. Default is 2.

   Returns:
      out (tuple): Shape of the images extracted
    """

    mult = 2
    i = 0
    shape = (0,0,0,0)



    if not os.path.exists(jpg_dir):
       os.makedirs(jpg_dir)

    for root, dir, files in os.walk(nii_dir):

        for f in files:
            fbase, fext = os.path.splitext(f)
            if fext == ".nii":
                i += 1
                fname = os.path.join(root, f)
                img = nib.load(fname)
                for i in range(numSlices):
                  data = img.get_fdata()[sliceStart+i*mult,:,:]
                  img_arr = cropImg(toRGB(data))
                  shape = img_arr.shape
                  print(fbase)
                  split=getImgSplit(fbase, split_df)
                  group=getImgGroup(fbase, split_df)
                  if split!=-1 and group!=-1:
                    if False:
                      jpg_name = os.path.join(jpg_dir, split, f'{fbase.replace(".nifti","")}_{sliceStart+i*mult}.jpg')
                    else:
                       jpg_name = os.path.join(jpg_dir, split, f'class_{group}', f'{fbase.replace(".nifti","")}_{sliceStart+i*mult}.jpg')
                    plt.imsave(jpg_name, img_arr)

    return shape

In [5]:
# Get normalized data
df = pd.read_excel(os.getenv("NORMTAB"))

# If SPLITTVT == 1, make a new split set and save it.
# Otherwise, use an existing split set that's already been saved.
if os.getenv("SPLITTVT") == "1":
  df_split = makeTVTSplit(df)
  df_split.to_excel(os.getenv("SPLITTAB"))
else:
  df_split = pd.read_excel(os.getenv("SPLITTAB"))

# Get the destination directory to save the .jpg files
# If the directory doesn't exist, make it
jpg_dir = os.getenv("JPG")
if not os.path.exists(os.path.join(jpg_dir, "test")):
   os.makedirs(os.path.join(jpg_dir, "test"))

# See if the split set directories exist in the .jpg destination directory
# If not, make them
for split in ["validate", "train", "test"]:
   if not os.path.exists(os.path.join(jpg_dir, split, "class_0")):
      os.makedirs(os.path.join(jpg_dir, split, "class_0"))
   if not os.path.exists(os.path.join(jpg_dir, split, "class_1")):
      os.makedirs(os.path.join(jpg_dir, split, "class_1"))

# Set default data shape
data_shape = (240, 128, 4)

# If CONVERTJPG == 1, convert the .nii files to .jpg files
if os.getenv("CONVERTJPG") == "1":
   print("convert")
   data_shape = convertToJPG(nii_dir, jpg_dir, df_split, 80, int(os.getenv("NUMSLICES")), 2)

convert
OAS2_0001_MR1_F1.nifti
OAS2_0001_MR1_F1.nifti
OAS2_0001_MR1_F1.nifti
OAS2_0001_MR1_F1.nifti
OAS2_0001_MR1_F2.nifti
OAS2_0001_MR1_F2.nifti
OAS2_0001_MR1_F2.nifti
OAS2_0001_MR1_F2.nifti
OAS2_0001_MR2_F1.nifti
OAS2_0001_MR2_F1.nifti
OAS2_0001_MR2_F1.nifti
OAS2_0001_MR2_F1.nifti
OAS2_0001_MR2_F2.nifti
OAS2_0001_MR2_F2.nifti
OAS2_0001_MR2_F2.nifti
OAS2_0001_MR2_F2.nifti
OAS2_0002_MR1_F1.nifti
OAS2_0002_MR1_F1.nifti
OAS2_0002_MR1_F1.nifti
OAS2_0002_MR1_F1.nifti
OAS2_0002_MR1_F2.nifti
OAS2_0002_MR1_F2.nifti
OAS2_0002_MR1_F2.nifti
OAS2_0002_MR1_F2.nifti
OAS2_0002_MR2_F1.nifti
OAS2_0002_MR2_F1.nifti
OAS2_0002_MR2_F1.nifti
OAS2_0002_MR2_F1.nifti
OAS2_0002_MR2_F2.nifti
OAS2_0002_MR2_F2.nifti
OAS2_0002_MR2_F2.nifti
OAS2_0002_MR2_F2.nifti
OAS2_0004_MR1_F1.nifti
OAS2_0004_MR1_F1.nifti
OAS2_0004_MR1_F1.nifti
OAS2_0004_MR1_F1.nifti
OAS2_0004_MR1_F2.nifti
OAS2_0004_MR1_F2.nifti
OAS2_0004_MR1_F2.nifti
OAS2_0004_MR1_F2.nifti
OAS2_0004_MR2_F1.nifti
OAS2_0004_MR2_F1.nifti
OAS2_0004_MR2_F1.nifti
OAS

# Model Training

In [None]:
import keras
from keras import layers
from keras import ops
from keras import Sequential
import tensorflow as tf
import SimpleITK as sitk

2025-04-13 22:31:07.353806: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-13 22:31:07.548347: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744605067.619222  150142 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744605067.636908  150142 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744605067.788024  150142 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [None]:
# If training a new model, set up the base and model format
if os.getenv("TRAINMODEL") == "1":
  # Make the ResNet-50 base
  resnet_base = keras.applications.ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(data_shape[0], data_shape[1], 3),
    pooling=True
  )

  # Don't train the ResNet-50 base
  resnet_base.trainable = False

  # Set up the OAS-52 layers
  model = Sequential([
    resnet_base,
    layers.Flatten(),
    layers.Dense(426, activation='relu'),
    layers.Dense(426, activation='leaky_relu'),
    layers.Dense(2, activation='softmax')
  ])

  # Compile the model
  model.compile(
    optimizer=keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
  )


In [None]:
# Make the training and validation datasets for Keras
# Both datasets are set up to use binary labels
def makeKerasTVTDatasets(batchSize):
  # Make the training dataset using the .jpg files in the "JPG/train" directory
  dset_train = keras.preprocessing.image_dataset_from_directory(
    directory=os.path.join(os.getenv("JPG"),"train"),
    seed=73,
    image_size=data_shape[:2],
    batch_size=batchSize,
    label_mode='binary')

  # Make the validation dataset using the .jpg files in the "JPG/validate" directory
  dset_validate = keras.preprocessing.image_dataset_from_directory(
    directory=os.path.join(os.getenv("JPG"),"validate"),
    seed=73,
    image_size=data_shape[:2],
    batch_size=batchSize,
    label_mode='binary')

  return dset_train, dset_validate

# Make the datasets, setting the batch size to 4.
# The batch size can be adjusted depending on the computing power of the
# machine being used.
dset_train, dset_validate= makeKerasTVTDatasets(4)

Found 1856 files belonging to 2 classes.


I0000 00:00:1744605071.707932  150142 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2270 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


Found 192 files belonging to 2 classes.


In [None]:
# Ensure there is at least 1 GPU available to utilize
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [None]:
# If training a new model, train the model using history.
if os.getenv("TRAINMODEL") == "1":
  model.summary()

  # Train the model
  history = model.fit(dset_train, validation_data=dset_validate, epochs=15)

  if os.getenv("SAVEMODEL") == "1":
    # Save the model
    model.save("oas2-model.keras")

# Otherwise, load in an existing model.
else:

  # Load in an existing model
  model = keras.saving.load_model("oas-52-model.keras", compile=False)
  # Compile the model
  model.compile(
    optimizer=keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
  )
  # Print a summary of its architecture
  model.summary()


# Get the testing dataset
dset_test = keras.preprocessing.image_dataset_from_directory(
  directory=os.path.join(os.getenv("JPG"),"test"),
  seed=73,
  image_size=data_shape[:2],
  batch_size=4,
  label_mode='binary')

Found 280 files belonging to 2 classes.


# Making Predictions

In [None]:
if os.getenv("MAKEPREDICTIONS") =="1":
  predictions = []
  fileList = []
  mriID = []
  groups = []
  slice = []
  df_old = df.copy(deep=True)
  conf = []
  shap_acc  = []
  i = 0


  for root, dir, files in os.walk(os.path.join(os.getenv("JPG"), "test")):
    for f in files:

      g = getImgGroup(f, df)  # Get the group the image belongs to

      # Load the image and convert it to a usable format
      img = keras.preprocessing.image.load_img(os.path.join(root, f), target_size=data_shape[:3])
      img_arr = keras.preprocessing.image.img_to_array(img)
      img_arr = tf.expand_dims(img_arr, 0)

      # Collect extra data to be included alongside the predictions
      groups += [g]
      slice += [f[-6:-4]]
      mriID += [f[:13]]
      fileList += [f]

      # Make the prediction
      p = model.predict(img_arr)
      conf += [abs(p[0][0]-p[0][1])]  # Calculate the confidence of the prediction
      predictions += [np.argmax(p)]   # Use the argument index containing the largest value as the prediction

      # Update progress bar
      print(f'{i:2d} ({i*100./280.:.2f}%)')
      i += 1

  # Save predictions as a DataFrame
  df_predictions = pd.DataFrame({"MRI ID": mriID, "File": fileList, "Slice": slice, "Group": groups, "Prediction": predictions, "Confidence":conf})

  # For each prediction, record if it is true or false
  tf_a = []
  for index, row in df_predictions.iterrows():
    p = row["Prediction"]
    g = row["Group"]
    tf_a += [1 if g == p else 0]

  df_predictions["TF"] = tf_a

  # Save the predictions to an excel spreadsheet
  df_predictions.to_excel(os.getenv("PREDICTTAB"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
 0 (0.00%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
 1 (0.36%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
 2 (0.71%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
 3 (1.07%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
 4 (1.43%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
 5 (1.79%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
 6 (2.14%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
 7 (2.50%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
 8 (2.86%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
 9 (3.21%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
10 (3.57%)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
11 (3.93%)
[1m1/