## First Models

First models on the department GPU

In [1]:
"""Trains a model on a department machine.

Make sure to copy the data from thingumy to here first.
"""
import logging
import os
import sys
from pathlib import Path

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import layers, optimizers

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Expected shape of a single sample array; the loaders skip any file whose
# array shape differs from (LENGTH, WIDTH, HEIGHT).
LENGTH, WIDTH, HEIGHT = (120, 120, 64)

# Positions (within the sorted directory listing) of the files that had the
# expected shape. Filled in by load_training_data() / load_validation_data()
# and used afterwards to select the matching entries of the label arrays.
VALID_TRAINING_INDICES = []
VALID_VALIDATION_INDICES = []

In [3]:
def configure_logger():
    """Configures the root logger to emit INFO-level records to a stream.

    The root logger's level is set to INFO and a single `StreamHandler`
    with a timestamped format is attached to it.

    The function is safe to call repeatedly (for example when the notebook
    cell is re-run): the handler is attached only once, so log records are
    not printed several times.
    """
    handler_name = 'first_models_stream_handler'
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    # A previous call already attached our handler; adding another one would
    # duplicate every log line.
    if any(existing.get_name() == handler_name
           for existing in root_logger.handlers):
        return

    handler = logging.StreamHandler()
    handler.set_name(handler_name)
    formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

## Preprocessing


In [4]:
def load_training_data(
        data_dir: str = '/home/lzhu7/data/numpy_split/training') -> np.ndarray:
    """Returns a 4D array of the training data.

    Every file in `data_dir` is loaded with `np.load`, in sorted-filename
    order. Arrays whose shape is exactly (LENGTH, WIDTH, HEIGHT) are kept;
    all others are logged and skipped. The position of each kept file in
    the sorted listing is recorded in VALID_TRAINING_INDICES, which is
    reset at the start of every call.

    NOTE(review): samples come out in filename order; this presumably
    matches patient-ID order (one file per patient) -- confirm, because the
    recorded indices are later used to index labels sorted by patient ID.

    Args:
        data_dir: directory holding the training arrays.

    Returns:
        An array of shape (n_samples, LENGTH, WIDTH, HEIGHT).

    Raises:
        ValueError: if no file in `data_dir` has the expected shape.
    """
    # Reset the shared index list in place so that calling this function
    # again (e.g. re-running the cell) does not leave stale or duplicated
    # indices behind.
    VALID_TRAINING_INDICES.clear()

    arrays = []
    training_filenames = sorted(os.listdir(data_dir))
    for i, filename in enumerate(training_filenames):
        arr = np.load(os.path.join(data_dir, filename))
        if arr.shape == (LENGTH, WIDTH, HEIGHT):
            arrays.append(arr)
            VALID_TRAINING_INDICES.append(i)
        else:
            logging.info(
                f'training file {filename} has incorrect shape {arr.shape}')

    # np.stack would also raise ValueError on an empty list, but with a
    # message that does not say what went wrong.
    if not arrays:
        raise ValueError(
            f'no training arrays of shape {(LENGTH, WIDTH, HEIGHT)} '
            f'found in {data_dir}')
    return np.stack(arrays)


def load_validation_data(
        data_dir: str = '/home/lzhu7/data/numpy_split/validation'
) -> np.ndarray:
    """Returns a 4D array of the validation data.

    Every file in `data_dir` is loaded with `np.load`, in sorted-filename
    order. Arrays whose shape is exactly (LENGTH, WIDTH, HEIGHT) are kept;
    all others are logged and skipped. The position of each kept file in
    the sorted listing is recorded in VALID_VALIDATION_INDICES, which is
    reset at the start of every call.

    NOTE(review): samples come out in filename order; this presumably
    matches patient-ID order (one file per patient) -- confirm, because the
    recorded indices are later used to index labels sorted by patient ID.

    Args:
        data_dir: directory holding the validation arrays.

    Returns:
        An array of shape (n_samples, LENGTH, WIDTH, HEIGHT).

    Raises:
        ValueError: if no file in `data_dir` has the expected shape.
    """
    # Reset the shared index list in place so that calling this function
    # again (e.g. re-running the cell) does not leave stale or duplicated
    # indices behind.
    VALID_VALIDATION_INDICES.clear()

    arrays = []
    validation_filenames = sorted(os.listdir(data_dir))
    for i, filename in enumerate(validation_filenames):
        arr = np.load(os.path.join(data_dir, filename))
        if arr.shape == (LENGTH, WIDTH, HEIGHT):
            arrays.append(arr)
            VALID_VALIDATION_INDICES.append(i)
        else:
            logging.info(
                f'validation file {filename} has incorrect shape {arr.shape}')

    # np.stack would also raise ValueError on an empty list, but with a
    # message that does not say what went wrong.
    if not arrays:
        raise ValueError(
            f'no validation arrays of shape {(LENGTH, WIDTH, HEIGHT)} '
            f'found in {data_dir}')
    return np.stack(arrays)


def load_labels() -> (np.array, np.array):
    """Reads the training and validation label CSVs.

    Both files are read first; each frame is then ordered by its
    'patient_id' column and its 'label' column is extracted as an array.

    Returns:
        A (training_labels, validation_labels) pair.
    """
    frames = [
        pd.read_csv('/home/lzhu7/data/training_labels.csv'),
        pd.read_csv('/home/lzhu7/data/validation_labels.csv'),
    ]
    train_y, valid_y = [
        frame.sort_values('patient_id')['label'].values for frame in frames
    ]
    return train_y, valid_y

In [5]:
configure_logger()
X_train = load_training_data()
logging.info(f'loaded training data with shape {X_train.shape}')
y_train, _ = load_labels()
logging.info(f'loaded training labels with shape {y_train.shape}')
# Keep only the labels at the positions of the files that were actually
# loaded. NOTE(review): this relies on the sorted file listing and the
# patient_id-sorted label rows lining up one-to-one -- confirm.
y_train = y_train[VALID_TRAINING_INDICES]
logging.info(f'filtered training labels to shape {y_train.shape}')
# Fail fast if samples and labels went out of step (for example because the
# index list holds stale entries from an earlier run of the loader).
assert X_train.shape[0] == y_train.shape[0], (
    f'{X_train.shape[0]} training samples but {y_train.shape[0]} labels')

2018-06-08 20:44:30,067 - root - INFO - loaded training data with shape (500, 120, 120, 64)
2018-06-08 20:44:30,081 - root - INFO - loaded training labels with shape (500,)
2018-06-08 20:44:30,082 - root - INFO - filtered training labels to shape (500,)


In [6]:
# Uncomment to see validation data
# X_valid = load_validation_data()
# logging.info(f'loaded validation data with shape {X_valid.shape}')
# _, y_valid = load_labels()
# logging.info(f'loaded validation labels with shape {y_valid.shape}')
# y_valid = y_valid[VALID_VALIDATION_INDICES]
# logging.info(f'filtered validation labels to shape {y_valid.shape}')

## Data Exploration

In [7]:
%matplotlib inline

## More preprocessing

In [8]:
def standardize(X, mean, std):
    """Shifts X by `mean` and scales the result by `std`.

    Args:
        X: value or array to transform.
        mean: amount subtracted from X.
        std: divisor applied after the shift.

    Returns:
        (X - mean) / std.
    """
    centered = X - mean
    return centered / std

In [9]:
# Scalar mean and standard deviation taken over every element of X_train
# (no axis is given); these are the statistics passed to standardize().
X_mean = X_train.mean()
X_std = X_train.std()
print(X_mean, X_std)

36.98628576605903 217.98102904871755


In [10]:
# NOTE: this rebinds X_train to its standardized version, so the cell is not
# idempotent -- running it a second time without reloading X_train applies
# the same shift and scale again to already-standardized data.
X_train = standardize(X_train, X_mean, X_std)
# Sanity check: after standardization these should be ~0 and ~1.
print(X_train.mean(), X_train.std())

-4.2368577781972915e-17 0.9999999999999996


In [11]:
# X_valid = standardize(X_valid, X_mean, X_std)
# print(X_valid.mean(), X_valid.std())

## Model training

In [32]:
def build_model() -> keras.Model:
    """Builds and compiles the convolutional classifier.

    The network is three blocks of two 3x3 ReLU convolutions followed by
    max pooling (256, 512 and 1024 filters), then two 1024-unit ReLU dense
    layers and a single sigmoid output unit. No layer uses a bias term.
    The very first convolution uses the default padding; every other
    convolution uses 'same' padding.

    Returns:
        The model, compiled with the Adam optimizer, binary cross-entropy
        loss and accuracy as the reported metric.
    """
    relu_no_bias = dict(activation='relu', use_bias=False)
    kernel = (3, 3)

    net = keras.Sequential()

    # First block: the opening layer declares the input shape.
    net.add(layers.Conv2D(256,
                          kernel,
                          input_shape=(LENGTH, WIDTH, HEIGHT),
                          **relu_no_bias))
    net.add(layers.Conv2D(256, kernel, padding='same', **relu_no_bias))
    net.add(layers.MaxPool2D())

    # Remaining blocks: conv, conv, pool with a growing filter count.
    for n_filters in (512, 1024):
        for _ in range(2):
            net.add(layers.Conv2D(
                n_filters, kernel, padding='same', **relu_no_bias))
        net.add(layers.MaxPool2D())

    # Classifier head.
    net.add(layers.Flatten())
    for _ in range(2):
        net.add(layers.Dense(1024, **relu_no_bias))
    # Dropout is currently disabled:
    # net.add(layers.Dropout(0.5))
    net.add(layers.Dense(1, activation='sigmoid', use_bias=False))

    net.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return net

In [33]:
# Build a fresh compiled model and print its layer-by-layer summary
# (output shapes and parameter counts).
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 118, 118, 256)     147456    
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 118, 118, 256)     589824    
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 59, 59, 256)       0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 59, 59, 512)       1179648   
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 59, 59, 512)       2359296   
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 29, 29, 512)       0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 29, 29, 1024)      4718592   
__________

In [None]:
# Fits on the first training sample only (a single batch of one) for 10
# epochs. NOTE(review): presumably a smoke test that training runs at all,
# not a real training run -- confirm.
model.fit(X_train[0:1], y_train[0:1], batch_size=1, epochs=10)

Epoch 1/10


In [31]:
# Predictions for the first ten training samples.
# NOTE(review): this cell's execution count (31) is lower than that of the
# cells that define and build the model (32, 33), so the saved output was
# produced by an earlier model object -- re-run after building the model.
model.predict(X_train[0:10])

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

In [None]:
# Diagnostic: print the resident set size (RSS) reported by psutil for the
# current kernel process.
# NOTE(review): `os` is already imported in the first cell, and `psutil`
# is only imported here; consider collecting imports in the first cell.
import os
import psutil
process = psutil.Process(os.getpid())
print(process.memory_info().rss)