### Welcome to ML4D BroadE!
We explore machine learning on biomedical data using cloud computing, Python, TensorFlow, and the ml4h codebase.

We will start with linear regression.  Our model is a vector, one weight for each input feature, and a single bias weight.

\begin{equation}
y = xw + b
\end{equation}

For notational convenience, absorb the bias term into the weight vector by prepending a 1 to each row of the input data matrix X:

\begin{equation}
y = [1, x][b, w]^T
\end{equation}

In [None]:
import os
import h5py
import numpy as np
from typing import List, Dict, Callable
from collections import defaultdict

# Import:
import os
import sys
import gzip
import pickle
import random
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec

from keras import metrics
from keras.models import Sequential
from keras.layers.core import Dense

from ml4h.arguments import _get_tmap
from ml4h.TensorMap import TensorMap

In [None]:
def linear_regression():
    """Fit a single-weight Keras model to noisy samples of a known line.

    Generates 40 evenly spaced x values in [-1, 1] with targets
    y = 2.0 * x + 0.5 plus Gaussian noise scaled by 0.1, trains one Dense
    unit with SGD on mean squared error for 10 epochs, prints the learned
    and real parameters, and plots the data with the learned line in red.
    """
    n_points = 40
    real_weight, real_bias = 2.0, 0.5
    x = np.linspace(-1, 1, n_points)
    noise = np.random.randn(*x.shape) * 0.1
    y = real_weight*x + real_bias + noise

    model = Sequential()
    model.add(Dense(1, input_dim=1))
    model.compile(loss='mse', optimizer='sgd')
    model.summary()
    model.fit(x, y, batch_size=1, epochs=10)

    # get_weights() yields the kernel matrix followed by the bias vector.
    kernel, bias = model.get_weights()
    learned_slope = kernel[0][0]
    learned_bias = bias[0]
    print('Learned slope:',  learned_slope, 'real slope:', real_weight, 'learned bias:', learned_bias, 'real bias:', real_bias)

    # Draw the learned line between the two ends of the x range.
    line_x = [-1, 1]
    line_y = [-learned_slope+learned_bias, learned_slope+learned_bias]
    plt.plot(x, y)
    plt.plot(line_x, line_y, 'r')
    plt.show()
    print('Linear Regression complete!')

In [None]:
# Run the linear regression demo: trains the model, prints the learned parameters, and shows the plot.
linear_regression()

## Now Logistic Regression:
We take the real-valued predictions from linear regression and squish them with a sigmoid. (The 10-class MNIST example below uses a softmax, the multi-class generalization of the sigmoid.)

\begin{equation}
\textbf{y} = \sigma(X\textbf{w} + b)
\end{equation}

where 
\begin{equation}
\sigma(x) = \frac{e^x}{1+e^x}
\end{equation}

In [None]:
import math

def sigmoid(x):
    """Return the logistic sigmoid of each element of x as a list of floats.

    Computes e^v / (1 + e^v) for every value v in the iterable x. The
    positive and negative cases are evaluated with algebraically equivalent
    forms so that math.exp is only ever called on a non-positive argument;
    this avoids the OverflowError that exp(v) raises for large positive v.

    :param x: iterable of real numbers (e.g. a list or 1-D numpy array)
    :return: list with one sigmoid value per input element
    """
    a = []
    for item in x:
        if item >= 0:
            # 1 / (1 + e^-v): the exponent is <= 0, so exp cannot overflow.
            a.append(1.0 / (1.0 + math.exp(-item)))
        else:
            # e^v / (1 + e^v): v < 0 here, so exp(v) is in (0, 1).
            exp_item = math.exp(item)
            a.append(exp_item / (1.0 + exp_item))
    return a

In [None]:
# Evaluate the sigmoid on a grid from -10 up to (but not including) 10 in steps of 0.2, then plot the curve.
x = np.arange(-10., 10., 0.2)
sig = sigmoid(x)
plt.plot(x,sig)
plt.show()

In [None]:
def logistic_regression():
    """Train a 10-way softmax classifier on MNIST and plot its weight templates.

    Loads the dataset with load_data, one-hot encodes the labels, and trains a
    single Dense softmax layer (784 inputs, 10 outputs) with SGD on randomly
    sampled mini-batches of 8192 training examples. Before training and then
    every 100 updates the layer's weights are plotted with plot_templates;
    every 100 updates the test-set loss and accuracy are also printed.
    """
    # load_data returns (train, valid, test) in that order. Unpack in the same
    # order so the "test set" metrics below are computed on the test split.
    train, _valid, test = load_data('mnist.pkl.gz')

    epochs = 1000  # number of mini-batch updates, not full passes over the data
    batch_size = 8192
    num_labels = 10
    train_y = make_one_hot(train[1], num_labels)
    test_y = make_one_hot(test[1], num_labels)

    logistic_model = Sequential()
    logistic_model.add(Dense(10, activation='softmax', input_dim=784, name='mnist_templates'))
    logistic_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    logistic_model.summary()
    templates = logistic_model.layers[0].get_weights()[0]
    plot_templates(templates, 0)
    print('weights shape:', templates.shape)

    num_train = train[0].shape[0]
    for e in range(epochs):
        # Draw a fresh batch of distinct training rows for every update.
        trainidx = random.sample(range(0, num_train), batch_size)
        x_batch = train[0][trainidx, :]
        y_batch = train_y[trainidx]
        logistic_model.train_on_batch(x_batch, y_batch)
        if e % 100 == 0:
            plot_templates(logistic_model.layers[0].get_weights()[0], e)
            print('Logistic Model test set loss and accuracy:', logistic_model.evaluate(test[0], test_y), 'at epoch', e)


def plot_templates(templates, epoch):
    """Draw the ten weight templates as 28x28 images and save the figure.

    The weights are reshaped to (28, 28, 10) and shown in a 2x5 grid with the
    axes hidden. The figure is written to
    ./regression_example/mnist_templates_<epoch>.png, creating the directory
    when it does not exist yet.

    :param templates: weight array that can be reshaped to (28, 28, 10)
    :param epoch: value used to label the output file name
    """
    n = 10
    images = templates.reshape((28, 28, n))
    plt.figure(figsize=(16, 8))
    for idx in range(n):
        panel = plt.subplot(2, 5, idx + 1)
        plt.imshow(images[:, :, idx])
        plt.gray()
        for axis in (panel.get_xaxis(), panel.get_yaxis()):
            axis.set_visible(False)

    plot_name = f"./regression_example/mnist_templates_{epoch!s}.png"
    out_dir = os.path.dirname(plot_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    plt.savefig(plot_name)


def make_one_hot(y, num_labels):
    """Return a one-hot encoding of the integer labels in y.

    :param y: sequence of integer class labels, each a valid column index
        for a row of length num_labels
    :param num_labels: number of classes (columns in the result)
    :return: float array of shape (len(y), num_labels) with a single 1.0 per
        row, in the column given by that row's label
    :raises IndexError: if a label is out of range for num_labels
    """
    num_samples = len(y)
    ohy = np.zeros((num_samples, num_labels))
    if num_samples:
        # One vectorized assignment: row i gets a 1.0 in column y[i].
        ohy[np.arange(num_samples), y] = 1.0
    return ohy


def load_data(dataset):
    """Load a gzipped pickle holding the train/valid/test dataset splits.

    If dataset is a bare file name that does not exist in the current
    directory, it is looked up under ``data/``. When the file is named
    ``mnist.pkl.gz`` and still cannot be found, it is downloaded first.

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    :return: tuple ``(train_set, valid_set, test_set)``, in that order
    """

    #############
    # LOAD DATA #
    #############

    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join("data", dataset)
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from urllib.request import urlretrieve
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        os.makedirs(os.path.dirname(dataset), exist_ok=True)
        urlretrieve(origin, dataset)

    print('loading data...')

    # Load the dataset.
    # NOTE(review): the file may have been fetched over plain HTTP, and
    # unpickling can execute arbitrary code; only use a trusted source.
    # The 'latin1' encoding lets Python 3 decode byte strings stored in a
    # pickle that was written by Python 2.
    with gzip.open(dataset, 'rb') as f:
        train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

    #train_set, valid_set, test_set format: tuple(input, target)
    #input is a numpy.ndarray of 2 dimensions (a matrix)
    #in which each row corresponds to an example. target is a
    #numpy.ndarray of 1 dimension (a vector) that has the same length as
    #the number of rows in the input. It gives the target
    #for the example with the same index in the input.

    return train_set, valid_set, test_set

In [None]:
# Run the MNIST demo: trains the softmax model, saves template plots, and prints periodic evaluation results.
logistic_regression()

## TensorMaps
The critical data structure in the ml4h codebase is the TensorMap.
This abstraction provides a way to translate ***any*** kind of input data into structured numeric tensors with clear semantics for interpretation and modeling.

## The Model Factory
The function ***make_multimodal_multitask_model()*** takes lists of TensorMaps and connects them with intelligent goo.

## Tensorization
Tensorization is the process of gathering any number of input files and consolidating them into compressed HD5 files.  We tend to make one HD5 file per sample in the study.  The files contain the raw data and labels we will use to train models.  It tends to be efficient to separate tensor construction from model training.

In [None]:
def tmaps_by_sample_id(tensor_folder: str, sample_id: str, tmaps: List[TensorMap]):
    """Read the tensors of the given TensorMaps from one sample's HD5 file.

    :param tensor_folder: directory containing ``<sample_id>.hd5`` files
    :param sample_id: file name of the sample without the ``.hd5`` extension
    :param tmaps: TensorMaps whose tensors should be read
    :return: defaultdict mapping each TensorMap that could be read to its
        tensor; looking up any other key yields None. The mapping is empty
        when the sample's file does not exist.
    """
    tensors = defaultdict(lambda: None)
    hd5_path = os.path.join(tensor_folder, sample_id + '.hd5')
    if not os.path.isfile(hd5_path):
        return tensors

    with h5py.File(hd5_path, 'r') as hd5:
        for tm in tmaps:
            try:
                tensors[tm] = tm.tensor_from_file(tm, hd5)
            except (IndexError, KeyError, ValueError, OSError, RuntimeError):
                # Best effort: a TensorMap that cannot be read is left out.
                pass
    return tensors


def tmaps_with_properties(tensor_folder: str, tmap_properties: Dict[TensorMap, Callable[[np.ndarray], bool]], search_size=100):
    """Find samples whose tensors all satisfy the given predicates.

    Considers the first search_size ``.hd5`` files of tensor_folder in sorted
    order, loads the requested TensorMaps for each with tmaps_by_sample_id,
    and keeps a sample only if every TensorMap was loaded and its predicate
    returns a truthy value for the loaded tensor.

    :param tensor_folder: directory containing ``<sample_id>.hd5`` files
    :param tmap_properties: maps each TensorMap to a predicate on its tensor
    :param search_size: maximum number of ``.hd5`` files to examine
    :return: dict mapping each matching sample id to its loaded tensors
    """
    suffix = '.hd5'
    hd5_files = [name for name in sorted(os.listdir(tensor_folder)) if name.endswith(suffix)]
    # Slice the suffix off explicitly: str.strip('.hd5') would remove any of
    # the characters '.', 'h', 'd', '5' from both ends and so corrupt ids
    # that start or end with one of them (e.g. '3035855' -> '30358').
    all_ids = [name[:-len(suffix)] for name in hd5_files[:search_size]]

    matches = {}
    for sample_id in all_ids:
        result = tmaps_by_sample_id(tensor_folder, sample_id, list(tmap_properties))
        if all(
            result.get(tmap) is not None and has_property(result[tmap])
            for tmap, has_property in tmap_properties.items()
        ):
            matches[sample_id] = result
    return matches


def tmaps_with_properties_from_keys(tensor_folder: str, tmap_properties: Dict[str, Callable[[np.ndarray], bool]], search_size=100):
    """Same as tmaps_with_properties, but TensorMaps are named by string key.

    Each key of tmap_properties is resolved with _get_tmap, and the resulting
    TensorMap-to-predicate mapping is forwarded to tmaps_with_properties.

    :param tensor_folder: directory containing the HD5 tensor files
    :param tmap_properties: maps a TensorMap key to a predicate on its tensor
    :param search_size: forwarded unchanged to tmaps_with_properties
    :return: whatever tmaps_with_properties returns for the resolved mapping
    """
    resolved = {}
    for key, prop in tmap_properties.items():
        resolved[_get_tmap(key)] = prop
    return tmaps_with_properties(tensor_folder, resolved, search_size)


def plot_array_3d(a):
    """Show every slice along the last axis of a as a grayscale image.

    The slices are laid out row by row in a square grid of subplots that is
    just large enough to hold them all. Every image uses the same gray scale,
    spanning the minimum and maximum of the whole array, and tick labels are
    removed. The shape of a is printed as well.

    :param a: array whose last axis indexes the slices to display
    """
    slice_axis = -1
    num_slices = a.shape[slice_axis]
    sides = int(np.ceil(np.sqrt(num_slices)))
    # squeeze=False keeps axes two-dimensional even for a 1x1 grid, so the
    # [row, column] indexing below also works when there is a single slice.
    _, axes = plt.subplots(sides, sides, figsize=(16, 16), squeeze=False)
    print(a.shape)
    vmin = np.min(a)
    vmax = np.max(a)
    for i in range(num_slices):
        ax = axes[i//sides, i%sides]
        ax.imshow(a[..., i], cmap='gray', vmin=vmin, vmax=vmax)
        ax.set_yticklabels([])
        ax.set_xticklabels([])

In [None]:
# TensorMaps to read from a sample's HD5 file, each looked up by its key with _get_tmap.
tmaps = [_get_tmap('t1_30_slices'), _get_tmap('t1_dicom_30_slices'),_get_tmap('t2_flair_30_slices'), _get_tmap('t2_dicom_30_slices')]

In [None]:
# Load the tensors for one sample and print the shape of each.
# Iterating t only visits TensorMaps that were read successfully; ones that failed are not keys.
t = tmaps_by_sample_id('/mnt/disks/brain-tensors-all-40k/2020-01-14/', '3035859', tmaps)
for k in t:
    print(k.name, 'has', t[k].shape)

In [None]:
# Plot every loaded tensor as a grid of its slices along the last axis.
for k in t:
    plot_array_3d(t[k])

The Erowid Data Scrape.

In [None]:
# http://human.brain-map.org/mri_viewers/data