# Introduction to ML using ML4H

## Prerequisites
- Basic comfort with Python, some linear algebra, some data science
- Follow the instructions in the main [README](https://github.com/broadinstitute/ml4h) for installing ML4H
- Now we are ready to teach the machines!

In [None]:
# Imports
import os
import sys
import pickle
import random
from typing import List, Dict, Callable
from collections import defaultdict, Counter

import csv
import gzip
import h5py
import shutil
import zipfile
import pydicom
import numpy as np

from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import gridspec

### Linear Regression
We explore machine learning on biomedical data using cloud computing, Python, TensorFlow, and the ml4h codebase.

We will start with linear regression.  Our model is a vector, one weight for each input feature, and a single bias weight.

\begin{equation}
y = xw + b
\end{equation}

For notational convenience, absorb the bias term into the weight vector by prepending a column of ones to the input data matrix $X$:

\begin{equation}
y = [\textbf{1}, X][b, \textbf{w}]^T
\end{equation}

#### The Dense Layer is Matrix (Tensor) Multiplication
![Matrix Multiplication](https://www.mathwarehouse.com/algebra/matrix/images/matrix-multiplication/how-to-multiply-2-matrices-demo.gif)

In [None]:
def linear_regression():
    """Fit a single-weight, single-bias Keras model to noisy samples of a line.

    Generates 40 evenly spaced x values in [-1, 1] and targets
    y = 2.0 * x + 0.5 plus Gaussian noise scaled by 0.1, trains one
    ``Dense(1)`` layer with SGD on mean squared error for 6 epochs, prints the
    learned slope and bias next to the real ones, and plots the samples
    together with the fitted line (in red).
    """
    samples = 40
    real_weight, real_bias = 2.0, 0.5
    x = np.linspace(-1, 1, samples)
    noise = np.random.randn(*x.shape) * 0.1
    y = real_weight*x + real_bias + noise

    # One dense unit with one input is exactly y = w*x + b.
    linear_model = Sequential([Dense(1, input_dim=1)])
    linear_model.compile(loss='mean_squared_error', optimizer='sgd')
    linear_model.summary()
    linear_model.fit(x, y, batch_size=1, epochs=6)

    # get_weights() yields the kernel first and the bias vector second.
    kernel, bias = linear_model.get_weights()
    learned_slope = kernel[0][0]
    learned_bias = bias[0]
    print('Learned slope:',  learned_slope, 'real slope:', real_weight, 'learned bias:', learned_bias, 'real bias:', real_bias)

    # Draw the fitted line through its values at x = -1 and x = 1.
    line_x = [-1,1]
    line_y = [-learned_slope+learned_bias, learned_slope+learned_bias]
    plt.plot(x, y)
    plt.plot(line_x, line_y, 'r')
    plt.show()
    print('Linear Regression complete!')

In [None]:
# Run the demo: trains the model, prints learned vs. real parameters and shows the plot.
linear_regression()

## Now Logistic Regression:
We take the real-valued predictions from linear regression and squish them with a sigmoid.

\begin{equation}
\textbf{y} = \sigma(X\textbf{w} + b)
\end{equation}

where 
\begin{equation}
\sigma(x) = \frac{e^x}{1+e^x} = \frac{1}{1+e^{-x}}
\end{equation}

In [None]:
def sigmoid(x):
    """Apply the logistic sigmoid 1 / (1 + exp(-v)) to every item of *x*.

    The value is computed as exp(-log(1 + exp(-v))) via ``np.logaddexp``,
    which stays finite for inputs of any magnitude. The direct form
    exp(v) / (1 + exp(v)) overflows to inf / inf = nan for large positive v.

    :param x: iterable of numbers (e.g. a 1-D numpy array).
    :return: list with the sigmoid of each item, in input order.
    """
    return [np.exp(-np.logaddexp(0, -item)) for item in x]

# Evaluate the sigmoid on a grid starting at -10 with step 0.2 (stopping before 10)
# and plot the resulting S-shaped curve.
x = np.arange(-10., 10., 0.2)
sig = sigmoid(x)
plt.plot(x,sig)
plt.show()

In [None]:
def logistic_regression(epochs=600, num_labels=10, batch_size=8192):
    """Train a softmax (multinomial logistic) regression on MNIST.

    The model is a single Dense layer mapping the 784 pixels to ``num_labels``
    softmax outputs. Its weight columns are plotted as image "templates"
    before training and every 100 steps, together with the loss and accuracy
    on the test split.

    :param epochs: number of training steps; each step trains on one randomly
        sampled batch (not a full pass over the training set).
    :param num_labels: number of output classes.
    :param batch_size: number of training examples sampled (without
        replacement) per step; clamped to the size of the training set.
    """
    # load_data returns (train, valid, test) in that order. The validation
    # split is not used here, so the reported metrics really are test metrics.
    train, _, test = load_data('mnist.pkl.gz')

    train_y = make_one_hot(train[1], num_labels)
    test_y = make_one_hot(test[1], num_labels)

    logistic_model = Sequential()
    logistic_model.add(Dense(num_labels, activation='softmax', input_dim=784, name='mnist_templates'))
    logistic_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    logistic_model.summary()

    # Show the randomly initialised weights before any training.
    templates = logistic_model.layers[0].get_weights()[0]
    plot_templates(templates, 0)
    print('weights shape:', templates.shape)

    num_train = train[0].shape[0]
    # random.sample raises ValueError when asked for more items than exist.
    batch_size = min(batch_size, num_train)
    for e in range(epochs):
        trainidx = random.sample(range(num_train), batch_size)
        x_batch = train[0][trainidx,:]
        y_batch = train_y[trainidx]
        logistic_model.train_on_batch(x_batch, y_batch)
        if e % 100 == 0:
            plot_templates(logistic_model.layers[0].get_weights()[0], e)
            print('Logistic Model test set loss and accuracy:', logistic_model.evaluate(test[0], test_y), 'at epoch', e)


def plot_templates(templates, epoch):
    """Plot each class's weights as a 28x28 image and save the figure as a PNG.

    The number of templates is derived from the array size, so the function
    works for any number of classes rather than only 10. Images are laid out
    five per row.

    :param templates: numpy array holding 784 * n values that reshapes to
        (28, 28, n), e.g. the (784, n) kernel of the Dense layer.
    :param epoch: value embedded in the output file name
        ``./regression_example/mnist_templates_<epoch>.png``.
    """
    n = templates.size // (28 * 28)
    templates = templates.reshape((28, 28, n))
    columns = 5
    rows = max(1, -(-n // columns))  # ceiling division; 2 rows for n == 10
    plt.figure(figsize=(16, 8))
    for i in range(n):
        ax = plt.subplot(rows, columns, i+1)
        plt.imshow(templates[:, :, i])
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

    plot_name = "./regression_example/mnist_templates_"+str(epoch)+".png"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(plot_name), exist_ok=True)
    plt.savefig(plot_name)


def make_one_hot(y, num_labels):
    """Convert integer class labels into a one-hot encoded float matrix.

    :param y: 1-D sequence or array of integer labels, each a valid column
        index for ``num_labels`` columns.
    :param num_labels: number of classes, i.e. columns of the result.
    :return: array of shape (len(y), num_labels) with 1.0 at [i, y[i]] and 0.0
        everywhere else.
    """
    labels = np.asarray(y)
    ohy = np.zeros((len(labels), num_labels))
    # An empty input becomes a float array, which cannot be used as an index.
    if labels.size:
        # One vectorised assignment instead of a Python loop over every row.
        ohy[np.arange(len(labels)), labels] = 1.0
    return ohy


def load_data(dataset):
    """Load a gzipped, pickled dataset, downloading MNIST first if it is missing.

    :param dataset: the path to the dataset (here MNIST). A bare file name that
        does not exist in the working directory is looked up under ``data/``.
        Only a file named ``mnist.pkl.gz`` is downloaded when absent.
    :return: the three objects stored in the pickle, unpacked and returned as
        ``(train_set, valid_set, test_set)`` in that order.
    """
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join("data", dataset)
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from urllib.request import urlretrieve
        origin = ('http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz')
        print('Downloading data from %s' % origin)
        target_dir = os.path.dirname(dataset)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        urlretrieve(origin, dataset)

    print('loading data...')
    # NOTE(security): unpickling can execute arbitrary code, and the file may
    # have been fetched over plain HTTP. Only load archives you trust.
    # encoding='latin1' is needed to read numpy arrays pickled by Python 2.
    # The context manager closes the file even if unpickling fails.
    with gzip.open(dataset, 'rb') as f:
        train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

    return train_set, valid_set, test_set

def plot_mnist(sides):
    """Show the first MNIST training digits in a square grid of grayscale images.

    :param sides: requested number of images (despite the name). The grid side
        length is ceil(sqrt(sides)), so the full grid is filled and more
        images than requested may be shown.
    """
    train, _, _ = load_data('mnist.pkl.gz')
    print(train[0].shape)
    mnist_images = train[0].reshape((-1, 28, 28, 1))
    num_images = mnist_images.shape[0]
    sides = int(np.ceil(np.sqrt(min(sides, num_images))))
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid, where
    # plt.subplots would otherwise return a single Axes that cannot be indexed.
    _, axes = plt.subplots(sides, sides, figsize=(16, 16), squeeze=False)
    # The rounded-up grid can have more cells than there are images.
    for i in range(min(sides*sides, num_images)):
        axes[i // sides, i % sides].imshow(mnist_images[i, ..., 0], cmap='gray')

# Look B4 U Learn!

In [None]:
# Request 25 digits, which plot_mnist lays out as a 5x5 grid.
plot_mnist(25)

## Cross Entropy Loss:
Our favorite loss function for categorical data.
\begin{equation}
L(true, model) = -\sum_{x\in\mathcal{X}} true(x)\, \log model(x)
\end{equation}

Binary cross entropy with $N$ data points $x$ each with a binary label: 
\begin{equation}
true(x) \in \{0, 1\} \\
L(true, model) = -\frac{1}{N}\sum^N_{i=1} \left[ true(x_i)\log(model(x_i)) + (1-true(x_i))\log(1-model(x_i)) \right]
\end{equation}

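Cross entropy equals the Kullback-Leibler divergence between the true distribution and the predicted one, plus the entropy of the true distribution. For one-hot labels that entropy is zero, so minimizing cross entropy is the same as minimizing the KL divergence.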
This function emerges in many fields as diverse as probability, information theory, and physics.
What is the information difference between the truth and our model? How much information do I lose by replacing the truth with the model's predictions? What is the temperature difference between my predictions and the truth?!

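Categorical cross entropy with $K$ different classes or labels, where $y_{ik}$ is 1 if example $x_i$ has label $k$ (and 0 otherwise) and $q_k(x_i)$ is the model's predicted probability of label $k$: 
\begin{equation}
true(x) \in \{1, 2, ..., K\} \\
L(true, model) = -\frac{1}{N}\sum^N_{i=1}\sum^K_{k=1} y_{ik}\log(q_k(x_i))
\end{equation}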

In [None]:
# Train the softmax regression with its default settings (600 steps, 10 labels).
logistic_regression()

# Deep Models: "Hidden" Layers and The MultiLayerPerceptron

In [None]:
def multilayer_perceptron():
    """Train a one-hidden-layer perceptron on MNIST and report test metrics.

    The network is 784 inputs -> Dense(500, relu) -> Dense(10, softmax),
    trained with SGD on categorical cross entropy for 3 epochs with batch size
    32. The validation split is monitored during training and the test split
    is evaluated once at the end.
    """
    # load_data returns (train, valid, test) in that order. Unpacking in any
    # other order would validate on test data and report validation metrics
    # as test metrics.
    train, valid, test = load_data('mnist.pkl.gz')

    num_labels = 10
    train_y = make_one_hot(train[1], num_labels)
    valid_y = make_one_hot(valid[1], num_labels)
    test_y = make_one_hot(test[1], num_labels)

    mlp_model = Sequential()
    mlp_model.add(Dense(500, activation='relu', input_dim=784))
    mlp_model.add(Dense(num_labels, activation='softmax'))
    mlp_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    mlp_model.summary()
    mlp_model.fit(train[0], train_y, validation_data=(valid[0],valid_y), batch_size=32, epochs=3)
    print('Multilayer Perceptron trained. Test set loss and accuracy:', mlp_model.evaluate(test[0], test_y))

# Train the multilayer perceptron and print its test set loss and accuracy.
multilayer_perceptron()

# Convolutions Flip, Slide, Multiply, Add
Convolutions look for their kernel in a larger signal.

In convolution, you always and only find what you're looking with.

Convolution and cross correlation are deeply related:

\begin{equation}
f(t) \circledast g(t) \triangleq\ \int_{-\infty}^\infty f(\tau) g(t - \tau) \, d\tau = \int_{-\infty}^\infty f(t-\tau) g(\tau)\, d\tau.
\end{equation}



![title](https://upload.wikimedia.org/wikipedia/commons/2/21/Comparison_convolution_correlation.svg)

In [None]:
def convolutional_neural_network(filters=32, kernel_size=(3,3), padding='valid', num_labels=10):
    """Train a small convolutional network on MNIST and report test metrics.

    The network stacks three relu Conv2D layers that share the same settings,
    then Flatten, Dense(16, relu) and a softmax output layer. It is trained
    with SGD on categorical cross entropy for 3 epochs with batch size 32.

    :param filters: number of filters in each convolutional layer.
    :param kernel_size: kernel size of each convolutional layer.
    :param padding: padding mode passed to each convolutional layer.
    :param num_labels: number of output classes.
    """
    # load_data returns (train, valid, test) in that order. Unpacking in any
    # other order would validate on test data and report validation metrics
    # as test metrics.
    train, valid, test = load_data('mnist.pkl.gz')

    train_y = make_one_hot(train[1], num_labels)
    valid_y = make_one_hot(valid[1], num_labels)
    test_y = make_one_hot(test[1], num_labels)

    print(train[0].shape)
    # Conv2D expects image tensors: (examples, height, width, channels).
    mnist_images = train[0].reshape((-1, 28, 28, 1))
    mnist_valid = valid[0].reshape((-1, 28, 28, 1))
    mnist_test = test[0].reshape((-1, 28, 28, 1))

    cnn_model = Sequential()
    cnn_model.add(Conv2D(input_shape=(28, 28, 1), filters=filters, kernel_size=kernel_size, padding=padding, activation='relu'))
    cnn_model.add(Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, activation='relu'))
    cnn_model.add(Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, activation='relu'))
    cnn_model.add(Flatten())
    cnn_model.add(Dense(16, activation='relu'))
    cnn_model.add(Dense(num_labels, activation='softmax'))
    cnn_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    cnn_model.summary()
    cnn_model.fit(mnist_images, train_y, validation_data=(mnist_valid, valid_y), batch_size=32, epochs=3)

    print('Convolutional Neural Network trained. Test set loss and accuracy:', cnn_model.evaluate(mnist_test, test_y))

# Train the CNN with its default settings (32 filters, 3x3 kernels, 'valid' padding).
convolutional_neural_network()

# Why (and When!) is Convolution Helpful?
- Decouples input size from model size
- Translationally Equivariant (Not Invariant), so we can find features wherever they might occur in the signal
- Local structure is often informative
- But not always! (e.g., tabular data)