## Quick Setup

In [None]:
# Add the SystemML PySpark API file to the Spark context. The URL embeds a
# specific commit hash, so the same revision of SystemML.py is fetched each run.
# NOTE(review): `sc` is not defined in this notebook; presumably the kernel
# provides a SparkContext under that name -- confirm.
sc.addPyFile("https://raw.githubusercontent.com/apache/incubator-systemml/3d5f9b11741f6d6ecc6af7cbaa1069cde32be838/src/main/java/org/apache/sysml/api/python/SystemML.py")

# Create the SystemML MLContext object that the later cells use to run DML
# scripts. The import is kept after addPyFile on purpose: the SystemML module
# is presumably only importable once the file above has been added.
from SystemML import MLContext
ml = MLContext(sc)

## Download Data - MNIST

The MNIST dataset contains labeled images of handwritten digits. Each example is a 28x28 pixel image of grayscale values in the range [0,255], flattened into a row of 784 pixel values, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples and 10,000 test examples as CSV files, where each row has the format "label, pixel_1, pixel_2, ..., pixel_n".

In [None]:
%%sh
# Download the MNIST train/test CSV files into data/mnist/.
# Abort on the first failing command so a failed `cd` or download is not
# silently ignored and later cells do not read a missing or bogus file.
set -e
mkdir -p data/mnist/
cd data/mnist/
# -f: exit non-zero on HTTP errors instead of saving the error page as a .csv
# -L: follow redirects (e.g. http -> https)
# -O: keep the remote file name
curl -fLO http://pjreddie.com/media/files/mnist_train.csv
curl -fLO http://pjreddie.com/media/files/mnist_test.csv

## SystemML "LeNet" Neural Network

### 1. Train

In [None]:
# DML script run by SystemML. It reads the CSV named by $data, scales pixels to
# [-1,1], one-hot encodes the labels, holds out the first 5,000 rows for
# validation, and trains through mnist_lenet::train.
# NOTE(review): "mnist_lenet.dml" is sourced by relative path, so it presumably
# has to be resolvable from the working directory -- confirm.
# NOTE(review): the write() targets $W1out ... $b4out are not supplied in the
# arguments passed to executeScript below; presumably the variables listed in
# `outputs` are handed back in memory instead -- confirm.
script = """
source("mnist_lenet.dml") as mnist_lenet

# Read training data
data = read($data, format="csv")
n = nrow(data)
C = $C
Hin = $Hin
Win = $Win

# Extract images and labels
images = data[,2:ncol(data)]
labels = data[,1]

# Scale images to [-1,1], and one-hot encode the labels
images = (images / 255.0) * 2 - 1
labels = table(seq(1, n), labels+1, n, 10)

# Split into training (55,000 examples) and validation (5,000 examples)
X = images[5001:nrow(images),]
X_val = images[1:5000,]
y = labels[5001:nrow(images),]
y_val = labels[1:5000,]

# Train
[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win)

# Write model out
write(W1, $W1out)
write(b1, $b1out)
write(W2, $W2out)
write(b2, $b2out)
write(W3, $W3out)
write(b3, $b3out)
write(W4, $W4out)
write(b4, $b4out)

print("")
print("")
"""
# Presumably clears anything registered on the MLContext by an earlier run, so
# re-executing this cell starts clean -- confirm against the SystemML API.
ml.reset()
# The dict fills the $data, $C, $Hin and $Win placeholders in the script.
# `outputs` names the script variables that the next cell fetches from `out`.
out = ml.executeScript(script, {"data": "data/mnist/mnist_train.csv",
                                "C": 1, "Hin": 28, "Win": 28},
                       outputs=["W1", "b1", "W2", "b2", "W3", "b3", "W4", "b4"])

### 2. Extract model from SystemML back into PySpark

In [None]:
# Fetch every trained parameter from the SystemML result as a DataFrame,
# ordering rows by the "ID" column and then dropping that column.
W1, b1, W2, b2, W3, b3, W4, b4 = (
    out.getDF(sqlContext, param).sort("ID").drop("ID")
    for param in ("W1", "b1", "W2", "b2", "W3", "b3", "W4", "b4")
)

### 3. Compute Test Accuracy

In [None]:
# DML script that scores the trained LeNet model. It reads the CSV named by
# $data, applies the same [-1,1] scaling and one-hot encoding used in training,
# runs mnist_lenet::predict with the supplied weights, and prints the accuracy
# returned by mnist_lenet::eval.
script = """
source("mnist_lenet.dml") as mnist_lenet

# Read test data
data = read($data, format="csv")
n = nrow(data)
C = $C
Hin = $Hin
Win = $Win

# Extract images and labels
X_test = data[,2:ncol(data)]
y_test = data[,1]

# Scale images to [-1,1], and one-hot encode the labels
X_test = (X_test / 255.0) * 2 - 1
y_test = table(seq(1, n), y_test+1, n, 10)

# Read model coefficients
W1 = read($W1)
b1 = read($b1)
W2 = read($W2)
b2 = read($b2)
W3 = read($W3)
b3 = read($b3)
W4 = read($W4)
b4 = read($b4)

# Eval on test set
probs = mnist_lenet::predict(X_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
[loss, accuracy] = mnist_lenet::eval(probs, y_test)

print("Test Accuracy: " + accuracy)

print("")
print("")
"""
ml.reset()
# Evaluate on the held-out test file (mnist_test.csv), not the training file:
# scoring the data the model was fitted on would overstate the test accuracy.
# W1..b4 are the DataFrames extracted from the training run.
ml.executeScript(script, {"data": "data/mnist/mnist_test.csv",
                          "C": 1, "Hin": 28, "Win": 28,
                          "W1": W1, "b1": b1,
                          "W2": W2, "b2": b2,
                          "W3": W3, "b3": b3,
                          "W4": W4, "b4": b4})