## Start Master and Slave

Please run the following commands in a terminal, as user `hdfs`, to start the Spark master on the master machine and a slave (worker) on every slave machine. The value of `SPARK_HOME` can be looked up in `env.py`.

```bash
# Switch to the hdfs user (run this on every machine).
su hdfs
# URL of the standalone Spark master and the number of cores each worker offers.
export MASTER=spark://master0.datascience.com:7077
export CORES_PER_WORKER=1
# On the master machine, run:
"${SPARK_HOME}/sbin/start-master.sh"
# On each slave machine, run (3G of memory per worker):
"${SPARK_HOME}/sbin/start-slave.sh" -c "${CORES_PER_WORKER}" -m 3G "${MASTER}"
```

## Set up the Environment

The code in env.py does not need to be changed for a different training process.

In [1]:
# Execute env.py inside this kernel so that whatever it defines is available to the cells below.
# NOTE(review): presumably env.py sets the environment variables read later via os.environ
# (LD_LIBRARY_PATH, HADOOP_HDFS_HOME, CLASSPATH) — confirm against env.py.
%run env.py
# Uncomment the next line to print the current environment variables.
# %env # print env

## Import Libraries

Please import and initialize findspark first, before importing any other pyspark libraries.

In [2]:
import os

# findspark must be initialized before any pyspark module is imported.
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

import numpy
import tensorflow as tf

from tensorflowonspark import TFCluster
from argparse import Namespace

from mnist_train_tf import main_tf
from mnist_train_keras import main_keras

## Set up Spark Config

Our machine is running on spark 2.3.
Spark env properties can be found [here](https://spark.apache.org/docs/2.3.0/configuration.html).
Yarn env properties can be found [here](https://spark.apache.org/docs/2.3.0/running-on-yarn.html).

Some notes:

**spark.submit.deployMode**: please change the value from `client` to `cluster` if you submit your work from a different location; details can be found [here](https://stackoverflow.com/questions/41124428/spark-yarn-cluster-vs-client-how-to-choose-which-one-to-use?rq=1).

**spark.executor.instances**: this should match the number of your worker nodes.

**spark.executor.memory**: check the remaining memory in the YARN ResourceManager UI first; you may want to kill some dead processes beforehand.

**spark.executorEnv.CLASSPATH**: specifies the location of user-defined classes and packages; sometimes this is used for backward compatibility.

In [3]:
# Build the Spark configuration for the standalone cluster and obtain a SparkContext.
conf = SparkConf()

conf.setAll([("spark.app.name", "mnist-standalone-train"), # your app name
             ("spark.master", "spark://master0.datascience.com:7077"), # standalone master URL, please leave this unchanged
             ("spark.cores.max", "2"), # total number of cores this application requests across the cluster
             ("spark.task.cpus", "1"), # number of cores reserved for each task
             ("spark.executorEnv.LD_LIBRARY_PATH", os.environ["LD_LIBRARY_PATH"]), # please leave this unchanged
             ("spark.executorEnv.HADOOP_HDFS_HOME", os.environ["HADOOP_HDFS_HOME"]), # please leave this unchanged
             ("spark.executorEnv.CLASSPATH", os.environ["CLASSPATH"])]) # please leave this unchanged

# getOrCreate() keeps this cell re-runnable: if a previous run failed before sc.stop()
# was reached, the still-active context is reused instead of raising
# "Cannot run multiple SparkContexts at once". Note that `conf` is only applied
# when a new context is actually created.
sc = SparkContext.getOrCreate(conf=conf)

# Ship the training modules to the executors so main_tf / main_keras can be imported there.
sc.addPyFile("mnist_train_tf.py")
sc.addPyFile("mnist_train_keras.py")

# print(sc._conf.getAll())

## Set up Arguments

In [4]:
# Training/inference settings, collected in one Namespace that is handed to TFCluster.run.
args = Namespace(**{
  # number of records per batch
  "batch_size": 100,
  # number of epochs
  "epochs": 1,
  # example format: ["csv", "tfr"]
  "format": "csv",
  # HDFS path to MNIST images in parallelized format
  "images": "hdfs://master0.datascience.com:8020/data/mnist/csv/train/images",
  # HDFS path to MNIST labels in parallelized format
  "labels": "hdfs://master0.datascience.com:8020/data/mnist/csv/train/labels",
  # HDFS path to save/load model during train/inference
  "model": "hdfs://master0.datascience.com:8020/data/mnist/model",
  # number of nodes in the cluster
  "cluster_size": 2,
  # number of parameter servers
  "num_ps": 1,
  # HDFS path to save test/inference output
  "output": "hdfs://master0.datascience.com:8020/data/mnist/predictions",
  # number of reader/enqueue threads
  "readers": 1,
  # maximum number of steps
  "steps": 1000,
  # launch tensorboard process
  "tensorboard": False,
  # train|inference
  "mode": "train",
  # use rdma connection
  "rdma": False,
})

## Main

In [5]:
# Build the RDD of (image, label) records in the format selected by args.format.
if args.format == "tfr":
  images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                               keyClass="org.apache.hadoop.io.BytesWritable",
                               valueClass="org.apache.hadoop.io.NullWritable")

  def toNumpy(bytestr):
    """Parse a serialized tf.train.Example into an (image, label) tuple of numpy arrays.

    Both arrays are built from the int64 lists stored under the 'image' and
    'label' feature keys of the example.
    """
    example = tf.train.Example()
    example.ParseFromString(bytestr)
    features = example.features.feature
    image = numpy.array(features['image'].int64_list.value)
    label = numpy.array(features['label'].int64_list.value)
    return (image, label)

  # The serialized example is the record key (x[0]).
  dataRDD = images.map(lambda x: toNumpy(bytes(x[0])))
else:  # args.format == "csv":
  # Each CSV line becomes a list of ints (images) or floats (labels); the two RDDs are paired up.
  images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
  labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
  dataRDD = images.zip(labels)

# The nested try/finally guarantees cleanup: without it, a failure during
# training/inference skips cluster.shutdown() and sc.stop(), leaving the
# TensorFlow cluster and the Spark application holding executors.
try:
  # train with tensorflow
  cluster = TFCluster.run(sc, main_tf, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK)

  # train with keras
  # cluster = TFCluster.run(sc, main_keras, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK)

  try:
    if args.mode == "train":
      cluster.train(dataRDD, args.epochs)
    else:
      labelRDD = cluster.inference(dataRDD)
      labelRDD.saveAsTextFile(args.output)
  finally:
    # Always attempt to stop the TensorFlow nodes, even if the job above failed.
    cluster.shutdown()
finally:
  # Always release the Spark application so the config cell can create a fresh context.
  sc.stop()

2019-03-17 13:57:52,736 INFO (MainThread-89285) Reserving TFSparkNodes 
2019-03-17 13:57:52,737 INFO (MainThread-89285) cluster_template: {'ps': range(0, 1), 'worker': range(1, 2)}
2019-03-17 13:57:52,740 INFO (MainThread-89285) listening for reservations at ('172.29.0.3', 46085)
2019-03-17 13:57:52,741 INFO (MainThread-89285) Starting TensorFlow on executors
2019-03-17 13:57:52,749 INFO (MainThread-89285) Waiting for TFSparkNodes to start
2019-03-17 13:57:52,750 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:53,752 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:54,754 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:55,756 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:56,758 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:57,760 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:58,762 INFO (MainThread-89285) waiting for 2 reservations
2019-03-17 13:57:59,764 INFO (M

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 4 times, most recent failure: Lost task 2.3 in stage 1.0 (TID 14, 172.29.0.5, executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/hdp/3.1.0.0-78/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/usr/hdp/3.1.0.0-78/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 2440, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 2440, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 2440, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 350, in func
    return f(iterator)
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 799, in func
    r = f(it)
  File "/usr/lib/python3.6/site-packages/tensorflowonspark/TFSparkNode.py", line 406, in _train
    queue.put(item, block=True)
AttributeError: 'AutoProxy[get_queue]' object has no attribute 'put'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1639)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1638)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1638)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1872)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:165)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/hdp/3.1.0.0-78/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/usr/hdp/3.1.0.0-78/spark2/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 2440, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 2440, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 2440, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 350, in func
    return f(iterator)
  File "/usr/hdp/3.1.0.0-78/spark2/python/pyspark/rdd.py", line 799, in func
    r = f(it)
  File "/usr/lib/python3.6/site-packages/tensorflowonspark/TFSparkNode.py", line 406, in _train
    queue.put(item, block=True)
AttributeError: 'AutoProxy[get_queue]' object has no attribute 'put'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:945)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
