In [1]:
from __future__ import division, print_function, unicode_literals
import numpy as np

# IRIS dataset

In [2]:
from pyspark.sql.types import *

fields = [StructField(field_name, DoubleType(), True) 
          for field_name in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + \
    [StructField('species', StringType(), True)]
    
schema = StructType(fields)

iris_df = sqlContext.read.csv("../dataset/iris.csv", header=True, schema=schema)

In [3]:
iris_df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [4]:
iris_df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# features
feature_assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features")

# labels
str_indexer = StringIndexer(inputCol="species", outputCol="species_index")
label_encoder = OneHotEncoder(inputCol="species_index", outputCol="labels")

# pipeline
pipeline = Pipeline(stages=[feature_assembler, str_indexer, label_encoder])

In [6]:
iris_pipeline = pipeline.fit(iris_df)
iris_dataset_df = iris_pipeline.transform(iris_df, {label_encoder.dropLast: False})

In [7]:
iris_dataset_df.select('features', 'labels').show(20)

+-----------------+-------------+
|         features|       labels|
+-----------------+-------------+
|[5.1,3.5,1.4,0.2]|(3,[2],[1.0])|
|[4.9,3.0,1.4,0.2]|(3,[2],[1.0])|
|[4.7,3.2,1.3,0.2]|(3,[2],[1.0])|
|[4.6,3.1,1.5,0.2]|(3,[2],[1.0])|
|[5.0,3.6,1.4,0.2]|(3,[2],[1.0])|
|[5.4,3.9,1.7,0.4]|(3,[2],[1.0])|
|[4.6,3.4,1.4,0.3]|(3,[2],[1.0])|
|[5.0,3.4,1.5,0.2]|(3,[2],[1.0])|
|[4.4,2.9,1.4,0.2]|(3,[2],[1.0])|
|[4.9,3.1,1.5,0.1]|(3,[2],[1.0])|
|[5.4,3.7,1.5,0.2]|(3,[2],[1.0])|
|[4.8,3.4,1.6,0.2]|(3,[2],[1.0])|
|[4.8,3.0,1.4,0.1]|(3,[2],[1.0])|
|[4.3,3.0,1.1,0.1]|(3,[2],[1.0])|
|[5.8,4.0,1.2,0.2]|(3,[2],[1.0])|
|[5.7,4.4,1.5,0.4]|(3,[2],[1.0])|
|[5.4,3.9,1.3,0.4]|(3,[2],[1.0])|
|[5.1,3.5,1.4,0.3]|(3,[2],[1.0])|
|[5.7,3.8,1.7,0.3]|(3,[2],[1.0])|
|[5.1,3.8,1.5,0.3]|(3,[2],[1.0])|
+-----------------+-------------+
only showing top 20 rows



In [8]:
iris_data = iris_dataset_df.select('features', 'labels').rdd \
    .map(lambda r: {f: r[f].toArray().reshape(1, -1) for f in ['features', 'labels']}) 

# Minibatch generator

In [9]:
train_data, test_data = iris_data.sortBy(lambda x: np.random.random()).randomSplit([0.65, 0.35])

In [10]:
train_data.persist()

PythonRDD[18] at RDD at PythonRDD.scala:48

In [12]:
import math

def concate_data(x, y):
    dataset = {}
    for f in ['features', 'labels']:
        dataset[f] = np.concatenate((x[f], y[f]), axis=0)
    return dataset


def next_batch_rdd_maker(data_rdd, batch_size, epoch_limit=None):
    data_cnt = data_rdd.count()
    per_batch_ratios = [batch_size / data_cnt] * int(math.ceil(data_cnt / batch_size))
    def next_batch():
        current_epoch = 0
        while True:
            current_epoch += 1
            if epoch_limit is not None and current_epoch > epoch_limit: 
                break 
            batches = data_rdd \
                .sortBy(lambda x: np.random.random()) \
                .randomSplit(per_batch_ratios)
            for batch in batches:
                dataset = batch.reduce(concate_data)
                yield dataset['features'], dataset['labels']
    return next_batch

In [13]:
test_data_dict = test_data.reduce(concate_data)
test_data_dict['features'][:5], test_data_dict['labels'][:5]

(array([[ 6.3,  3.4,  5.6,  2.4],
        [ 6.4,  3.2,  5.3,  2.3],
        [ 4.4,  3. ,  1.3,  0.2],
        [ 6. ,  2.2,  5. ,  1.5],
        [ 4.9,  3.1,  1.5,  0.1]]), array([[ 0.,  1.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.]]))

In [14]:
next_batch = next_batch_rdd_maker(train_data, 10)

In [15]:
next(next_batch())

(array([[ 5. ,  3.3,  1.4,  0.2],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.7,  3. ,  4.2,  1.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 6.5,  2.8,  4.6,  1.5],
        [ 5. ,  3.5,  1.3,  0.3],
        [ 6.4,  3.2,  5.3,  2.3],
        [ 6.7,  3. ,  5.2,  2.3]]), array([[ 0.,  0.,  1.],
        [ 0.,  0.,  1.],
        [ 1.,  0.,  0.],
        [ 0.,  0.,  1.],
        [ 1.,  0.,  0.],
        [ 0.,  0.,  1.],
        [ 0.,  1.,  0.],
        [ 0.,  1.,  0.]]))

In [21]:
epoches = 20
data_cnt = train_data.count()
batch_size = 12
n_batch = data_cnt // batch_size

print(data_cnt, batch_size, n_batch)

88 12 7


## Tensorflow example 

In [85]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [89]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=[None, 4])
Y = tf.placeholder(tf.float32, shape=[None, 3])

with tf.name_scope('dnn'):
    hidden1 = fully_connected(X, 128, scope="hidden1")
    hidden2 = fully_connected(hidden1, 32, scope="hidden2")
    logits = fully_connected(hidden1, 3, scope="logits",
                            activation_fn=None)

with tf.name_scope('loss'):
    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    correct = tf.equal(
        tf.arg_max(logits, 1), tf.arg_max(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    
    for epoch in xrange(epoches):
        i = 0
        next_batch = next_batch_rdd_maker(train_data, batch_size)
        for X_batch, Y_batch in next_batch():
            the_loss, _ = sess.run([loss, training_op], feed_dict={X: X_batch, Y: Y_batch})
            i += 1
            if i >= n_batch:
                break
        acc_test = accuracy.eval(feed_dict={X: test_data_dict['features'], Y: test_data_dict['labels']})
        print('epoch: {}, test accuracy: {}'.format(epoch+1, acc_test))

epoch: 1, test accuracy: 0.681818187237
epoch: 2, test accuracy: 0.931818187237
epoch: 3, test accuracy: 0.659090936184
epoch: 4, test accuracy: 0.840909063816
epoch: 5, test accuracy: 0.977272748947
epoch: 6, test accuracy: 0.931818187237
epoch: 7, test accuracy: 0.95454543829
epoch: 8, test accuracy: 0.95454543829
epoch: 9, test accuracy: 0.931818187237
epoch: 10, test accuracy: 0.95454543829
epoch: 11, test accuracy: 0.95454543829
epoch: 12, test accuracy: 0.977272748947
epoch: 13, test accuracy: 0.977272748947
epoch: 14, test accuracy: 0.977272748947
epoch: 15, test accuracy: 0.977272748947
epoch: 16, test accuracy: 0.977272748947
epoch: 17, test accuracy: 0.931818187237
epoch: 18, test accuracy: 0.727272748947
epoch: 19, test accuracy: 0.95454543829
epoch: 20, test accuracy: 0.931818187237


## Keras example 

In [16]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [23]:
adam_optimizer = Adam(lr=0.001)

model = Sequential()
model.add(Dense(128, input_shape=(4,)))
model.add(Dense(32))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer=adam_optimizer,
      loss='categorical_crossentropy',
      metrics=['accuracy'])

for epoch in xrange(epoches):
    i = 0
    next_batch = next_batch_rdd_maker(train_data, batch_size)
    for X_batch, Y_batch in next_batch():
        model.train_on_batch(X_batch, Y_batch)
        i += 1
        if i >= n_batch:
            break
    metrics = model.test_on_batch(test_data_dict['features'], test_data_dict['labels'])
    print('epoch: {}, test accuracy: {}'.format(epoch+1, metrics[1]))

epoch: 1, test accuracy: 0.306451618671
epoch: 2, test accuracy: 0.741935491562
epoch: 3, test accuracy: 0.741935491562
epoch: 4, test accuracy: 0.629032254219
epoch: 5, test accuracy: 0.854838728905
epoch: 6, test accuracy: 0.854838728905
epoch: 7, test accuracy: 0.741935491562
epoch: 8, test accuracy: 0.677419364452
epoch: 9, test accuracy: 0.93548387289
epoch: 10, test accuracy: 0.693548381329
epoch: 11, test accuracy: 0.983870983124
epoch: 12, test accuracy: 0.822580635548
epoch: 13, test accuracy: 0.951612889767
epoch: 14, test accuracy: 0.967741906643
epoch: 15, test accuracy: 1.0
epoch: 16, test accuracy: 0.983870983124
epoch: 17, test accuracy: 0.93548387289
epoch: 18, test accuracy: 1.0
epoch: 19, test accuracy: 0.951612889767
epoch: 20, test accuracy: 0.951612889767
