In [1]:
from __future__ import division, print_function, unicode_literals
import numpy as np

In [2]:
! ls dataset

iris.csv news.txt


# IRIS dataset

In [8]:
from pyspark.sql.types import *

fields = [StructField(field_name, DoubleType(), True) 
          for field_name in ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] + \
    [StructField('species', StringType(), True)]
    
schema = StructType(fields)

iris_df = sqlContext.read.csv("dataset/iris.csv", header=True, schema=schema)

In [9]:
iris_df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [10]:
iris_df.show(5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [11]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

str_indexer = StringIndexer(inputCol="species", outputCol="species_index")
label_encoder = OneHotEncoder(inputCol="species_index", outputCol="labels")
feature_assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features")
pipeline = Pipeline(stages=[str_indexer, label_encoder, feature_assembler])

In [12]:
iris_pipeline = pipeline.fit(iris_df)
iris_dataset_df = iris_pipeline.transform(iris_df, {label_encoder.dropLast: False})

In [13]:
iris_dataset_df.show(20)

+------------+-----------+------------+-----------+-------+-------------+-------------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|species_index|       labels|         features|
+------------+-----------+------------+-----------+-------+-------------+-------------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|          2.0|(3,[2],[1.0])|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|          2.0|(3,[2],[1.0])|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|          2.0|(3,[2],[1.0])|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|          2.0|(3,[2],[1.0])|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|          2.0|(3,[2],[1.0])|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| setosa|          2.0|(3,[2],[1.0])|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|       

In [14]:
iris_data = iris_dataset_df.select('features', 'labels').rdd \
    .map(lambda r: {f: r[f].toArray().reshape(1, -1) for f in ['features', 'labels']}) 

# Minibatch generator

In [15]:
train_data, test_data = iris_data.randomSplit([0.5, 0.5])

In [16]:
train_data.persist()

PythonRDD[43] at RDD at PythonRDD.scala:48

In [17]:
import math

def concate_data(x, y):
    dataset = {}
    for f in ['features', 'labels']:
        dataset[f] = np.concatenate((x[f], y[f]), axis=0)
    return dataset


def next_batch_rdd_maker(data_rdd, batch_size):
    data_cnt = float(data_rdd.count())
    per_batch_ratios = [batch_size / data_cnt] * int(math.ceil(data_cnt / batch_size))
    def next_batch():
        current_epoch = 0
        while True:
#             current_epoch += 1
#             if epoch_limit & current_epoch > epoch_limit: 
#                 break 
            batches = data_rdd \
                .sortBy(lambda x: np.random.random()) \
                .randomSplit(per_batch_ratios)
            for batch in batches:
                dataset = batch.reduce(concate_data)
                yield dataset['features'], dataset['labels']
    return next_batch

In [18]:
test_data_dict = test_data.reduce(concate_data)

In [19]:
next_batch = next_batch_rdd_maker(train_data, 16)

In [35]:
next(next_batch())

(array([[ 6.5,  3. ,  5.8,  2.2],
        [ 5.8,  2.7,  3.9,  1.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 6.5,  3.2,  5.1,  2. ],
        [ 6.6,  3. ,  4.4,  1.4],
        [ 5.5,  2.4,  3.8,  1.1],
        [ 5.6,  3. ,  4.1,  1.3],
        [ 6.8,  3.2,  5.9,  2.3],
        [ 5.2,  3.4,  1.4,  0.2],
        [ 6.1,  2.8,  4.7,  1.2],
        [ 5. ,  2. ,  3.5,  1. ]]), array([[ 0.,  1.,  0.],
        [ 1.,  0.,  0.],
        [ 0.,  0.,  1.],
        [ 0.,  1.,  0.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.]]))

## Tensorflow example 

In [36]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [39]:
epoches = 20
data_cnt = train_data.count()
batch_size = 16
n_batch = data_cnt // batch_size

print(data_cnt, batch_size, n_batch)

81 16 5


In [42]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=[None, 4])
Y = tf.placeholder(tf.float32, shape=[None, 3])

with tf.name_scope('dnn'):
    hidden1 = fully_connected(X, 128, scope="hidden1")
    hidden2 = fully_connected(hidden1, 32, scope="hidden2")
    logits = fully_connected(hidden1, 3, scope="logits",
                            activation_fn=None)

with tf.name_scope('loss'):
    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    correct = tf.equal(
        tf.arg_max(logits, 1), tf.arg_max(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    
    for epoch in xrange(epoches):
        i = 0
        next_batch = next_batch_rdd_maker(train_data, batch_size)
        #next_batch = next_sample_rdd_maker(train_data, batch_size)
        for X_batch, Y_batch in next_batch():
            the_loss, _ = sess.run([loss, training_op], feed_dict={X: X_batch, Y: Y_batch})
            i += 1
            if i >= n_batch:
                break
        acc_test = accuracy.eval(feed_dict={X: test_data_dict['features'], Y: test_data_dict['labels']})
        print('epoch: {}, test accuracy: {}'.format(epoch+1, acc_test))

epoch: 1, test accuracy: 0.637681186199
epoch: 2, test accuracy: 0.637681186199
epoch: 3, test accuracy: 0.637681186199
epoch: 4, test accuracy: 0.637681186199
epoch: 5, test accuracy: 0.942028999329
epoch: 6, test accuracy: 0.753623187542
epoch: 7, test accuracy: 0.956521749496
epoch: 8, test accuracy: 0.942028999329
epoch: 9, test accuracy: 0.956521749496
epoch: 10, test accuracy: 0.956521749496
epoch: 11, test accuracy: 0.956521749496
epoch: 12, test accuracy: 0.971014499664
epoch: 13, test accuracy: 0.971014499664
epoch: 14, test accuracy: 0.971014499664
epoch: 15, test accuracy: 0.971014499664
epoch: 16, test accuracy: 0.956521749496
epoch: 17, test accuracy: 0.956521749496
epoch: 18, test accuracy: 0.971014499664
epoch: 19, test accuracy: 0.971014499664
epoch: 20, test accuracy: 0.985507249832


## Keras example 

In [46]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [48]:
adam_optimizer = Adam(lr=0.001)

model = Sequential()
model.add(Dense(128, input_shape=(4,)))
model.add(Dense(32))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer=adam_optimizer,
      loss='categorical_crossentropy',
      metrics=['accuracy'])

for epoch in xrange(epoches):
    i = 0
    next_batch = next_batch_rdd_maker(train_data, batch_size)
    for X_batch, Y_batch in next_batch():
        model.train_on_batch(X_batch, Y_batch)
        i += 1
        if i >= n_batch:
            break
    evaluate = model.test_on_batch(test_data_dict['features'], test_data_dict['labels'])
    print('epoch: {}, test accuracy: {}'.format(epoch+1, acc_test))

epoch: 1, test accuracy: 0.985507249832
epoch: 2, test accuracy: 0.985507249832
epoch: 3, test accuracy: 0.985507249832
epoch: 4, test accuracy: 0.985507249832
epoch: 5, test accuracy: 0.985507249832
epoch: 6, test accuracy: 0.985507249832
epoch: 7, test accuracy: 0.985507249832
epoch: 8, test accuracy: 0.985507249832
epoch: 9, test accuracy: 0.985507249832
epoch: 10, test accuracy: 0.985507249832
epoch: 11, test accuracy: 0.985507249832
epoch: 12, test accuracy: 0.985507249832
epoch: 13, test accuracy: 0.985507249832
epoch: 14, test accuracy: 0.985507249832
epoch: 15, test accuracy: 0.985507249832
epoch: 16, test accuracy: 0.985507249832
epoch: 17, test accuracy: 0.985507249832
epoch: 18, test accuracy: 0.985507249832
epoch: 19, test accuracy: 0.985507249832
epoch: 20, test accuracy: 0.985507249832
