# Analysis

**Joeri Hermans** (Technical Student, IT-DB-SAS, CERN)             
*Department of Knowledge Engineering*  
*Maastricht University, The Netherlands*

In this section we will analyze a relatively large preprocessed dataset and apply some deep learning models to it.

In [1]:
import numpy as np

import time

import requests

from keras.optimizers import *
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

from pyspark import SparkContext
from pyspark import SparkConf

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from distkeras.distributed import *
from distkeras.utils import *

Using TensorFlow backend.


In [2]:
# User-tunable settings: adjust these to match your environment.
application_name = "Distributed Deep Learning: Analysis"
using_spark_2 = False
# Address:port of the YARN ResourceManager; an empty value selects local mode.
yarn = "p01001532067275.cern.ch:8088"
if yarn:
    # A ResourceManager is configured: tell master to use YARN.
    master = "yarn-client"
    num_cores = 2
    max_num_executors = 11
else:
    # No ResourceManager configured: tell master to use local resources.
    master = "local[*]"
    num_executors = 1
    num_cores = 3

In [3]:
# When a YARN ResourceManager is configured, derive the executor count from
# the number of nodes the cluster currently reports as active.
if yarn:
    # Build the ResourceManager metrics URI.
    yarn_metrics_uri = "http://" + yarn + "/ws/v1/cluster/metrics"
    # Query the cluster metrics. The timeout prevents the notebook from
    # hanging indefinitely when the ResourceManager is unreachable.
    response = requests.get(yarn_metrics_uri, timeout=30)
    # Surface HTTP errors here instead of failing later on a missing JSON key.
    response.raise_for_status()
    data = response.json()
    # Fetch the number of active nodes.
    num_active_nodes = int(data['clusterMetrics']['activeNodes'])
    # One executor per active node, capped at the configured maximum.
    num_executors = min(num_active_nodes, max_num_executors)

In [4]:
# Derived from the number of cores and executors; used as the number of model
# trainers (one trainer per executor core).
num_workers = num_executors * num_cores

# str() replaces the backtick repr syntax, which only exists in Python 2 and is
# a SyntaxError in Python 3. For these integer values the output is identical.
print("Number of desired executors: " + str(num_executors))
print("Number of desired cores / executor: " + str(num_cores))
print("Total number of workers: " + str(num_workers))

Number of desired executors: 11
Number of desired cores / executor: 2
Total number of workers: 22


In [6]:
# Assemble the Spark configuration from the settings chosen above.
conf = SparkConf()
conf.set("spark.app.name", application_name)
conf.set("spark.master", master)
# Configuration values are passed as strings; str() replaces the Python 2-only
# backtick repr syntax and yields the same text for these integers.
conf.set("spark.executor.cores", str(num_cores))
conf.set("spark.executor.instances", str(num_executors))
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

# Check if the user is running Spark 2.0 +
if using_spark_2:
    # SparkSession is not imported in the import cell; import it only in this
    # branch so the notebook still runs on Spark 1.x, where it does not exist.
    from pyspark.sql import SparkSession
    # Note: in this branch `sc` is bound to a SparkSession, not a SparkContext.
    sc = SparkSession.builder.config(conf=conf) \
            .appName(application_name) \
            .getOrCreate()
else:
    # Create the Spark context.
    sc = SparkContext(conf=conf)
    # Add the missing imports
    from pyspark import SQLContext
    sqlContext = SQLContext(sc)

In [7]:
# Pick the object that exposes `.read`: the session created for Spark 2.0,
# otherwise the SQLContext.
reader = sc if using_spark_2 else sqlContext
# Load the preprocessed dataset from Parquet.
raw_dataset = reader.read.parquet("data/processed_dataset.parquet")

In [26]:
# Check the schema: print the column names and types of the loaded dataset.
raw_dataset.printSchema()

root
 |-- features_normalized: vector (nullable = true)
 |-- label_index: double (nullable = true)
 |-- label: vector (nullable = true)



250000

In [14]:
# Infer the input dimensionality from the first row's feature vector.
sample_features = raw_dataset.select("features_normalized").take(1)[0]["features_normalized"]
nb_features = len(sample_features)
# Infer the number of classes from the length of the first row's label vector.
sample_label = raw_dataset.select("label").take(1)[0]["label"]
nb_classes = len(sample_label)

print("Number of features: {0}".format(nb_features))
print("Number of classes: {0}".format(nb_classes))

Number of features: 30
Number of classes: 2


In [20]:
# Finally, we create a trainingset and a testset.
(trainingSet, testSet) = raw_dataset.randomSplit([0.7, 0.3])
trainingSet.cache()
testSet.cache()

DataFrame[features_normalized: vector, label_index: double, label: vector]

## Model construction

In [15]:
# Fully connected classifier: two 500-unit ReLU hidden layers (dropout after
# the first one) followed by a softmax output over the classes.
model_layers = [
    Dense(500, input_shape=(nb_features,)),
    Activation('relu'),
    Dropout(0.4),
    Dense(500),
    Activation('relu'),
    Dense(nb_classes),
    Activation('softmax'),
]

# Stack the layers in order.
model = Sequential()
for model_layer in model_layers:
    model.add(model_layer)

In [16]:
# Summarize the model: print each layer with its output shape and parameter count.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_1 (Dense)                  (None, 500)           15500       dense_input_1[0][0]              
____________________________________________________________________________________________________
activation_1 (Activation)        (None, 500)           0           dense_1[0][0]                    
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 500)           0           activation_1[0][0]               
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 500)           250500      dropout_1[0][0]                  
___________________________________________________________________________________________

In [17]:
# Optimizer and loss names handed to the distributed trainer below
# (as `worker_optimizer` and `loss`).
optimizer = 'adagrad'
loss = 'categorical_crossentropy'

In [18]:
# Evaluator that compares the predicted class index against the true label
# index using the metric named below.
metric_name = "f1"
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index",
    predictionCol="predicted_index",
    metricName=metric_name,
)

In [33]:
def evaluate(model):
    """Score a trained Keras model on the global test set.

    Drops any prediction columns left by a previous call, applies `model` to
    `testSet`, converts the prediction vectors to label indices with the
    global `index_transformer`, and scores the result with the global
    `evaluator`.

    Args:
        model: The trained Keras model to evaluate.

    Returns:
        The value returned by `evaluator.evaluate` for the predicted test set.

    Side effects:
        Rebinds the global `testSet` to the DataFrame that includes the
        prediction columns.
    """
    global testSet

    # Clear the prediction column from the testset.
    testSet = testSet.select("features_normalized", "label_index", "label")
    # Predict with the model that was passed in. The previous code used the
    # global `trained_model` here and silently ignored the argument.
    predictor = ModelPredictor(keras_model=model, features_col="features_normalized")
    testSet = predictor.predict(testSet)
    # Transform the prediction vector to an indexed label.
    testSet = index_transformer.transform(testSet)
    # Compute the evaluation metric on the predicted test set.
    score = evaluator.evaluate(testSet)

    return score

In [34]:
# Transformer used by evaluate() to turn a prediction vector into an indexed
# label; its output dimension is the number of classes.
index_transformer = LabelIndexTransformer(output_dim=nb_classes)

In [35]:
# Evaluation score per trainer, keyed by trainer name.
results = {}
# Training time in seconds per trainer, keyed by trainer name.
time_spent = {}

In [36]:
# Distribute the training and test set to the workers by repartitioning both
# into `num_workers` partitions.
testSet = testSet.repartition(num_workers)
trainingSet = trainingSet.repartition(num_workers)

In [37]:
# Count the rows of each split (test set first, then training set).
num_test_set = testSet.count()
num_training_set = trainingSet.count()

print("Number of testset instances: {0}".format(num_test_set))
print("Number of trainingset instances: {0}".format(num_training_set))
print("Total number of instances: {0}".format(num_test_set + num_training_set))

Number of testset instances: 75236
Number of trainingset instances: 174764
Total number of instances: 250000


In [38]:
# Train the model with the asynchronous EASGD trainer and record how long the
# training takes (wall-clock seconds).
time_start = time.time()
async_easgd_trainer = AsynchronousEASGD(keras_model=model, features_col="features_normalized",
                                        batch_size=10, num_workers=num_workers, rho=5.0, learning_rate=0.05,
                                        worker_optimizer=optimizer, loss=loss, communication_window=30)
trained_model = async_easgd_trainer.train(trainingSet)
dt = time.time() - time_start
time_spent['async_easgd_trainer'] = dt

# repr() replaces the Python 2-only backtick syntax (a SyntaxError in Python 3)
# and prints the float with the same precision as before.
print("Time spent (Asynchronous EASGD): " + repr(dt) + " seconds.")

Time spent (Asynchronous EASGD): 17.88398814201355 seconds.


In [39]:
# Evaluate the trained model on the test set and store its score.
score = evaluate(trained_model)
# NOTE(review): this key differs from the 'async_easgd_trainer' key used in
# `time_spent`; confirm which one later cells expect before unifying them.
results['easgd_trainer'] = score

# repr() replaces the Python 2-only backtick syntax (a SyntaxError in Python 3)
# and prints the float with the same precision as before.
print("F1 (EASGD): " + repr(score))

F1 (EASGD): 0.8023815453380814
