In [20]:
## 1) Reading the data
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, StringType
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import OneHotEncoder, VectorAssembler,StringIndexer
#Building session now
spark = SparkSession.builder.appName('deep_learning_with_spark').getOrCreate()
#Finding out the directory to read the file
#pwd
#Reading the file now
data = spark.read.csv('/home/jovyan/dl_data.csv', header=True,
inferSchema=True)

In [21]:
## 2) Exploring schema
data.dtypes
data.printSchema()


root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- Orders_Normalized: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



In [22]:
## 3) Applying MPC
train, validation, test = data.randomSplit([0.7, 0.2, 0.1], 1234)


In [23]:
## 4) Building the pipeline
categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]
numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]
indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]

#string indexer
featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol="features")

#Configure the classifier
layers = [len(featuresCreator.getInputCols()), 4, 2, 2]

classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100,
layers=layers, blockSize=128, seed=1234)


In [24]:
## 5) Fit and get output from pipeline

pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])
model = pipeline.fit(train)
# let's checkout the results
train_output_df = model.transform(train)
validation_output_df = model.transform(validation)
test_output_df = model.transform(test)

IllegalArgumentException: label does not exist. Available: Visit_Number_Bucket, Page_Views_Normalized, Orders_Normalized, Internal_Search_Successful_Normalized, Internal_Search_Null_Normalized, Email_Signup_Normalized, Total_Seconds_Spent_Normalized, Store_Locator_Search_Normalized, Mapped_Last_Touch_Channel, Mapped_Mobile_Device_Type, Mapped_Browser_Type, Mapped_Entry_Pages, Mapped_Site_Section, Mapped_Promo_Code, Maped_Product_Name, Mapped_Search_Term, Mapped_Product_Collection, Visit_Number_Bucket_index, Mapped_Last_Touch_Channel_index, Mapped_Mobile_Device_Type_index, Mapped_Browser_Type_index, Mapped_Entry_Pages_index, Mapped_Site_Section_index, Mapped_Promo_Code_index, Maped_Product_Name_index, Mapped_Search_Term_index, Mapped_Product_Collection_index, features

In [None]:
## 6) Evaluate using different metrics

train_predictionAndLabels = train_output_df.select("prediction", "label")
validation_predictionAndLabels = validation_output_df.select("prediction","label")
test_predictionAndLabels = test_output_df.select("prediction", "label")
metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']

for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    print('Train ' + metric + ' = ' +
    str(evaluator.evaluate(train_predictionAndLabels)))
    print('Validation ' + metric + ' = ' +
    str(evaluator.evaluate(validation_predictionAndLabels)))
    print('Test ' + metric + ' = ' +
    str(evaluator.evaluate(test_predictionAndLabels)))


In [None]:
## 7) Plots and visualizations

import matplotlib.pyplot as plt
import numpy as np
import itertools
def plot_confusion_matrix(cm, classes,normalize=False,
        title='Confusion matrix',cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
#Get Class labels
class_temp = test_predictionAndLabels.select("label").groupBy("label").count().sort('count',
ascending=False).toPandas()["label"].tolist()
#Calculate confusion matrix
from sklearn.metrics import confusion_matrix
y_true = test_predictionAndLabels.select("label")
y_true = y_true.toPandas()
y_pred = test_predictionAndLabels.select("prediction")
y_pred = y_pred.toPandas()
cnf_matrix = confusion_matrix(y_true, y_pred,labels=class_temp)
cnf_matrix
#Plotting Results
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_temp,
title='Confusion matrix, without normalization')
plt.show()
