This notebook follows along with the SageMaker Spark tutorial found [here](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/sagemaker-spark/pyspark_mnist/pyspark_mnist_pca_kmeans.ipynb).

# Import

## Packages / Libraries

In [1]:
import os
import boto3
import numpy as np
import string

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark

from pyspark import SparkContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

from sagemaker_pyspark.algorithms import PCASageMakerEstimator, KMeansSageMakerEstimator
from sagemaker_pyspark import RandomNamePolicyFactory, IAMRole, EndpointCreationPolicy
from sagemaker_pyspark.transformation.serializers import ProtobufRequestRowSerializer

from pyspark.sql.types import DoubleType
import matplotlib.pyplot as plt

## Set Up AWS and Spark

In [3]:
# Look up the SageMaker execution role and the region of the default boto3 session.
role = get_execution_role()
region = boto3.Session().region_name
if region is None:
    # `region` is interpolated into the S3 endpoint and the sample-data bucket name
    # when the data is loaded; a missing region would silently produce "...None..."
    # URLs there, so fail fast with an actionable message instead.
    raise RuntimeError(
        "No AWS region is configured for the default boto3 session; "
        "set AWS_DEFAULT_REGION or configure a default region."
    )

# Configure Spark to use the SageMaker Spark dependency jars.
# The jar list is fetched once and reused to build the driver classpath.
jars = sagemaker_pyspark.classpath_jars()
classpath = ":".join(jars)

# Start (or reuse) a local Spark session with the SageMaker jars on the driver classpath.
# See the SageMaker Spark GitHub repository to learn how to connect to EMR from a notebook instance.
spark = (
    SparkSession.builder
    .config("spark.driver.extraClassPath", classpath)
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

## Load Data

In [6]:
# Point the s3a filesystem at the regional S3 endpoint.
s3_endpoint = 's3.{}.amazonaws.com'.format(region)
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set('fs.s3a.endpoint', s3_endpoint)


def _read_mnist_libsvm(s3_uri):
    """Load the LibSVM-formatted data at ``s3_uri`` into a Spark DataFrame.

    A fresh reader is created on every call and configured with
    ``numFeatures`` set to 784.
    """
    reader = spark.read.format('libsvm')
    reader = reader.option('numFeatures', '784')
    return reader.load(s3_uri)


train_uri = 's3a://sagemaker-sample-data-{}/spark/mnist/train/'.format(region)
test_uri = 's3a://sagemaker-sample-data-{}/spark/mnist/test/'.format(region)

trainingData = _read_mnist_libsvm(train_uri)
testData = _read_mnist_libsvm(test_uri)

# Preview the first five training rows, then print the column schema.
trainingData.show(5)
trainingData.printSchema()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  5.0|(784,[152,153,154...|
|  0.0|(784,[127,128,129...|
|  4.0|(784,[160,161,162...|
|  1.0|(784,[158,159,160...|
|  9.0|(784,[208,209,210...|
+-----+--------------------+
only showing top 5 rows

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)

