In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# ML Pipeline

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# Obtain the SparkSession used by the rest of the notebook; getOrCreate()
# hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('stats_app')
    .getOrCreate()
)

*A pipeline allows us to maintain the data flow of all the relevant transformations that are required to reach the end result.*



A machine learning project typically involves steps like data preprocessing, feature extraction, model fitting and evaluating results. We need to perform a lot of transformations on the data in sequence. As you can imagine, keeping track of them can potentially become a tedious task.

![alt text](https://learning.oreilly.com/library/view/spark-the-definitive/9781491912201/assets/spdg_2404.png)

Raw data:

Public Dataset of Accelerometer Data for Human Motion Primitives Detection ([link to dataset](https://github.com/wchill/HMP_Dataset)).

HMP is a public collection of labelled accelerometer data recordings to be used for the creation and validation of acceleration models of human motion primitives.


In [None]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [None]:
# Preview the raw rows: x, y, z readings plus the source file and class label.
df.show()

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|
| 20| 51| 35|Accelerometer-201...|Brush_teeth|
| 18| 49| 34|Accelerometer-201...|Brush_teeth|
| 19| 48| 34|Accelerometer-201...|Brush_teeth|
| 16| 53| 34|Accelerometer-201...|Brush_teeth|
| 18| 52| 35|

Convert the string `class` column to a numeric index.

In [None]:
from pyspark.ml.feature import StringIndexer

# Map each distinct 'class' string to a numeric label stored in 'class_index'.
indexer = StringIndexer(
    inputCol='class',
    outputCol='class_index',
)
indexed = (
    indexer
    .fit(df)        # learn the label -> index mapping from the data
    .transform(df)  # append the 'class_index' column
)
indexed.show()

+---+---+---+--------------------+-----------+-----------+
|  x|  y|  z|              source|      class|class_index|
+---+---+---+--------------------+-----------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|        6.0|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|        6.0|
| 21| 51| 33|Accelerometer-201...|Brush_teeth|        6.0|
| 20| 50| 34|Accelerometer-201...|Brush_teeth|        6.0|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|        6.0|
| 21| 49| 33|Accelerometer-201...|Brush_teeth|        6.

In [None]:
# Number of rows per class index.
indexed.groupBy('class_index').count().show()

+-----------+-----+
|class_index|count|
+-----------+-----+
|        8.0|25036|
|        0.0|92254|
|        7.0|25417|
|        1.0|45801|
|        4.0|40258|
|       11.0|15225|
|        3.0|41673|
|        2.0|42792|
|       10.0|15375|
|       13.0| 6683|
|        6.0|29829|
|        5.0|31236|
|        9.0|23504|
|       12.0|11446|
+-----------+-----+



In [None]:
from pyspark.ml.feature import OneHotEncoder
# One-hot encode the numeric class index into the sparse vector column
# 'categoryVec'.
encoder = OneHotEncoder(
    inputCol='class_index',
    outputCol='categoryVec',
)
encoded = encoder.transform(indexed)
encoded.show()


+---+---+---+--------------------+-----------+-----------+--------------+
|  x|  y|  z|              source|      class|class_index|   categoryVec|
+---+---+---+--------------------+-----------+-----------+--------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 50| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 22| 51| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|
| 21| 51| 33|Accelerometer-201...|Brus

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the three accelerometer axes into one vector column named 'features'.
vectorAssembler = VectorAssembler(
    inputCols=['x', 'y', 'z'],
    outputCol='features',
)
features_vectorized = vectorAssembler.transform(encoded)
features_vectorized.show()

+---+---+---+--------------------+-----------+-----------+--------------+----------------+
|  x|  y|  z|              source|      class|class_index|   categoryVec|        features|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|
| 20| 50| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[20.0,50.0,35.0]|
| 22| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,34.0]|

In [None]:
from pyspark.ml.feature import Normalizer
# p=1.0 selects the L1 norm: every 'features' vector is divided by the sum of
# the absolute values of its components, and the result goes to 'features_norm'.
normalizer = Normalizer(
    inputCol='features',
    outputCol='features_norm',
    p=1.0,
)
normalized_data = normalizer.transform(features_vectorized)
normalized_data.show()


+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|class_index|   categoryVec|        features|       features_norm|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.19626168224299...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.20560747663

In [None]:
from pyspark.ml import Pipeline
# Chain the four stages built in the previous cells so that one fit/transform
# call runs them in order on the raw DataFrame.
pipeline = Pipeline(stages=[
    indexer,
    encoder,
    vectorAssembler,
    normalizer,
])
model = pipeline.fit(df)
# Despite its name, `prediction` holds the transformed feature columns; no
# predictive model is part of this pipeline.
prediction = model.transform(df)
prediction.show()

+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|class_index|   categoryVec|        features|       features_norm|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.19626168224299...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.20560747663

In [None]:
# Keep only the encoded label vector and the normalized feature vector.
df_train = prediction.select('categoryVec', 'features_norm')

In [None]:
# Preview the two columns retained for training.
df_train.show()

+--------------+--------------------+
|   categoryVec|       features_norm|
+--------------+--------------------+
|(13,[6],[1.0])|[0.20754716981132...|
|(13,[6],[1.0])|[0.20754716981132...|
|(13,[6],[1.0])|[0.20183486238532...|
|(13,[6],[1.0])|[0.20183486238532...|
|(13,[6],[1.0])|[0.19626168224299...|
|(13,[6],[1.0])|[0.20560747663551...|
|(13,[6],[1.0])|[0.19047619047619...|
|(13,[6],[1.0])|[0.20370370370370...|
|(13,[6],[1.0])|[0.20754716981132...|
|(13,[6],[1.0])|[0.20370370370370...|
|(13,[6],[1.0])|[0.2,0.4857142857...|
|(13,[6],[1.0])|[0.19230769230769...|
|(13,[6],[1.0])|[0.20388349514563...|
|(13,[6],[1.0])|[0.20388349514563...|
|(13,[6],[1.0])|[0.18867924528301...|
|(13,[6],[1.0])|[0.17821782178217...|
|(13,[6],[1.0])|[0.18811881188118...|
|(13,[6],[1.0])|[0.15533980582524...|
|(13,[6],[1.0])|[0.17142857142857...|
|(13,[6],[1.0])|[0.17821782178217...|
+--------------+--------------------+
only showing top 20 rows



# Exercise:

Add a `MinMaxScaler` stage to the end of the pipeline.

Transform features by scaling each feature to a given range.

In [None]:
from pyspark.ml.feature import MinMaxScaler

# Exercise scaffold: the four stages below are ready; supply the fifth one.
indexer = StringIndexer(inputCol="class", outputCol="classIndex")
encoder = OneHotEncoder(inputCol="classIndex", outputCol="categoryVec")
vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)
# TODO: add your code here -- replace None with a configured MinMaxScaler.
# (None keeps the cell valid Python until the exercise is filled in.)
minmaxscalar = None
###########################
if minmaxscalar is None:
    # Skip the pipeline instead of failing so the notebook still runs end to end.
    print("Exercise not completed: assign a MinMaxScaler to `minmaxscalar` and re-run this cell.")
else:
    # The stage list uses the same variable name that is assigned above.
    pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, minmaxscalar])
    model = pipeline.fit(df)
    prediction = model.transform(df)
    prediction.show()

In [None]:
# Solution
from pyspark.ml.feature import MinMaxScaler

# Rebuild the four earlier stages (the index column is named 'classIndex' here).
indexer = StringIndexer(
    inputCol="class",
    outputCol="classIndex",
)
encoder = OneHotEncoder(
    inputCol="classIndex",
    outputCol="categoryVec",
)
vectorAssembler = VectorAssembler(
    inputCols=["x", "y", "z"],
    outputCol="features",
)
normalizer = Normalizer(
    inputCol="features",
    outputCol="features_norm",
    p=1.0,
)
# New stage: rescale each component of 'features' and write the result to
# 'features_minmax'.
minmaxscalar = MinMaxScaler(
    inputCol="features",
    outputCol="features_minmax",
)

# Run all five stages in order on the raw DataFrame.
pipeline = Pipeline(stages=[
    indexer,
    encoder,
    vectorAssembler,
    normalizer,
    minmaxscalar,
])
model = pipeline.fit(df)
prediction = model.transform(df)
prediction.show()

+---+---+---+--------------------+-----------+----------+--------------+----------------+--------------------+--------------------+
|  x|  y|  z|              source|      class|classIndex|   categoryVec|        features|       features_norm|     features_minmax|
+---+---+---+--------------------+-----------+----------+--------------+----------------+--------------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|[0.34920634920634...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|[0.34920634920634...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|       6.0|(13,[6],[1.0])|[21.