In [1]:
%%sh

# Install the MLeap Python bindings into the notebook's conda environment.
# --yes keeps the cell non-interactive (a notebook cell has no terminal to
# answer conda's confirmation prompt), and the version/channel are pinned so
# that a fresh "Restart & Run All" resolves the same package every time.
conda install --yes --channel conda-forge mleap=0.15.0

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - mleap


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    certifi-2019.11.28         |           py37_0         148 KB  conda-forge
    mleap-0.15.0               |             py_0          31 KB  conda-forge
    nose-1.3.7                 |        py37_1003         212 KB  conda-forge
    nose-exclude-0.5.0         |             py_0          17 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.6 MB

The following NEW packages will be INSTALLED:

  



  current version: 4.7.10
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base conda




In [2]:
import pyspark
from pyspark.sql import SparkSession, Window, DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Maven coordinates of the MLeap Spark integration jars, comma-separated as
# expected by the `spark.jars.packages` setting.
# NOTE(review): the conda cell's output shows the mleap Python package at
# 0.15.0 while these jars are 0.14.0 - confirm the mismatch is intended.
_MLEAP_SPARK_PACKAGES = (
    'ml.combust.mleap:mleap-spark-base_2.11:0.14.0,'
    'ml.combust.mleap:mleap-spark_2.11:0.14.0'
)

# Configure the builder first, then create (or reuse) the session.
_session_builder = SparkSession.builder.config('spark.jars.packages', _MLEAP_SPARK_PACKAGES)
spark = _session_builder.getOrCreate()

In [3]:
import sys
# Make modules that live under the Jupyter home directory importable.
# Guarded so that re-running this cell does not append duplicate entries.
if '/home/jovyan' not in sys.path:
    sys.path.append('/home/jovyan')

In [4]:
import json

def load_iris(cwd: str = "") -> DataFrame:
    """Load the Iris data set as a Spark SQL DataFrame.

    The data is read in CSV format from ``data/iris`` underneath ``cwd``,
    using an explicit schema of four double-typed measurement columns and
    one string ``class`` column (all declared non-nullable).

    :param cwd: Optional path to the project root. It is joined to
        ``data/iris`` with a ``/`` separator, so both ``"../"`` and ``".."``
        resolve to ``../data/iris``; the default ``""`` yields the relative
        path ``data/iris``.
    :type cwd: str

    :returns: Spark SQL DataFrame
    :rtype: DataFrame

    :Example:

    +---------------+--------------+---------------+--------------+-----------+
    |sepal_length_cm|sepal_width_cm|petal_length_cm|petal_width_cm|      class|
    +---------------+--------------+---------------+--------------+-----------+
    |            5.1|           3.5|            1.4|           0.2|Iris-setosa|
    +---------------+--------------+---------------+--------------+-----------+
    """
    # Local import keeps this cell self-contained. posixpath (rather than
    # os.path) always joins with "/", which is what Spark paths and URIs use.
    import posixpath

    measurement_columns = (
        "sepal_length_cm",
        "sepal_width_cm",
        "petal_length_cm",
        "petal_width_cm",
    )
    fields = [
        T.StructField(column, T.DoubleType(), nullable=False)
        for column in measurement_columns
    ]
    fields.append(T.StructField("class", T.StringType(), nullable=False))
    schema = T.StructType(fields)

    # Reuse the already-running session if there is one.
    spark = SparkSession.builder.getOrCreate()
    return (
        spark.read.format("csv")
        .schema(schema)
        .load(posixpath.join(cwd, "data/iris"))
    )


In [5]:
# Load the data set from the parent directory of the notebook, then print a
# single row whose class is something other than Iris-setosa.
iris = load_iris(cwd='../')
iris.where("class != 'Iris-setosa'").show(n=1)

+---------------+--------------+---------------+--------------+---------------+
|sepal_length_cm|sepal_width_cm|petal_length_cm|petal_width_cm|          class|
+---------------+--------------+---------------+--------------+---------------+
|            7.0|           3.2|            4.7|           1.4|Iris-versicolor|
+---------------+--------------+---------------+--------------+---------------+
only showing top 1 row



In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler

In [7]:
# Stage 1: map the string `class` column to a numeric `indexedLabel` column.
# The indexer is fitted up front so its label list can be handed to the
# IndexToString stage below.
labelIndexer = StringIndexer(inputCol="class", outputCol="indexedLabel").fit(iris)

# Stage 2: pack the four measurement columns into a single `features` vector.
assembler = VectorAssembler(
    inputCols=["sepal_length_cm", "sepal_width_cm", "petal_length_cm", "petal_width_cm"],
    outputCol="features"
)

# Stage 4: translate the numeric `prediction` back into the original class
# string, using the same label ordering that the indexer learned.
labelConverter = IndexToString(
    inputCol="prediction", outputCol="predictedLabel",
    labels=labelIndexer.labels
)

# Stage 3: the classifier. The seed is set explicitly so that repeated runs
# of the notebook train with the same random seed instead of an
# implementation-chosen default.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    numTrees=10,
    seed=42,
)

pipeline = Pipeline(stages=[labelIndexer, assembler, rf, labelConverter])

In [8]:
# Fit the pipeline on the whole `iris` DataFrame; the visible cells make no
# train/test split before this point.
model = pipeline.fit(iris)

In [9]:
# Score a frame that carries only the four measurement columns (no `class`
# column) and print the resulting rows.
iris_measurements = iris.select(
    'sepal_length_cm',
    'sepal_width_cm',
    'petal_length_cm',
    'petal_width_cm',
)
model.transform(iris_measurements).show()

+---------------+--------------+---------------+--------------+-----------------+--------------+-------------+----------+--------------+
|sepal_length_cm|sepal_width_cm|petal_length_cm|petal_width_cm|         features| rawPrediction|  probability|prediction|predictedLabel|
+---------------+--------------+---------------+--------------+-----------------+--------------+-------------+----------+--------------+
|            5.1|           3.5|            1.4|           0.2|[5.1,3.5,1.4,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            4.9|           3.0|            1.4|           0.2|[4.9,3.0,1.4,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            4.7|           3.2|            1.3|           0.2|[4.7,3.2,1.3,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            4.6|           3.1|            1.5|           0.2|[4.6,3.1,1.5,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            5.0|           3.6|         

In [10]:
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer

In [11]:
# Write the fitted pipeline to an MLeap bundle at a local `file:` URI. The
# second argument is the pipeline's own output on `iris`.
# NOTE(review): serializeToBundle is not defined in this notebook; presumably
# it is attached to the model by the `import mleap.pyspark` cell - confirm.
model.serializeToBundle("file:/home/jovyan/notebooks/model", model.transform(iris))

In [12]:
from pyspark.ml import PipelineModel
# Read the bundle back into a second pipeline model and run it on the same
# four measurement columns, so its output can be compared with the output of
# the original model.
bundle_uri = 'file:/home/jovyan/notebooks/model'
model2 = PipelineModel.deserializeFromBundle(bundle_uri)

roundtrip_input = iris.select(
    'sepal_length_cm',
    'sepal_width_cm',
    'petal_length_cm',
    'petal_width_cm',
)
model2.transform(roundtrip_input).show()

+---------------+--------------+---------------+--------------+-----------------+--------------+-------------+----------+--------------+
|sepal_length_cm|sepal_width_cm|petal_length_cm|petal_width_cm|         features| rawPrediction|  probability|prediction|predictedLabel|
+---------------+--------------+---------------+--------------+-----------------+--------------+-------------+----------+--------------+
|            5.1|           3.5|            1.4|           0.2|[5.1,3.5,1.4,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            4.9|           3.0|            1.4|           0.2|[4.9,3.0,1.4,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            4.7|           3.2|            1.3|           0.2|[4.7,3.2,1.3,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            4.6|           3.1|            1.5|           0.2|[4.6,3.1,1.5,0.2]|[10.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|   Iris-setosa|
|            5.0|           3.6|         