In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession \
   .builder \
   .master('local[2]') \
   .appName("ML OverView") \
   .getOrCreate()

- org.apache.spark.ml -> Dataframe
- org.apache.spark.mllib -> Spark low-level RDD APIs (maintenance mode)

                              The machine learning workflow in spark
![title](MLWorkflowInSpark.jpg "The machine learning workflow in spark")

In [3]:
from pyspark.ml.linalg import Vectors
denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)

In [4]:
denseVec.values

array([1., 2., 3.])

In [5]:
df = spark.read.json("../data/simple-ml/")

In [6]:
df.orderBy('value2').show()

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
|green| bad|    16|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green| bad|    16|14.386294994851129|
|green|good|    12|14.386294994851129|
|  red|good|    35|14.386294994851129|
|  red|good|    35|14.386294994851129|
|  red| bad|     2|14.386294994851129|
|  red| bad|    16|14.386294994851129|
|  red| bad|    16|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|green|good|     1|14.386294994851129|
|green|good|    12|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|  red|good|    35|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|  red| bad|    16|14.386294994851129|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 20 rows



In [7]:
df1 = spark.read.format("libsvm").load(
   "../data/sample_libsvm_data.txt")

In [8]:
df1.show(1)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
+-----+--------------------+
only showing top 1 row



In [9]:
df1.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
df.printSchema()

root
 |-- color: string (nullable = true)
 |-- lab: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: double (nullable = true)



# The basics of RFormula  operator are
- '~' Separate target and terms
- '+' Concat terms; "+ 0" means removing the intercept ( this means that the y-intercept of the line that we will fit will be 0)
- '-' Remove a term; "- 0" means removing the intercept (this means that the y-intercept of the line that we will fit will be 0 -- yes)
- ':' Interaction (multiplication for numeric values, or binarized categorical values)
- '.' All columns except the target/dependent variable.

In [11]:
from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~. + color:value1 + color:value2")

In [12]:
fittedRF = supervised.fit(df)
preparedRF = fittedRF.transform(df)
preparedRF.show()

+-----+----+------+------------------+--------------------+-----+
|color| lab|value1|            value2|            features|label|
+-----+----+------+------------------+--------------------+-----+
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue| bad|    12|14.386294994851129|(10,[2,3,6,9],[12...|  0.0|
|green|good|    15| 38.97187133755819|(10,[1,2,3,5,8],[...|  1.0|
|green|good|    12|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
|green| bad|    16|14.386294994851129|(10,[1,2,3,5,8],[...|  0.0|
|  red|good|    35|14.386294994851129|(10,[0,2,3,4,7],[...|  1.0|
|  red| bad|     1| 38.97187133755819|(10,[0,2,3,4,7],[...|  0.0|
|  red| bad|     2|14.386294994851129|(10,[0,2,3,4,7],[...|  0.0|
|  red| bad|    16|14.386294994851129|(10,[0,2,3,4,7],[...|  0.0|
|  red|good|    45| 38.97187133755819|(10,[0,2,3,4,7],[...|  1.0|
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| ba

In [13]:
fittedRF.explainParams()

"featuresCol: features column name (default: features)\nforceIndexLabel: Force to index label whether it is numeric or string (default: False)\nformula: R model formula (current: lab ~. + color:value1 + color:value2)\nhandleInvalid: How to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels). (default: error)\nlabelCol: label column name (default: label)\nstringIndexerOrderType: How to order categories of a string FEATURE column used by StringIndexer. The last category after ordering is dropped when encoding strings. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc. The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', RFormula drops the same category as R when encoding strings. (default: frequencyDesc)"

In [14]:
train , test = preparedRF.randomSplit([0.7, 0.3])

In [15]:
from pyspark.ml.classification import LogisticRegression, LogisticRegressionSummary
lr = LogisticRegression(labelCol="label", featuresCol="features")

In [16]:
lr.explainParams()

"aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\nfamily: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)\nfeaturesCol: features column name. (default: features, current: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label, current: label)\nlowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)\nlowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimizat

In [17]:
fittedLR = lr.fit(train)

In [18]:
fittedLR.transform(train).select("label", "prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [19]:
# Creating logical pipeline
train, test = df.randomSplit([0.7, 0.3])

In [20]:
rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

In [21]:
from  pyspark.ml import Pipeline
stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

In [22]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder() \
   .addGrid(rForm.formula, [
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"]) \
   .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
   .addGrid(lr.regParam, [0.1, 2.0]) \
   .build()

# in the above parameter grid, there are three hyperparameters that will diverge from the defaults:
- Two different versions of the RFormula.
- Three different optionts for the ElasticNet Parameter.
- Two different options for the regularization parameter.

This gives a total of 12 different combinations of these parameters

In [23]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator() \
   .setMetricName("areaUnderROC") \
   .setRawPredictionCol("prediction") \
   .setLabelCol("label")

In [24]:
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit() \
   .setTrainRatio(0.75) \
   .setEstimatorParamMaps(params) \
   .setEstimator(pipeline) \
   .setEvaluator(evaluator)
tvsFitted = tvs.fit(train)

In [25]:
evaluator.evaluate(tvsFitted.transform(test))

0.875

In [27]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel
trainedPipeline = tvsFitted.bestModel

In [29]:
tvsFitted.transform(test).show()

+-----+----+------+------------------+--------------------+-----+--------------------+--------------------+----------+
|color| lab|value1|            value2|            features|label|       rawPrediction|         probability|prediction|
+-----+----+------+------------------+--------------------+-----+--------------------+--------------------+----------+
| blue| bad|     8|14.386294994851129|(7,[2,3,6],[8.0,1...|  0.0|[2.05856932810732...|[0.88681064207379...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.38802048540920...|[0.91590923232524...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.38802048540920...|[0.91590923232524...|       0.0|
| blue| bad|    12|14.386294994851129|(7,[2,3,6],[12.0,...|  0.0|[2.38802048540920...|[0.91590923232524...|       0.0|
|green| bad|    16|14.386294994851129|[0.0,1.0,16.0,14....|  0.0|[-0.6731292249650...|[0.33779650948150...|       1.0|
|green| bad|    16|14.386294994851129|[0.0,1.0,1