In [41]:
import pyspark
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises a ValueError.
# On a fresh kernel this still starts a local context using all cores.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [42]:
from pyspark.sql import SQLContext
# SQL entry point used by the cells below (createDataFrame / read),
# bound to the SparkContext `sc`.
sqlc = SQLContext(sparkContext=sc)

## Spark ML Tuning

### Elements:
- Model (Estimator or Pipeline)
- Set of ParamMaps (to perform the grid search) - you should use the ***ParamGridBuilder*** utility
- Evaluator (to assess the fitness of the model)

## Cross-Validation

- Splits the dataset into K folds
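- Each fold in turn is held out as the test set (1/K of the data) while the remaining K-1 folds form the training set - with K = 3 that is 2/3 for training and 1/3 for testing
- For each ParamMap it will fit K models and compute the average of the K evaluation metrics (according to the Evaluator)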
- Based on the metrics, it will determine the best set of parameters
- Then it will fit the model one final time, using this set of parameters and the whole dataset
- This is VERY computationally expensive

In [43]:
!rm -rf metastore_db/*.lck

training = sqlc.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0)
], ["id", "text", "label"])

In [44]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer

# Stage 1: turn the "text" column into a "words" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: hash the words into a "features" vector. The input column is read
# from the tokenizer so the two stages stay wired together.
hashingTF = HashingTF(outputCol="features", inputCol=tokenizer.getOutputCol())

# Stage 3: the classifier. regParam is not set here; the parameter grid
# built in a later cell supplies it.
lr = LogisticRegression(maxIter=10)

# Chain the three stages into a single estimator that can be tuned as a unit.
pipeline = Pipeline().setStages([tokenizer, hashingTF, lr])

In [45]:
from pyspark.ml.tuning import ParamGridBuilder

# Grid to search: 3 values of numFeatures x 2 values of regParam = 6 ParamMaps.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

In [46]:
# Inspect the generated grid: one {Param: value} mapping per parameter combination.
paramGrid

[{Param(parent='LogisticRegression_414ba203ce9e4989fac8', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='HashingTF_4aaaa107b6b05116b537', name='numFeatures', doc='number of features.'): 10},
 {Param(parent='LogisticRegression_414ba203ce9e4989fac8', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='HashingTF_4aaaa107b6b05116b537', name='numFeatures', doc='number of features.'): 100},
 {Param(parent='LogisticRegression_414ba203ce9e4989fac8', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='HashingTF_4aaaa107b6b05116b537', name='numFeatures', doc='number of features.'): 1000},
 {Param(parent='LogisticRegression_414ba203ce9e4989fac8', name='regParam', doc='regularization parameter (>= 0).'): 0.01,
  Param(parent='HashingTF_4aaaa107b6b05116b537', name='numFeatures', doc='number of features.'): 10},
 {Param(parent='LogisticRegression_414ba203ce9e4989fac8', name='regParam', doc='regularization parameter

In [47]:
from pyspark.ml.tuning import CrossValidator

# 3-fold cross-validation of the whole pipeline over every ParamMap in the grid.
# The assignment of rows to folds is random, so the seed is fixed to make
# cvModel.avgMetrics (and the selected model) reproducible across runs.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=42)

# Runs the full search and returns a model refit with the best parameters.
cvModel = crossval.fit(training)

In [48]:
# Evaluator metric averaged over the folds, one entry per ParamMap in paramGrid.
cvModel.avgMetrics

[0.962962962962963,
 0.962962962962963,
 0.9259259259259258,
 0.962962962962963,
 0.962962962962963,
 0.9259259259259258]

In [69]:
# Persist the fitted logistic-regression stage (index 2, the last pipeline stage)
# under the path whose metadata is inspected in the next cell.
# write().overwrite() lets this cell be re-run; a plain save() fails when the
# target path already exists.
cvModel.bestModel.stages[2].write().overwrite().save('awesome2.parquet')

In [71]:
!cat awesome2.parquet/metadata/part-00000

{"class":"org.apache.spark.ml.classification.LogisticRegressionModel","timestamp":1518882244188,"sparkVersion":"2.2.0","uid":"LogisticRegression_414ba203ce9e4989fac8","paramMap":{"threshold":0.5,"regParam":0.1,"aggregationDepth":2,"rawPredictionCol":"rawPrediction","standardization":true,"tol":1.0E-6,"fitIntercept":true,"predictionCol":"prediction","family":"auto","featuresCol":"features","elasticNetParam":0.0,"maxIter":10,"probabilityCol":"probability","labelCol":"label"}}


In [10]:
# Fitted stages of the selected pipeline, in pipeline order.
cvModel.bestModel.stages

[Tokenizer_4f44b9f0f030d0197044,
 HashingTF_44f4ba8750770b40f4ea,
 LogisticRegression_4fe98415bc06b1f16ab7]

In [11]:
# The classifier is the final stage of the three-stage pipeline.
lr_best = cvModel.bestModel.stages[-1]

In [12]:
# Learned weights of the selected logistic regression, one per feature dimension.
lr_best.coefficients

DenseVector([-1.0059, 0.0124, 0.5976, -0.2633, -0.6609, 1.787, -1.4679, 0.0121, 1.1602, 0.0])

In [13]:
# Summary object of the selected logistic regression.
# NOTE(review): per the PySpark docs this summary is computed on the data the
# model was fit on, not on held-out data - confirm for the Spark version in use.
lr_summary = lr_best.summary

In [14]:
# Area under the ROC curve taken from the model summary.
# NOTE(review): presumably a training-set figure, so it is not an estimate of
# generalization performance - confirm.
lr_summary.areaUnderROC

1.0

In [15]:
# Unlabelled sentences to score with the tuned pipeline (no "label" column).
test = sqlc.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "mapreduce spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [16]:
# Score the unlabelled sentences with the model selected by cross-validation.
prediction = cvModel.transform(test)

# Keep only the columns needed to read the result.
selected = prediction.select(["id", "text", "probability", "prediction"])

# The frame is tiny, so pulling every row to the driver is fine here.
for row in selected.collect():
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.627, 0.373]), prediction=0.0)
Row(id=5, text='l m n', probability=DenseVector([0.3451, 0.6549]), prediction=1.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.3351, 0.6649]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.2767, 0.7233]), prediction=1.0)


## Train-Validation Split

- It uses the entire dataset
- The dataset is split once into a training set and a validation set, according to the ***trainRatio*** parameter
- It will fit a model for each set of parameters and evaluate its metrics (according to the Evaluator)
- Based on the metrics, it will determine the best set of parameters
- Then it will fit the model one final time, using this set of parameters and the whole dataset
- This is much less expensive, but it may not yield good results if the dataset is not large enough

In [22]:
from pyspark.ml.regression import LinearRegression

import os

# Resolve the sample data relative to the Spark installation rather than one
# machine's absolute path; the original location remains the fallback.
SPARK_HOME = os.environ.get("SPARK_HOME", "/home/ubuntu/spark")
data = sqlc.read.format("libsvm").load(
    os.path.join(SPARK_HOME, "data/mllib/sample_linear_regression_data.txt"))

# 70/30 hold-out split; the seed makes the split (and every number derived
# from it below) reproducible across runs.
# NOTE: `test` and `lr` rebind names used by the cross-validation cells above,
# so re-running those cells after this one will pick up these objects instead.
train, test = data.randomSplit([0.7, 0.3], seed=42)

# Base estimator; regParam here is only a starting value - the parameter grid
# built in the next cell overrides it during tuning.
lr = LinearRegression(maxIter=20, regParam=0.1)

In [23]:
from pyspark.ml.tuning import ParamGridBuilder

# Grid to search: 2 values of regParam x 3 values of elasticNetParam = 6 ParamMaps.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [24]:
# Inspect the generated grid: one {Param: value} mapping per parameter combination.
paramGrid

[{Param(parent='LinearRegression_4f8e9390c770160f7729', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='LinearRegression_4f8e9390c770160f7729', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0},
 {Param(parent='LinearRegression_4f8e9390c770160f7729', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='LinearRegression_4f8e9390c770160f7729', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5},
 {Param(parent='LinearRegression_4f8e9390c770160f7729', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='LinearRegression_4f8e9390c770160f7729', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit

# Single train/validation split: 80% of the rows fit each candidate, the other
# 20% score it. The split is random, so the seed is fixed to make the selected
# parameters reproducible across runs.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8,
                           seed=42)

# Tune on `train` only; `test` stays untouched for the prediction cell below.
model = tvs.fit(train)

In [26]:
# The model selected by the train-validation split.
model.bestModel

LinearRegression_4f8e9390c770160f7729

In [27]:
# Learned weights of the selected linear regression, one per feature dimension.
model.bestModel.coefficients

DenseVector([-0.7786, 0.4071, -0.9313, 3.2022, 0.7962, 1.0608, 0.1326, -0.9576, -0.7159, 0.5896])

In [28]:
# R^2 taken from the model summary.
# NOTE(review): presumably computed on the data the model was fit on, not on
# `test` - confirm before quoting it as a quality estimate.
model.bestModel.summary.r2

0.04953668424664892

In [29]:
# Score the held-out 30% split with the selected model.
# Note: this rebinds `prediction`, which earlier held the classification output.
prediction = model.transform(test)

In [30]:
# Preview five rows. Limit inside Spark first so only those rows are sent to
# the driver, instead of converting the whole DataFrame to pandas and slicing.
prediction.limit(5).toPandas()

Unnamed: 0,label,features,prediction
0,-23.48744,"(-0.519535443126, 0.808035794841, 0.8498613208...",-0.886864
1,-18.275214,"(-0.489685764918, 0.683231434274, 0.9115808714...",0.865535
2,-17.0654,"(-0.0177244659457, 0.563282914714, 0.142324203...",0.981015
3,-15.862009,"(-0.634145383179, -0.925918063973, 0.302702923...",1.795393
4,-15.780685,"(-0.256849206372, 0.774097619743, -0.782915810...",3.708987


In [31]:
# Shut down the SparkContext now that the notebook is finished.
sc.stop()