# Demo code from Mastering Machine Learning with Spark 2.x
### Chapter02
### https://github.com/PacktPublishing/Mastering-Machine-Learning-with-Spark-2.x


In [1]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.tree._


In [2]:
import org.apache.spark.mllib.linalg.Vector
/**
  * A simple console based table.
  */
object Tabulizer {
  def table(cells: Seq[Product]): String =
    table(cells.map(p => p.productIterator.toList), false)

  def table(header: Seq[String], cells: Seq[Product], format: Map[Int, String]): String =
    table(Seq(header) ++ cells.map(p => {
      p.productIterator.toList.zipWithIndex.map { case (v, i) =>
          format.get(i).map(f => String.format(f, v.asInstanceOf[Object])).getOrElse(v)
      }
    }), true)

  def table[A, B](cells: scala.collection.Map[A, B]): String = {
    val header = cells.keys.toSeq
    val values = header.map(cells(_))
    table(Seq(header) ++ Seq(values), true)
  }

  /*def table(header: Seq[String], cells: Seq[Seq[Any]]): String =
    table(Seq(header) ++ cells, header = true)*/

  def table(vector: Vector, cols: Int, format: String = "%.3f"): String =
    table(vector.toArray.map(format.format(_)), cols, None)

  def table(list: Seq[Any], cols: Int, header: Option[Seq[String]]): String =
    table(tblize(header.map(_ ++ list).getOrElse(list), cols), header.isDefined)

  def table(cells: Seq[Seq[Any]], header: Boolean): String = {
    val colSizes = cells
      .map(_.map(v => if (v != null) v.toString.length else 1))
      .reduce((v1, v2) => v1.zip(v2).map { case (v1, v2) => if (v1 > v2) v1 else v2 })
    val rowSeparator = colSizes.map("-" * _).mkString("+", "+", "+")
    def valueFormatter(v: Any, size: Int): String =
      ("%" + size + "s").format(if (v != null) v else "-")
    val rows = cells
      .map(row => row.zip(colSizes)
        .map { case (v, size) => valueFormatter(v, size) }.mkString("|", "|", "|"))
    if (header)
      s"""
         #$rowSeparator
         #${rows.head}
         #$rowSeparator
         #${rows.tail.mkString("\n")}
         #$rowSeparator
      """.stripMargin('#')
    else
      s"""
         #$rowSeparator
         #${rows.mkString("\n")}
         #$rowSeparator
      """.stripMargin('#')
  }

  def tblize(list: Seq[Product], horizontal: Boolean, cols: Int): Seq[Seq[Any]] = {
    val arity = list.head.productArity
    tblize(list.flatMap(_.productIterator.toList), cols = arity * cols)
  }

  def tblize(list: Seq[Any], cols: Int = 4): Seq[Seq[Any]] = {
    val nrow = list.length / cols + (if (list.length % cols == 0) 0 else 1)
    list.sliding(cols, cols)
      .map(s => if (s.length == cols || s.length == list.length) s else s.padTo(cols, null))
      .foldLeft(Seq[Seq[Any]]()) { case (a, s) => a ++ Seq(s) }
  }
}


defined object Tabulizer


In [4]:
val rawData = sc.textFile(s"${sys.env.get("DATADIR").getOrElse("data")}/higgs100k.csv")
println(s"Number of rows: ${rawData.count}")


Number of rows: 100000                                                          


rawData = data/higgs100k.csv MapPartitionsRDD[3] at textFile at <console>:40


lastException: Throwable = null


data/higgs100k.csv MapPartitionsRDD[3] at textFile at <console>:40

In [5]:
println("Rows")
println(rawData.take(2).mkString("\n"))


Rows
1.000000000000000000e+00,8.692932128906250000e-01,-6.350818276405334473e-01,2.256902605295181274e-01,3.274700641632080078e-01,-6.899932026863098145e-01,7.542022466659545898e-01,-2.485731393098831177e-01,-1.092063903808593750e+00,0.000000000000000000e+00,1.374992132186889648e+00,-6.536741852760314941e-01,9.303491115570068359e-01,1.107436060905456543e+00,1.138904333114624023e+00,-1.578198313713073730e+00,-1.046985387802124023e+00,0.000000000000000000e+00,6.579295396804809570e-01,-1.045456994324922562e-02,-4.576716944575309753e-02,3.101961374282836914e+00,1.353760004043579102e+00,9.795631170272827148e-01,9.780761599540710449e-01,9.200048446655273438e-01,7.216574549674987793e-01,9.887509346008300781e-01,8.766783475875854492e-01
1.000000000000000000e+00,9.075421094894409180e-01,3.291472792625427246e-01,3.594118654727935791e-01,1.497969865798950195e+00,-3.130095303058624268e-01,1.095530629158020020e+00,-5.575249195098876953e-01,-1.588229775428771973e+00,2.173076152801513672e+00,8.125811

In [6]:
val data = rawData.map(line => line.split(',').map(_.toDouble))


data = MapPartitionsRDD[4] at map at <console>:42


MapPartitionsRDD[4] at map at <console>:42

In [7]:
data.take(2)


[[1.0, 0.869293212890625, -0.6350818276405334, 0.22569026052951813, 0.327470064163208, -0.6899932026863098, 0.7542022466659546, -0.24857313930988312, -1.0920639038085938, 0.0, 1.3749921321868896, -0.6536741852760315, 0.9303491115570068, 1.1074360609054565, 1.138904333114624, -1.5781983137130737, -1.046985387802124, 0.0, 0.657929539680481, -0.010454569943249226, -0.0457671694457531, 3.101961374282837, 1.353760004043579, 0.9795631170272827, 0.978076159954071, 0.9200048446655273, 0.7216574549674988, 0.9887509346008301, 0.8766783475875854], [1.0, 0.9075421094894409, 0.3291472792625427, 0.3594118654727936, 1.4979698657989502, -0.3130095303058624, 1.09553062915802, -0.5575249195098877, -1.588229775428772, 2.1730761528015137, 0.8125811815261841, -0.2136419266462326, 1.2710145711898804, 2.214872121810913, 0.4999939501285553, -1.2614318132400513, 0.7321561574935913, 0.0, 0.39870089292526245, -1.138930082321167, -8.191101951524615E-4, 0.0, 0.3022198975086212, 0.8330481648445129, 0.98569965362548

In [8]:
// Split the rows into input(=features) - output(=response) pairs
val response: RDD[Int] = data.map(row => row(0).toInt)
val features: RDD[Vector] = data.map(line => Vectors.dense(line.slice(1, line.size)))


response = MapPartitionsRDD[5] at map at <console>:45
features = MapPartitionsRDD[6] at map at <console>:46


MapPartitionsRDD[6] at map at <console>:46

In [9]:
response.take(2)

[1, 1]

In [10]:
features.take(2)

[[0.869293212890625,-0.6350818276405334,0.22569026052951813,0.327470064163208,-0.6899932026863098,0.7542022466659546,-0.24857313930988312,-1.0920639038085938,0.0,1.3749921321868896,-0.6536741852760315,0.9303491115570068,1.1074360609054565,1.138904333114624,-1.5781983137130737,-1.046985387802124,0.0,0.657929539680481,-0.010454569943249226,-0.0457671694457531,3.101961374282837,1.353760004043579,0.9795631170272827,0.978076159954071,0.9200048446655273,0.7216574549674988,0.9887509346008301,0.8766783475875854], [0.9075421094894409,0.3291472792625427,0.3594118654727936,1.4979698657989502,-0.3130095303058624,1.09553062915802,-0.5575249195098877,-1.588229775428772,2.1730761528015137,0.8125811815261841,-0.2136419266462326,1.2710145711898804,2.214872121810913,0.4999939501285553,-1.2614318132400513,0.7321561574935913,0.0,0.39870089292526245,-1.138930082321167,-8.191101951524615E-4,0.0,0.3022198975086212,0.8330481648445129,0.9856996536254883,0.9780983924865723,0.7797321677207947,0.9923557639122009,

In [11]:
val featuresMatrix = new RowMatrix(features)
val featuresSummary = featuresMatrix.computeColumnSummaryStatistics()




featuresMatrix = org.apache.spark.mllib.linalg.distributed.RowMatrix@18bc80ce
featuresSummary = org.apache.spark.mllib.stat.MultivariateOnlineSummarizer@7bf2d16e


org.apache.spark.mllib.stat.MultivariateOnlineSummarizer@7bf2d16e

In [12]:
//Print mean of columns
println(s"Higgs Features Mean Values = ${Tabulizer.table(featuresSummary.mean, 8)}")


Higgs Features Mean Values = 
+-----+------+------+------+------+-----+------+-----+
|0.990|-0.004|-0.002| 0.995|-0.008|0.987|-0.003|0.000|
|0.998| 0.991|-0.001| 0.004| 1.004|0.993| 0.002|0.001|
|1.006| 0.986|-0.008|-0.004| 0.993|1.033| 1.023|1.050|
|1.010| 0.973| 1.032| 0.959|     -|    -|     -|    -|
+-----+------+------+------+------+-----+------+-----+
      


In [13]:
//Print the variance of columns
println(s"Higgs Features Variance Values = ${Tabulizer.table(featuresSummary.variance, 8)}")


Higgs Features Variance Values = 
+-----+-----+-----+-----+-----+-----+-----+-----+
|0.316|1.010|1.012|0.354|1.014|0.224|1.017|1.017|
|1.056|0.248|1.010|1.014|1.101|0.238|1.017|1.011|
|1.431|0.255|1.018|1.014|1.951|0.426|0.138|0.027|
|0.159|0.274|0.132|0.098|    -|    -|    -|    -|
+-----+-----+-----+-----+-----+-----+-----+-----+
      


In [14]:
// Calculate how many nonzeros we have in each column
val nonZeros = featuresSummary.numNonzeros
println(s"Non-zero values count per column: ${Tabulizer.table(nonZeros, cols = 8, format = "%.0f")}")


Non-zero values count per column: 
+------+------+------+------+------+------+------+------+
|100000|100000|100000|100000|100000|100000|100000|100000|
| 50907|100000|100000|100000| 50023|100000|100000|100000|
| 43176|100000|100000|100000| 34973|100000|100000|100000|
|100000|100000|100000|100000|     -|     -|     -|     -|
+------+------+------+------+------+------+------+------+
      


nonZeros = [100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,50907.0,100000.0,100000.0,100000.0,50023.0,100000.0,100000.0,100000.0,43176.0,100000.0,100000.0,100000.0,34973.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0]


[100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,50907.0,100000.0,100000.0,100000.0,50023.0,100000.0,100000.0,100000.0,43176.0,100000.0,100000.0,100000.0,34973.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0]

In [15]:
val numRows = featuresMatrix.numRows
val numCols = featuresMatrix.numCols


numRows = 100000
numCols = 28


28

In [16]:
val colsWithZeros = nonZeros
  .toArray
  .zipWithIndex
  .filter { case (rows, idx) => rows != numRows }


colsWithZeros = Array((50907.0,8), (50023.0,12), (43176.0,16), (34973.0,20))


[(50907.0,8), (50023.0,12), (43176.0,16), (34973.0,20)]

In [17]:
println(s"Columns with zeros:\n${Tabulizer.table(Seq("#zeros", "column"), colsWithZeros, Map.empty[Int, String])}")


Columns with zeros:

+-------+------+
| #zeros|column|
+-------+------+
|50907.0|     8|
|50023.0|    12|
|43176.0|    16|
|34973.0|    20|
+-------+------+
      


In [18]:
val sparsity = nonZeros.toArray.sum / (numRows * numCols)
println(f"Data sparsity: ${sparsity}%.2f")


Data sparsity: 0.92


sparsity = 0.9210996428571429


0.9210996428571429

In [19]:
val responseValues = response.distinct.collect
println(s"Response values: ${responseValues.mkString(", ")}")


Response values: 0, 1                                                           


responseValues = Array(0, 1)


[0, 1]

In [20]:
val responseDistribution = response.map(v => (v,1)).countByKey
println(s"Response distribution:\n${Tabulizer.table(responseDistribution)}")


Response distribution:                                                          

+-----+-----+
|    0|    1|
+-----+-----+
|47166|52834|
+-----+-----+
      


responseDistribution = Map(0 -> 47166, 1 -> 52834)


Map(0 -> 47166, 1 -> 52834)

In [21]:
val higgs = response.zip(features).map { case (response, features) => LabeledPoint(response, features) }
higgs.setName("higgs").cache()


higgs = higgs MapPartitionsRDD[15] at map at <console>:47


higgs MapPartitionsRDD[15] at map at <console>:47

In [22]:
val trainTestSplits = higgs.randomSplit(Array(0.8, 0.2))
val (trainingData, testData) = (trainTestSplits(0), trainTestSplits(1))


trainTestSplits = Array(MapPartitionsRDD[16] at randomSplit at <console>:49, MapPartitionsRDD[17] at randomSplit at <console>:49)
trainingData = MapPartitionsRDD[16] at randomSplit at <console>:49
testData = MapPartitionsRDD[17] at randomSplit at <console>:49


MapPartitionsRDD[17] at randomSplit at <console>:49

###  === Tree Model ===  

In [23]:
val dtNumClasses = 2
val dtCategoricalFeaturesInfo = Map[Int, Int]()
val dtImpurity = "gini"
val dtMaxDepth = 5
val dtMaxBins = 10


dtNumClasses = 2
dtCategoricalFeaturesInfo = Map()
dtImpurity = gini
dtMaxDepth = 5
dtMaxBins = 10


10

In [24]:
val dtreeModel = DecisionTree.trainClassifier(trainingData,
                                              dtNumClasses,
                                              dtCategoricalFeaturesInfo,
                                              dtImpurity,
                                              dtMaxDepth,
                                              dtMaxBins)


[Stage 14:>                                                         (0 + 3) / 3]

dtreeModel = DecisionTreeModel classifier of depth 5 with 63 nodes


DecisionTreeModel classifier of depth 5 with 63 nodes

In [25]:
println(s"Decision Tree Model:\n${dtreeModel.toDebugString}")


Decision Tree Model:
DecisionTreeModel classifier of depth 5 with 63 nodes
  If (feature 25 <= 1.0560009479522705)
   If (feature 25 <= 0.6134785413742065)
    If (feature 27 <= 0.8689159750938416)
     If (feature 5 <= 0.8908801674842834)
      If (feature 22 <= 0.7452906966209412)
       Predict: 1.0
      Else (feature 22 > 0.7452906966209412)
       Predict: 0.0
     Else (feature 5 > 0.8908801674842834)
      If (feature 27 <= 0.7907640337944031)
       Predict: 1.0
      Else (feature 27 > 0.7907640337944031)
       Predict: 1.0
    Else (feature 27 > 0.8689159750938416)
     If (feature 22 <= 0.9895526170730591)
      If (feature 24 <= 1.0830920934677124)
       Predict: 0.0
      Else (feature 24 > 1.0830920934677124)
       Predict: 0.0
     Else (feature 22 > 0.9895526170730591)
      If (feature 5 <= 1.5511850118637085)
       Predict: 0.0
      Else (feature 5 > 1.5511850118637085)
       Predict: 1.0
   Else (feature 25 > 0.6134785413742065)
    If (feature 26 <= 0.7856372

In [26]:
val treeLabelAndPreds = testData.map { point =>
  val prediction = dtreeModel.predict(point.features)
  (point.label.toInt, prediction.toInt)
}


treeLabelAndPreds = MapPartitionsRDD[42] at map at <console>:49


MapPartitionsRDD[42] at map at <console>:49

In [27]:
val treeTestErr = treeLabelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
println(f"Tree Model: Test Error = ${treeTestErr}%.3f")


Tree Model: Test Error = 0.333


treeTestErr = 0.33283648829929946


0.33283648829929946

In [28]:
val cm = treeLabelAndPreds.combineByKey(
  createCombiner = (label: Int) => if (label == 0) (1,0) else (0,1),
  mergeValue = (v: (Int,Int), label: Int) => if (label == 0) (v._1 +1, v._2) else (v._1, v._2 + 1),
  mergeCombiners = (v1: (Int,Int), v2: (Int,Int)) => (v1._1 + v2._1, v1._2 + v2._2)).collect


cm = Array((0,(5430,4083)), (1,(2616,7998)))


[(0,(5430,4083)), (1,(2616,7998))]

In [29]:
val (tn, tp, fn, fp) = (cm(0)._2._1, cm(1)._2._2, cm(1)._2._1, cm(0)._2._2)
println(f"""Confusion Matrix
            |   ${0}%5d ${1}%5d  ${"Err"}%10s
            |0  ${tn}%5d ${fp}%5d ${tn+fp}%5d ${fp.toDouble/(tn+fp)}%5.4f
            |1  ${fn}%5d ${tp}%5d ${fn+tp}%5d ${fn.toDouble/(fn+tp)}%5.4f
            |   ${tn+fn}%5d ${fp+tp}%5d ${tn+fp+fn+tp}%5d ${(fp+fn).toDouble/(tn+fp+fn+tp)}%5.4f
            |""".stripMargin)


Confusion Matrix
       0     1         Err
0   5430  4083  9513 0.4292
1   2616  7998 10614 0.2465
    8046 12081 20127 0.3328



tn = 5430
tp = 7998
fn = 2616
fp = 4083


4083

In [30]:
type Predictor = {
  def predict(features: Vector): Double
}


defined type alias Predictor


In [31]:
def computeMetrics(model: Predictor, data: RDD[LabeledPoint]): BinaryClassificationMetrics = {
  val predAndLabels = data.map(newData => (model.predict(newData.features), newData.label))
  new BinaryClassificationMetrics(predAndLabels)
}


computeMetrics: (model: Predictor, data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])org.apache.spark.mllib.evaluation.BinaryClassificationMetrics


In [32]:
val treeMetrics = computeMetrics(dtreeModel, testData)
println(f"Tree Model: AUC on Test Data = ${treeMetrics.areaUnderROC()}%.3f")


Tree Model: AUC on Test Data = 0.662


treeMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@1479d9e7


org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@1479d9e7

### === Random Forest Model ===


In [33]:
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val numTrees = 10
val featureSubsetStrategy = "auto"
val impurity = "gini"
val maxDepth = 5
val maxBins = 10
val seed = 42


numClasses = 2
categoricalFeaturesInfo = Map()
numTrees = 10
featureSubsetStrategy = auto
impurity = gini
maxDepth = 5
maxBins = 10
seed = 42


42

In [34]:
val rfModel = RandomForest.trainClassifier(trainingData,
                                           numClasses,
                                           categoricalFeaturesInfo,
                                           numTrees,
                                           featureSubsetStrategy,
                                           impurity,
                                           maxDepth,
                                           maxBins,
                                           seed)



rfModel = 


TreeEnsembleModel classifier with 10 trees


In [35]:
def computeError(model: Predictor, data: RDD[LabeledPoint]): Double = {
  val labelAndPreds = data.map { point =>
    val prediction = model.predict(point.features)
    (point.label, prediction)
                               }
  labelAndPreds.filter(r => r._1 != r._2).count.toDouble/data.count
}


computeError: (model: Predictor, data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])Double


In [36]:
val rfTestErr = computeError(rfModel, testData)
println(f"RF Model: Test Error = ${rfTestErr}%.3f")


RF Model: Test Error = 0.326


rfTestErr = 0.3264768718636657


0.3264768718636657

In [37]:
val rfMetrics = computeMetrics(rfModel, testData)
println(f"RF Model: AUC on Test Data = ${rfMetrics.areaUnderROC}%.3f")


RF Model: AUC on Test Data = 0.668


rfMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@20f47655


org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@20f47655

### The below cell might take some time to run + we might also need large enough memory depending on the grid search parameters...

In [38]:
val rfGrid =
for (
  gridNumTrees <- Array(10, 11);
  gridImpurity <- Array("entropy", "gini");
  gridDepth <- Array(10);
  gridBins <- Array(12,14)) yield {
  val gridModel = RandomForest.trainClassifier(trainingData, 2, Map[Int, Int](), gridNumTrees, "auto", gridImpurity, gridDepth, gridBins)
  val gridAUC = computeMetrics(gridModel, testData).areaUnderROC
  val gridErr = computeError(gridModel, testData)
  ((gridNumTrees, gridImpurity, gridDepth, gridBins), gridAUC, gridErr)
}


rfGrid = Array(((10,entropy,10,12),0.6917657292864599,0.30426789884235106), ((10,entropy,10,14),0.693615359708,0.30223083420281216), ((10,gini,10,12),0.6909380707023332,0.3052615889104188), ((10,gini,10,14),0.6888254488799563,0.3068514930193273), ((11,entropy,10,12),0.6978558156441422,0.30054156108709695), ((11,entropy,10,14),0.6893462173122175,0.307199284543151), ((11,gini,10,12),0.6993283476236767,0.2994485020122224), ((11,gini,10,14),0.6943655009713582,0.30396979182193073))


[((10,entropy,10,12),0.6917657292864599,0.30426789884235106), ((10,entropy,10,14),0.693615359708,0.30223083420281216), ((10,gini,10,12),0.6909380707023332,0.3052615889104188), ((10,gini,10,14),0.6888254488799563,0.3068514930193273), ((11,entropy,10,12),0.6978558156441422,0.30054156108709695), ((11,entropy,10,14),0.6893462173122175,0.307199284543151), ((11,gini,10,12),0.6993283476236767,0.2994485020122224), ((11,gini,10,14),0.6943655009713582,0.30396979182193073)]

In [39]:
println(
  s"""RF Model: Grid results:
     ~${Tabulizer.table(Seq("trees, impurity, depth, bins", "AUC", "error"), rfGrid, format = Map(1 -> "%.3f", 2 -> "%.3f"))}
   """.stripMargin('~'))


RF Model: Grid results:

+----------------------------+-----+-----+
|trees, impurity, depth, bins|  AUC|error|
+----------------------------+-----+-----+
|          (10,entropy,10,12)|0.692|0.304|
|          (10,entropy,10,14)|0.694|0.302|
|             (10,gini,10,12)|0.691|0.305|
|             (10,gini,10,14)|0.689|0.307|
|          (11,entropy,10,12)|0.698|0.301|
|          (11,entropy,10,14)|0.689|0.307|
|             (11,gini,10,12)|0.699|0.299|
|             (11,gini,10,14)|0.694|0.304|
+----------------------------+-----+-----+
      
   


In [40]:
val rfParamsMaxAUC = rfGrid.maxBy(g => g._2)
println(f"RF Model: Parameters ${rfParamsMaxAUC._1}%s producing max AUC = ${rfParamsMaxAUC._2}%.3f (error = ${rfParamsMaxAUC._3}%.3f)")


RF Model: Parameters (11,gini,10,12) producing max AUC = 0.699 (error = 0.299)


rfParamsMaxAUC = ((11,gini,10,12),0.6993283476236767,0.2994485020122224)


((11,gini,10,12),0.6993283476236767,0.2994485020122224)

### === Gradient Boosted Trees Model ===


In [41]:
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.configuration.Algo


In [42]:
val gbmStrategy = BoostingStrategy.defaultParams(Algo.Classification)
gbmStrategy.setNumIterations(10)
gbmStrategy.setLearningRate(0.1)
gbmStrategy.treeStrategy.setNumClasses(2)
gbmStrategy.treeStrategy.setMaxDepth(10)
gbmStrategy.treeStrategy.setCategoricalFeaturesInfo(java.util.Collections.emptyMap[Integer, Integer])


gbmStrategy = BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@4db66399,org.apache.spark.mllib.tree.loss.LogLoss$@dc2fc01,10,0.1,0.001)


BoostingStrategy(org.apache.spark.mllib.tree.configuration.Strategy@4db66399,org.apache.spark.mllib.tree.loss.LogLoss$@dc2fc01,10,0.1,0.001)

In [43]:
val gbmModel = GradientBoostedTrees.train(trainingData, gbmStrategy)


gbmModel = 


TreeEnsembleModel classifier with 10 trees


In [44]:
val gbmTestErr = computeError(gbmModel, testData)
println(f"GBM Model: Test Error = ${gbmTestErr}%.3f")


GBM Model: Test Error = 0.304


gbmTestErr = 0.30436726784915785


0.30436726784915785

In [45]:
val gbmMetrics = computeMetrics(dtreeModel, testData)
println(f"GBM Model: AUC on Test Data = ${gbmMetrics.areaUnderROC()}%.3f")


GBM Model: AUC on Test Data = 0.662


gbmMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@4b3c5fe9


org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@4b3c5fe9

In [46]:
val gbmGrid =
for (
  gridNumIterations <- Array(5, 10);
  gridDepth <- Array(2, 3);
  gridLearningRate <- Array(0.1, 0.01))
  yield {
    gbmStrategy.setNumIterations(gridNumIterations)
    gbmStrategy.treeStrategy.setMaxDepth(gridDepth)
    gbmStrategy.setLearningRate(gridLearningRate)

    val gridModel = GradientBoostedTrees.train(trainingData, gbmStrategy)
    val gridAUC = computeMetrics(gridModel, testData).areaUnderROC
    val gridErr = computeError(gridModel, testData)
    ((gridNumIterations, gridDepth, gridLearningRate), gridAUC, gridErr)
  }


gbmGrid = Array(((5,2,0.1),0.6392499431173206,0.3590699060962886), ((5,2,0.01),0.6334560953363809,0.3665722661102002), ((5,3,0.1),0.6676326323141039,0.33114721518358425), ((5,3,0.01),0.6541108365173669,0.3441148705718686), ((10,2,0.1),0.646501259143939,0.3509216475381329), ((10,2,0.01),0.6334560953363809,0.3665722661102002), ((10,3,0.1),0.6754064846076272,0.3227008496050082), ((10,3,0.01),0.6618918641397387,0.33626471903413324))


[((5,2,0.1),0.6392499431173206,0.3590699060962886), ((5,2,0.01),0.6334560953363809,0.3665722661102002), ((5,3,0.1),0.6676326323141039,0.33114721518358425), ((5,3,0.01),0.6541108365173669,0.3441148705718686), ((10,2,0.1),0.646501259143939,0.3509216475381329), ((10,2,0.01),0.6334560953363809,0.3665722661102002), ((10,3,0.1),0.6754064846076272,0.3227008496050082), ((10,3,0.01),0.6618918641397387,0.33626471903413324)]

In [47]:
println(
  s"""GBM Model: Grid results:
     ~${Tabulizer.table(Seq("iterations, depth, learningRate", "AUC", "error"), gbmGrid.sortBy(-_._2).take(10), format = Map(1 -> "%.3f", 2 -> "%.3f"))}
   """.stripMargin('~'))



GBM Model: Grid results:

+-------------------------------+-----+-----+
|iterations, depth, learningRate|  AUC|error|
+-------------------------------+-----+-----+
|                     (10,3,0.1)|0.675|0.323|
|                      (5,3,0.1)|0.668|0.331|
|                    (10,3,0.01)|0.662|0.336|
|                     (5,3,0.01)|0.654|0.344|
|                     (10,2,0.1)|0.647|0.351|
|                      (5,2,0.1)|0.639|0.359|
|                     (5,2,0.01)|0.633|0.367|
|                    (10,2,0.01)|0.633|0.367|
+-------------------------------+-----+-----+
      
   


In [48]:
val gbmParamsMaxAUC = gbmGrid.maxBy(g => g._2)
println(f"GBM Model: Parameters ${gbmParamsMaxAUC._1}%s producing max AUC = ${gbmParamsMaxAUC._2}%.3f (error = ${gbmParamsMaxAUC._3}%.3f)")


GBM Model: Parameters (10,3,0.1) producing max AUC = 0.675 (error = 0.323)


gbmParamsMaxAUC = ((10,3,0.1),0.6754064846076272,0.3227008496050082)


((10,3,0.1),0.6754064846076272,0.3227008496050082)