### Import necessary spark functions.

In [1]:
import org.apache.spark.sql.functions._ 
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

In [8]:
val train_small = spark.read.csv("train-sample-small.csv")

train_small = [_c0: string, _c1: string ... 13 more fields]


[_c0: string, _c1: string ... 13 more fields]

In [53]:
import org.apache.spark.sql.types._

val customSchema = StructType(Array(
    StructField("PostId", DoubleType, true),
    StructField("PostCreationDate", StringType, true),
    StructField("OwnerUserId", DoubleType, true),
    StructField("OwnerCreationDate", StringType, true),
    StructField("ReputationAtPostCreation", DoubleType, true),
    StructField("OwnerUndeletedAnswerCountAtPostTime", DoubleType, true),
    StructField("Title", StringType, true),
    StructField("BodyMarkdown", StringType, true),
    StructField("Tag1", StringType, true),
    StructField("Tag2", StringType, true),
    StructField("Tag3", StringType, true),
    StructField("Tag4", StringType, true),
    StructField("Tag5", StringType, true),
    StructField("PostClosedDate", StringType, true),
    StructField("OpenStatus", StringType, true)
))

val temp = spark.
    read.
    option("quote", "\"").
    option("escape", "\"").
    option("header", "true").
    option("multiLine", "true").
    schema(customSchema).
    csv("train-sample-small.csv")

customSchema = StructType(StructField(PostId,DoubleType,true), StructField(PostCreationDate,StringType,true), StructField(OwnerUserId,DoubleType,true), StructField(OwnerCreationDate,StringType,true), StructField(ReputationAtPostCreation,DoubleType,true), StructField(OwnerUndeletedAnswerCountAtPostTime,DoubleType,true), StructField(Title,StringType,true), StructField(BodyMarkdown,StringType,true), StructField(Tag1,StringType,true), StructField(Tag2,StringType,true), StructField(Tag3,StringType,true), StructField(Tag4,StringType,true), StructField(Tag5,StringType,true), StructField(PostClosedDate,StringType,true), StructField(OpenStatus,StringType,true))
temp = [PostId: double, PostC...


[PostId: double, PostC...

In [54]:
temp.count()

14027

In [56]:
temp.    
    groupBy("OpenStatus").
    count().
    orderBy(col("count").desc).
    show()

+-------------------+-----+
|         OpenStatus|count|
+-------------------+-----+
|               open| 7075|
|not a real question| 3049|
|          off topic| 1750|
|   not constructive| 1547|
|      too localized|  606|
+-------------------+-----+



### Read sample of training data set in from S3 bucket.

In [27]:
val train_small = spark.
    read.
    option("inferSchema", "true").
    option("header", "true").
    csv("s3://stackoverflow-kaggle/data/train-sample.csv")

Name: java.io.IOException
Message: No FileSystem for scheme: s3
StackTrace:   at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
  at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
  at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
  at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
  at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
  at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
  at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
  at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(Traversab

In [3]:
train_small.show()

+--------------------+--------------------+-----------+-------------------+------------------------+-----------------------------------+--------------------+--------------------+----+----+----+----+----+--------------+----------+
|              PostId|    PostCreationDate|OwnerUserId|  OwnerCreationDate|ReputationAtPostCreation|OwnerUndeletedAnswerCountAtPostTime|               Title|        BodyMarkdown|Tag1|Tag2|Tag3|Tag4|Tag5|PostClosedDate|OpenStatus|
+--------------------+--------------------+-----------+-------------------+------------------------+-----------------------------------+--------------------+--------------------+----+----+----+----+----+--------------+----------+
|             2221995| 02/08/2010 13:57:35|      60200|01/29/2009 13:43:43|                     260|                                 11|3d object visible...|"I'm playing arou...|null|null|null|null|null|          null|      null|
|I want to just sh...|                null|       null|               null|     

## Simple model: Random Forest with Tags as features

To start with, we'll take a simple approach - encode the Tag columns as categorical variables and predict `OpenStatus` using Random Forest. Spark's RF Classifier example will serve as a reference ([link](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala)).

In [4]:
val simple_train = train_small.
        select("Tag1", "Tag2", "Tag3", "Tag4", "Tag5", "OpenStatus")

simple_train = [Tag1: string, Tag2: string ... 4 more fields]


[Tag1: string, Tag2: string ... 4 more fields]

### EDA - frequency of `OpenStatus` and `Tag1`

We see that `null` is the most often observed response by a large margin --- our data set is very imbalanced.

In [5]:
val status_counts = simple_train.
    groupBy("OpenStatus").
    count().
    orderBy(col("count").desc)

status_counts.show()

+--------------------+------+
|          OpenStatus| count|
+--------------------+------+
|                null|179428|
| not a real question|   736|
|                open|   578|
|           off topic|   335|
|    not constructive|   289|
|                  NA|    33|
|       too localized|    32|
|         item_number|     5|
|               4(8)>|     2|
|org.springframewo...|     1|
|              iphone|     1|
|       data['Notes']|     1|
| 01/06/2012 02:37:00|     1|
| 02/21/2011 17:25:58|     1|
|                   l|     1|
|               '079'|     1|
| this can only wo...|     1|
|               '{5}'|     1|
| 06/14/2012 01:19:12|     1|
| 09/17/2011 08:13:57|     1|
+--------------------+------+
only showing top 20 rows



status_counts = [OpenStatus: string, count: bigint]


[OpenStatus: string, count: bigint]

As it turns out pretty much every null `OpenStatus` is paired with a null `Tag1` (at leas in our sample training set).

Note: in the join the use of `<=>` allows for joining on null values of `tag1` ([so_link](https://stackoverflow.com/a/41729359)).

In [38]:
val tag1_counts = simple_train.
    groupBy("tag1").
    count().
    withColumnRenamed("count", "tag1_count").
    orderBy(col("count").desc)

val tag1_status = simple_train.
    groupBy("tag1", "OpenStatus").
    count().
    join(tag1_counts, train_small("tag1") <=> tag1_counts("tag1"), joinType = "left").
    where(col("count") > 20).
    orderBy(col("tag1_count").desc_nulls_first, col("count").desc_nulls_first)

tag1_status.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

tag1_counts: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [tag1: string, tag1_count: bigint]
tag1_status: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [tag1: string, OpenStatus: string ... 3 more fields]
+-------------------+-------------------+-------+-------------------+----------+
|               tag1|         OpenStatus|  count|               tag1|tag1_count|
+-------------------+-------------------+-------+-------------------+----------+
|               null|               null|1761529|               null|   1761536|
|               open|               null|   6058|               open|      6060|
|               java|not a real question|    840|               java|      1709|
|               java|               open|    422|               java|      1709|
|               java|   not constructive|    313|               java|      1709|
|               java|          off topic|    105|               java|      1709|
|               java|      too localized|  

### Pipeline for Random Forest

Since there are null `Tag1` values, we use `setHandleInvalid("keep")` so that they are indexed rather than dropped ([so_link](https://stackoverflow.com/a/36113473)).

In [40]:
val tag1Indexer = new StringIndexer().
    setInputCol("Tag1").
    setOutputCol("indexedTag1").
    setHandleInvalid("keep").
    fit(simple_train)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

tag1Indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_ba0487a209f3


In [42]:
tag1Indexer.
    transform(simple_train).
    select("Tag1", "indexedTag1").
    show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+
|Tag1|indexedTag1|
+----+-----------+
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
|null|    10609.0|
+----+-----------+
only showing top 20 rows



We want to create a StringIndexer for each Tag column; rather than create 5 variables we'll take a functional approach:

In [70]:
val featureCols = Array[String]("Tag1", "Tag2", "Tag3", "Tag4", "Tag5")

val featureIndexers = featureCols.map { colName =>
    new StringIndexer().
        setInputCol(colName).
        setOutputCol("indexed" + colName).
        setHandleInvalid("keep").
        fit(simple_train)
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

featureCols: Array[String] = Array(Tag1, Tag2, Tag3, Tag4, Tag5)
featureIndexers: Array[org.apache.spark.ml.feature.StringIndexerModel] = Array(strIdx_3f2aedc88dcd, strIdx_b80161de7b98, strIdx_285c550b6cee, strIdx_a152531a9eae, strIdx_db6635162996)


Spark ML models expect a feature vector to be the only predictor.  [`VectorAssembler`](https://spark.apache.org/docs/latest/ml-features.html#vectorassembler) is a transformer that combines a list of columns into a single vector column.

In [54]:
val assembler = new VectorAssembler().
    setInputCols(featureCols.map{x => "indexed" + x}).
    setOutputCol("features")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_47f54c17f830


Just as with the categorical features, we index the response:

In [71]:
val labelIndexer = new StringIndexer().
    setInputCol("OpenStatus").
    setOutputCol("label").
    setHandleInvalid("keep").
    fit(simple_train)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_763c47d16a69


Since the response is indexed, we need a way to transform the predicted response back to its original string value.  This inverse transformer is called [`IndexToString`](https://spark.apache.org/docs/latest/ml-features.html#indextostring):

In [72]:
val labelConverter = new IndexToString().
    setInputCol("prediction").
    setOutputCol("predictionLabel").
    setLabels(labelIndexer.labels)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

labelConverter: org.apache.spark.ml.feature.IndexToString = idxToStr_16829d1c7c43


Finally we can specify our model, a [`RandomForestClassifer`](https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier):

In [75]:
val rf = new RandomForestClassifier().
    setLabelCol("label").
    setFeaturesCol("features").
    setNumTrees(10).
    setMaxBins(12000)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

rf: org.apache.spark.ml.classification.RandomForestClassifier = rfc_9df6a58d40ea


In [76]:
val pipeline = new Pipeline().
    setStages(featureIndexers ++ Array(assembler, labelIndexer, rf, labelConverter))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pipeline: org.apache.spark.ml.Pipeline = pipeline_b75699e47b54


### Fit Pipeline

In [68]:
val Array(trainingData, testData) = simple_train.randomSplit(Array(0.7, 0.3))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Tag1: string, Tag2: string ... 4 more fields]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Tag1: string, Tag2: string ... 4 more fields]


In [77]:
val model = pipeline.fit(trainingData)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

model: org.apache.spark.ml.PipelineModel = pipeline_b75699e47b54


In [79]:
val predictions = model.transform(testData)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

predictions: org.apache.spark.sql.DataFrame = [Tag1: string, Tag2: string ... 15 more fields]


In [81]:
predictions.select("predictionLabel", "label", "features").show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 616.0 failed 4 times, most recent failure: Lost task 0.3 in stage 616.0 (TID 6026, ip-172-31-8-106.us-west-2.compute.internal, executor 26): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$10: (double) => string)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:291)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:283)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun

In [78]:
val evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_a06798eca4b7
