In [6]:
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{IndexToString, VectorIndexer}
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

In [7]:
val gender_submission = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/gender_submission.csv")

gender_submission = [PassengerId: int, Survived: int]


[PassengerId: int, Survived: int]

In [8]:
val test = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/test.csv")

test.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



test = [PassengerId: int, Pclass: int ... 9 more fields]


[PassengerId: int, Pclass: int ... 9 more fields]

In [9]:
val df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/train.csv")

df = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [10]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [11]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [12]:
import org.apache.spark.sql.functions._

val agemeanValue = df.agg(mean(df("Age"))).first.getDouble(0)

val faremeanValue = df.agg(mean(df("Fare"))).first.getDouble(0)

val fixedDf = df.na.fill(agemeanValue, Array("Age"))

val reFixed = fixedDf.na.fill(agemeanValue, Array("Fare"))

agemeanValue = 29.69911764705882
faremeanValue = 32.2042079685746
fixedDf = [PassengerId: int, Survived: int ... 10 more fields]
reFixed = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [20]:
val Array(recvTrainData, recvTestData) = reFixed.randomSplit(Array(0.8, 0.2))

val trainingData = recvTrainData.withColumnRenamed("Survived", "label")

val testData = recvTestData.withColumnRenamed("Survived", "label")

recvTrainData = [PassengerId: int, Survived: int ... 10 more fields]
recvTestData = [PassengerId: int, Survived: int ... 10 more fields]
trainingData = [PassengerId: int, label: int ... 10 more fields]
testData = [PassengerId: int, label: int ... 10 more fields]


[PassengerId: int, label: int ... 10 more fields]

In [21]:
val cols = Array("Sex", "Pclass")
val stringIndexers = cols.map { eachCol =>
  new StringIndexer(eachCol)
    .setInputCol(eachCol)
    .setOutputCol(eachCol + "_Indexed")
    .fit(trainingData)
}

cols = Array(Sex, Pclass)
stringIndexers = Array(Sex, Pclass)


Array(Sex, Pclass)

In [22]:
val cols = Array("Fare", "Age", "SibSp", "Parch", "Pclass_Indexed", "Sex_Indexed")
val assembler = new VectorAssembler()
  .setInputCols(cols)
  .setOutputCol("Features")

cols = Array(Fare, Age, SibSp, Parch, Pclass_Indexed, Sex_Indexed)
assembler = vecAssembler_9af1fd261440


vecAssembler_9af1fd261440

In [23]:
val randomForest = new RandomForestClassifier()
  .setLabelCol("label")
  .setFeaturesCol("Features")

randomForest = rfc_f3469863e9ff


rfc_f3469863e9ff

In [24]:
val pipeline = new Pipeline().setStages((stringIndexers :+ assembler :+ randomForest ).toArray)

pipeline = pipeline_b75dd74f59bb


pipeline_b75dd74f59bb

In [25]:
val paramGrid = new ParamGridBuilder()
      .addGrid(randomForest.maxBins, Array(23, 29, 31))
      .addGrid(randomForest.maxDepth, Array(4, 6, 8))
      .addGrid(randomForest.impurity, Array("entropy", "gini"))
      .build()

val evaluator = new BinaryClassificationEvaluator()
        .setLabelCol("label")

val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(10)

    // train the model
val crossValidatorModel = cv.fit(trainingData)

paramGrid = 


Array({
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 23,
	rfc_f3469863e9ff-maxDepth: 4
}, {
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 29,
	rfc_f3469863e9ff-maxDepth: 4
}, {
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 31,
	rfc_f3469863e9ff-maxDepth: 4
}, {
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 23,
	rfc_f3469863e9ff-maxDepth: 6
}, {
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 29,
	rfc_f3469863e9ff-maxDepth: 6
}, {
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 31,
	rfc_f3469863e9ff-maxDepth: 6
}, {
	rfc_f3469863e9ff-impurity: entropy,
	rfc_f3469863e9ff-maxBins: 23,
	rfc_f3469863e9ff-maxDepth: 8
}, {
	rfc_f3469863...


In [26]:
val predictions = crossValidatorModel.transform(testData)

val accuracy = evaluator.evaluate(predictions)


predictions = [PassengerId: int, label: int ... 16 more fields]
accuracy = 0.8923680361654037


0.8923680361654037