In [1]:
val gender_submission = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/gender_submission.csv")

gender_submission = [PassengerId: int, Survived: int]


[PassengerId: int, Survived: int]

In [74]:
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature.{IndexToString, VectorIndexer}
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

In [75]:
val test = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/test.csv")

test.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



test = [PassengerId: int, Pclass: int ... 9 more fields]


[PassengerId: int, Pclass: int ... 9 more fields]

In [76]:
val df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/train.csv")

df = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [77]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [78]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [79]:
import org.apache.spark.sql.functions._

val agemeanValue = df.agg(mean(df("Age"))).first.getDouble(0)

val faremeanValue = df.agg(mean(df("Fare"))).first.getDouble(0)

val fixedDf = df.na.fill(agemeanValue, Array("Age"))

val reFixed = fixedDf.na.fill(agemeanValue, Array("Fare"))


agemeanValue = 29.69911764705882
faremeanValue = 32.2042079685746
fixedDf = [PassengerId: int, Survived: int ... 10 more fields]
reFixed = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [48]:
val Array(trainingData, testData) = reFixed.randomSplit(Array(0.7, 0.3))

trainingData = [PassengerId: int, Survived: int ... 10 more fields]
testData = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [85]:
//Indexing categorical fetures
val featuresCatColNames = Seq("Pclass", "Sex")
val stringIndexers = featuresCatColNames.map { colName =>
  new StringIndexer()
    .setInputCol(colName)
    .setOutputCol(colName + "Indexed")
    .fit(trainingData)
}


//Indexing label
val labelIndexer = new StringIndexer()
.setInputCol("Survived")
.setOutputCol("SurvivedIndexed")
.fit(trainingData)

val featuresNumColNames = Seq("Age", "SibSp", "Parch", "Fare")
val indexedfeaturesCatColNames = featuresCatColNames.map(_ + "Indexed")
val allIndexedFeaturesColNames = featuresNumColNames ++ indexedfeaturesCatColNames
val assembler = new VectorAssembler()
  .setInputCols(Array(allIndexedFeaturesColNames: _*))
  .setOutputCol("Features")


featuresCatColNames = List(Pclass, Sex)
stringIndexers = List(strIdx_925e72d1943e, strIdx_5bd5f1360b34)
labelIndexer = strIdx_d0fe150ad733
featuresNumColNames = List(Age, SibSp, Parch, Fare)
indexedfeaturesCatColNames = List(PclassIndexed, SexIndexed)
allIndexedFeaturesColNames = List(Age, SibSp, Parch, Fare, PclassIndexed, SexIndexed)
assembler = vecAssembler_223a45bad371


vecAssembler_223a45bad371

In [86]:
val randomForest = new RandomForestClassifier()
  .setLabelCol("SurvivedIndexed")
  .setFeaturesCol("Features")

//Retrieve original labels
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)

randomForest = rfc_ee75c738638e
labelConverter = idxToStr_76085f42583d


idxToStr_76085f42583d

In [87]:
// define the order of the operations to be performed
val pipeline = new Pipeline().setStages(
  (stringIndexers :+ labelIndexer :+ assembler :+ randomForest :+ labelConverter).toArray)


pipeline = pipeline_85662be05d38


pipeline_85662be05d38

In [88]:
 // grid of values to perform cross validation on
 val paramGrid = new ParamGridBuilder()
      .addGrid(randomForest.maxBins, Array(25, 28, 31))
      .addGrid(randomForest.maxDepth, Array(4, 6, 8))
      .addGrid(randomForest.impurity, Array("entropy", "gini"))
      .build()

val evaluator = new BinaryClassificationEvaluator()
      .setLabelCol("SurvivedIndexed")

val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(10)

    // train the model
val crossValidatorModel = cv.fit(trainingData)

paramGrid = 


Array({
	rfc_ee75c738638e-impurity: entropy,
	rfc_ee75c738638e-maxBins: 25,
	rfc_ee75c738638e-maxDepth: 4
}, {
	rfc_ee75c738638e-impurity: entropy,
	rfc_ee75c738638e-maxBins: 25,
	rfc_ee75c738638e-maxDepth: 6
}, {
	rfc_ee75c738638e-impurity: entropy,
	rfc_ee75c738638e-maxBins: 25,
	rfc_ee75c738638e-maxDepth: 8
}, {
	rfc_ee75c738638e-impurity: gini,
	rfc_ee75c738638e-maxBins: 25,
	rfc_ee75c738638e-maxDepth: 4
}, {
	rfc_ee75c738638e-impurity: gini,
	rfc_ee75c738638e-maxBins: 25,
	rfc_ee75c738638e-maxDepth: 6
}, {
	rfc_ee75c738638e-impurity: gini,
	rfc_ee75c738638e-maxBins: 25,
	rfc_ee75c738638e-maxDepth: 8
}, {
	rfc_ee75c738638e-impurity: entropy,
	rfc_ee75c738638e-maxBins: 28,
	rfc_ee75c738638e-maxDepth: 4
}, {
	rfc_ee75c738638e-impu...


In [89]:
// make predictions
val predictions = crossValidatorModel.transform(testData)

//Accuracy
val accuracy = evaluator.evaluate(predictions)
println("Test Error DT= " + (1.0 - accuracy))


Test Error DT= 0.12609173462895495


predictions = [PassengerId: int, Survived: int ... 18 more fields]
accuracy = 0.873908265371045


0.873908265371045