In [1]:
val gender_submission = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/gender_submission.csv")

gender_submission = [PassengerId: int, Survived: int]


[PassengerId: int, Survived: int]

In [1]:
val test = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/test.csv")

test.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



test = [PassengerId: int, Pclass: int ... 9 more fields]


[PassengerId: int, Pclass: int ... 9 more fields]

In [2]:
val df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("data/train.csv")

df = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [3]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [5]:
import org.apache.spark.sql.functions._

val meanValue = df.agg(mean(df("Age"))).first.getDouble(0)
val fixedDf = df.na.fill(meanValue, Array("Age", "Embarked", "Cabin"))


meanValue = 29.69911764705882
fixedDf = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [6]:
val renamedDF = fixedDf.withColumnRenamed("Survived", "label")

renamedDF = [PassengerId: int, label: int ... 10 more fields]


[PassengerId: int, label: int ... 10 more fields]

In [7]:
// label columns
val colLabel = "label"

// categorical columns
val colCat = "Sex"

val ticketcol = "Ticket"

val namecol = "Name"

val cabinfilter = "Cabin"

val embarkedfilter = "Embarked"

// numerical columns
val colNum = renamedDF.columns.filter(_ != colLabel).filter(_ != colCat).filter(_ != ticketcol).filter(_ != namecol).filter(_ != embarkedfilter).filter(_ != cabinfilter)

colLabel = label
colCat = Sex
ticketcol = Ticket
namecol = Name
cabinfilter = Cabin
embarkedfilter = Embarked
colNum = Array(PassengerId, Pclass, Age, SibSp, Parch, Fare)


Array(PassengerId, Pclass, Age, SibSp, Parch, Fare)

In [8]:
for (c <- colNum) {
    val count_missing_values = renamedDF.filter(s"${c} is null").count()
    println(s"Number of missing values in column ${c}: ${count_missing_values}")
}

Number of missing values in column PassengerId: 0
Number of missing values in column Pclass: 0
Number of missing values in column Age: 0
Number of missing values in column SibSp: 0
Number of missing values in column Parch: 0
Number of missing values in column Fare: 0


In [9]:
import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler}

val va = new VectorAssembler()  
    .setInputCols(colNum)
    .setOutputCol("features")
val featuredHousing = va.transform(renamedDF)



val scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaledFeatures")
    .setWithStd(true)
    .setWithMean(true)
val scaledHousing = scaler.fit(featuredHousing).transform(featuredHousing)

scaledHousing.show(5)

+-----------+-----+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+--------------------+
|PassengerId|label|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|            features|      scaledFeatures|
+-----------+-----+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+--------------------+
|          1|    0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|[1.0,3.0,22.0,1.0...|[-1.7291368044688...|
|          2|    1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|[2.0,1.0,38.0,1.0...|[-1.7252511037846...|
|          3|    1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|[3.0,3.0,26.0,0.0...|[-1.7213654031004...|
|          4|    1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|     

va = vecAssembler_63cb1c13ccf4
featuredHousing = [PassengerId: int, label: int ... 11 more fields]
scaler = stdScal_70334163b856
scaledHousing = [PassengerId: int, label: int ... 12 more fields]


[PassengerId: int, label: int ... 12 more fields]

In [10]:
import org.apache.spark.ml.feature.OneHotEncoderEstimator

import org.apache.spark.ml.feature.StringIndexer

import org.apache.spark.ml.feature.VectorAssembler

import org.apache.spark.ml.{Pipeline, PipelineModel}

import org.apache.spark.ml.PipelineStage

import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.evaluation.RegressionEvaluator

In [11]:
import org.apache.spark.ml.feature.StringIndexer


val sexIndexer = new StringIndexer().setInputCol("Sex").setOutputCol("Sex_index")

val ticketIndexer = new StringIndexer().setInputCol("Ticket").setOutputCol("Ticket_index")

val nameIndexer = new StringIndexer().setInputCol("Name").setOutputCol("Name_index")



sexIndexer = strIdx_6c2507d9e49e
ticketIndexer = strIdx_ba2a161078da
nameIndexer = strIdx_b8b8f382256c


strIdx_b8b8f382256c

In [12]:

import org.apache.spark.ml.feature.OneHotEncoderEstimator

val sex_encoder = new OneHotEncoderEstimator()
    .setInputCols(Array("Sex_index"))
    .setOutputCols(Array("Sex_index_Vec"))

val ticket_encoder = new OneHotEncoderEstimator()
    .setInputCols(Array("Ticket_index"))
    .setOutputCols(Array("Ticket_index_Vec"))

val name_encoder = new OneHotEncoderEstimator()
    .setInputCols(Array("Name_index"))
    .setOutputCols(Array("Name_index_Vec"))


import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}

val numPipeline = new Pipeline().setStages(Array(va,scaler))
val catPipeline = new Pipeline().setStages(Array(sexIndexer,ticketIndexer, nameIndexer, sex_encoder, ticket_encoder, name_encoder))

val pipeline = new Pipeline().setStages(Array(numPipeline, catPipeline))
val newHousing = pipeline.fit(renamedDF).transform(renamedDF)

newHousing.show(5)


+-----------+-----+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+--------------------+---------+------------+----------+-------------+-----------------+-----------------+
|PassengerId|label|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|            features|      scaledFeatures|Sex_index|Ticket_index|Name_index|Sex_index_Vec| Ticket_index_Vec|   Name_index_Vec|
+-----------+-----+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+--------------------+--------------------+---------+------------+----------+-------------+-----------------+-----------------+
|          1|    0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|[1.0,3.0,22.0,1.0...|[-1.7291368044688...|      0.0|       257.0|     329.0|(1,[0],[1.0])|(680,[257],[1.0])|(890,[329],[1.0])|
|          2|    1|     1|Cumings, Mrs. Joh...|f

sex_encoder = oneHotEncoder_2df93762975e
ticket_encoder = oneHotEncoder_20b6b78d8081
name_encoder = oneHotEncoder_af15d5cb46ce
numPipeline = pipeline_a460ed5e43dd
catPipeline = pipeline_7760b55d86ef
pipeline = pipeline_b18b6732aea7
newHousing = [PassengerId: int, label: int ... 18 more fields]


[PassengerId: int, label: int ... 18 more fields]

In [13]:
//columns for training

val newHousing2 = newHousing.drop("features")
val va2 = new VectorAssembler().setInputCols(Array("scaledFeatures","Sex_index_Vec","Ticket_index_Vec","Name_index_Vec")).setOutputCol("features")
val dataset = va2.transform(newHousing2).select("features", "label")

dataset.show(5)

val Array(trainSet, testSet) = dataset.randomSplit(Array(0.8, 0.2))

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(1577,[0,1,2,3,4,...|    0|
|(1577,[0,1,2,3,4,...|    1|
|(1577,[0,1,2,3,4,...|    1|
|(1577,[0,1,2,3,4,...|    1|
|(1577,[0,1,2,3,4,...|    0|
+--------------------+-----+
only showing top 5 rows



newHousing2 = [PassengerId: int, label: int ... 17 more fields]
va2 = vecAssembler_23ea40810019
dataset = [features: vector, label: int]
trainSet = [features: vector, label: int]
testSet = [features: vector, label: int]


[features: vector, label: int]

In [14]:
val rf = new RandomForestRegressor()
    .setLabelCol("label")
    .setFeaturesCol("features")

// train the model
val rfModel = rf.fit(trainSet)

// make predictions on the test data
val predictions = rfModel.transform(testSet)

predictions.select("prediction", "label", "features").show(5)

// select (prediction, true label) and compute test error
val evaluator = new RegressionEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("rmse")

val rmse = evaluator.evaluate(predictions)
println(s"Root Mean Squared Error (RMSE) on test data = $rmse")

+-------------------+-----+--------------------+
|         prediction|label|            features|
+-------------------+-----+--------------------+
| 0.1438779990889574|    0|(1577,[0,1,2,3,4,...|
|0.12102975916817763|    0|(1577,[0,1,2,3,4,...|
|0.39306096595838647|    1|(1577,[0,1,2,3,4,...|
|0.13977975916817761|    0|(1577,[0,1,2,3,4,...|
| 0.1336873856061535|    0|(1577,[0,1,2,3,4,...|
+-------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.3709202213797589


rf = rfr_9b524349e423
rfModel = RandomForestRegressionModel (uid=rfr_9b524349e423) with 20 trees
predictions = [features: vector, label: int ... 1 more field]
evaluator = regEval_73b40f4ac369
rmse = 0.3709202213797589


0.3709202213797589

In [15]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

In [16]:
val rdd = predictions.select("label", "prediction").rdd.map(row ⇒ (row.getInt(0).toDouble, row.getDouble(1)))

rdd = MapPartitionsRDD[165] at map at <console>:42


MapPartitionsRDD[165] at map at <console>:42

In [18]:
// Instantiate metrics object
new MulticlassMetrics(rdd).accuracy

0.0