In [0]:
// Load the prepared dataset. Parquet files embed their own schema, so the
// CSV-only "header" reader option is not needed here (the Parquet source ignores it).
val df = spark.read.parquet("/user/qz2166_nyu_edu/tlc_trip_data/yellow_taxi_ml_clean.parquet")

// Random 80/20 split into training and test sets, with a fixed seed for the sampler.
val Array(trainDf, testDf) = df.randomSplit(Array(0.8, 0.2), seed = 64)

In [1]:
import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

// Columns whose dtype string is exactly "StringType" are treated as categorical.
val categoricalCols = trainDf.dtypes.collect { case (colName, "StringType") => colName }

// Output column names for the two encoding stages: <col>_index, then <col>_OHE.
val indexOutputCols = categoricalCols.map(colName => s"${colName}_index")
val oheOutputCols = categoricalCols.map(colName => s"${colName}_OHE")

// Stage 1: map each categorical string column to a numeric label index.
// handleInvalid = "skip" filters out rows with invalid data (unseen labels or
// nulls) instead of raising an error.
val stringIndexer = new StringIndexer()
  .setHandleInvalid("skip")
  .setInputCols(categoricalCols)
  .setOutputCols(indexOutputCols)

// Stage 2: one-hot encode the index columns produced by the indexer.
val oheEncoder = new OneHotEncoder()
  .setOutputCols(oheOutputCols)
  .setInputCols(indexOutputCols)

In [2]:
import org.apache.spark.ml.feature.VectorAssembler

// Every non-string column except the label ("tip_amount") is used as a feature.
// NOTE(review): "non-string" is broader than "numeric" -- this assumes the dataset
// contains no other unsupported column types (e.g. timestamps); confirm against the schema.
val numericCols = trainDf.dtypes.collect {
  case (field, dataType) if dataType != "StringType" && field != "tip_amount" => field
}

// Feature vector layout: one-hot encoded categoricals first, then the remaining columns.
val assemblerInputs = oheOutputCols ++ numericCols

// Combine all input columns into a single vector column named "features".
val vecAssembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(assemblerInputs)

In [3]:
import org.apache.spark.ml.regression.LinearRegression

// Linear regression estimator: predicts "tip_amount" from the assembled "features" vector.
// All other hyperparameters are left at their library defaults.
val lr = new LinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("tip_amount")

In [4]:
import org.apache.spark.ml.Pipeline

// Chain the stages in execution order: index -> one-hot encode -> assemble -> regress.
val pipeline = new Pipeline().setStages(Array(stringIndexer, oheEncoder, vecAssembler, lr))

// Fit every stage on the training split only, then score the held-out test split.
val pipelineModel = pipeline.fit(trainDf)
val predDF = pipelineModel.transform(testDf)

// Display the feature vector, the actual label and the model's prediction side by side.
z.show(
  predDF.select("features", "tip_amount", "prediction")
)

In [5]:
import org.apache.spark.ml.evaluation.RegressionEvaluator

// Evaluator comparing the "prediction" column against the "tip_amount" label,
// configured to report root-mean-squared error.
val regressionEvaluator = new RegressionEvaluator()
  .setPredictionCol("prediction")
  .setLabelCol("tip_amount")
  .setMetricName("rmse")

val rmse = regressionEvaluator.evaluate(predDF)
println(f"RMSE is $rmse%1.2f")

// Compute R2 on a copy of the evaluator. Calling setMetricName directly on
// `regressionEvaluator` would mutate it in place, so any later use of the
// shared instance would silently report R2 instead of the RMSE it was built for.
val r2 = regressionEvaluator
  .copy(org.apache.spark.ml.param.ParamMap.empty)
  .setMetricName("r2")
  .evaluate(predDF)
println(f"R2 is $r2%1.2f")