In [0]:
import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
import org.apache.spark.ml.evaluation.{RegressionEvaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

In [1]:

// Explicit column layout for the taxi data: one (column name, Spark type) pair per
// field, turned into StructFields with the default nullability.
val schema = StructType(
  Seq[(String, DataType)](
    "vendor_id" -> FloatType,
    "passenger_count" -> FloatType,
    "trip_distance" -> FloatType,
    "pickup_longitude" -> FloatType,
    "pickup_latitude" -> FloatType,
    "rate_code" -> FloatType,
    "store_and_fwd" -> FloatType,
    "dropoff_longitude" -> FloatType,
    "dropoff_latitude" -> FloatType,
    "fare_amount" -> FloatType,
    "trip_time" -> LongType,
    "year" -> IntegerType,
    "month" -> IntegerType,
    "day" -> FloatType,
    "day_of_week" -> FloatType,
    "is_weekend" -> FloatType,
    "hour" -> FloatType,
    "h_distance" -> DoubleType
  ).map { case (name, dataType) => StructField(name, dataType) }
)

In [2]:

import ml.dmlc.xgboost4j.scala.spark.rapids.CrossValidator


In [3]:

// Alternative input kept for reference: a CSV file read with the explicit `schema`.
//var df = spark.read.option("inferSchema", "false").option("header", true).schema(schema).csv("s3://bucketcjm/data/taxismall.csv") 
// Load the trip data from Parquet. Declared as a val because it is never reassigned.
val df = spark.read.parquet("s3://bucketcjm/data/pyellow_tripdata_2016-02")

In [4]:

// Preview the loaded rows.
df.show()

In [5]:

/**
 * Removes the columns this notebook does not use.
 *
 * @param dataFrame input trips
 * @return `dataFrame` without `year`, `passenger_count`, `month`, `vendor_id`,
 *         `is_weekend`, `trip_distance` and `store_and_fwd`
 */
def dropUseless(dataFrame: DataFrame): DataFrame = {
  val unusedColumns = Seq(
    "year",
    "passenger_count",
    "month",
    "vendor_id",
    "is_weekend",
    "trip_distance",
    "store_and_fwd")
  dataFrame.drop(unusedColumns: _*)
}


In [6]:
// Trimmed working DataFrame. Declared as a val because it is never reassigned.
val tdf = dropUseless(df)

In [7]:
// Cache the trimmed data and register it as the "taxi" view used by the %%sql cells.
tdf.cache()
tdf.createOrReplaceTempView("taxi")
spark.catalog.cacheTable("taxi")

In [8]:
// Summary statistics for distance, fare and trip time.
tdf.describe("h_distance", "fare_amount", "trip_time").show()

In [9]:
// Summary statistics for distance, fare and trip time (same query as the previous cell).
tdf.describe("h_distance", "fare_amount", "trip_time").show()

In [10]:
%%sql
-- Average fare for each value of hour, ordered by hour.
SELECT hour, avg(fare_amount)
FROM taxi
GROUP BY hour
ORDER BY hour

In [11]:
// Mean fare and mean h_distance per hour, ordered by hour; prints up to 24 rows.
tdf.groupBy("hour").
  avg("fare_amount", "h_distance").
  orderBy("hour").
  show(24)

In [12]:
%%sql
-- Every trip_time value in the taxi view.
SELECT trip_time
FROM taxi

In [13]:
// Correlation between fare and distance. Declared as a val because it is never reassigned.
val cor = tdf.stat.corr("fare_amount", "h_distance")


In [14]:
// Correlation between fare and hour, computed as an aggregate column.
tdf.select(corr(col("fare_amount"), col("hour"))).show()

In [15]:
// Mean fare and mean h_distance per hour, ordered by hour; prints only the first 4 rows.
tdf.groupBy("hour").
  avg("fare_amount", "h_distance").
  orderBy("hour").
  show(4)

In [16]:
%%sql
-- Average fare for each value of hour, ordered by hour.
SELECT hour, avg(fare_amount)
FROM taxi
GROUP BY hour
ORDER BY hour

In [17]:
%%sql
-- Average fare and average h_distance for each day_of_week value.
SELECT day_of_week, avg(fare_amount), avg(h_distance)
FROM taxi
GROUP BY day_of_week
ORDER BY day_of_week

In [18]:
%%sql
-- Average fare and average h_distance for each value of hour.
SELECT hour, avg(fare_amount), avg(h_distance)
FROM taxi
GROUP BY hour
ORDER BY hour


In [19]:
%%sql
-- Average fare for each day_of_week value.
-- The cell magic was split across two lines ("%" then " %sql"); it is rejoined here.
SELECT day_of_week, avg(fare_amount)
FROM taxi
GROUP BY day_of_week
ORDER BY day_of_week

In [20]:
%%sql
-- Raw (h_distance, fare_amount) pairs from the taxi view.
SELECT h_distance, fare_amount
FROM taxi

In [21]:
%%sql
-- Raw (trip_time, fare_amount) pairs from the taxi view.
SELECT trip_time, fare_amount
FROM taxi


In [22]:
// Input columns for the model. Declared as a val because the array is never reassigned.
val featureNames = Array(
  "h_distance",
  "pickup_longitude",
  "pickup_latitude",
  "dropoff_longitude",
  "dropoff_latitude",
  "rate_code",
  "hour",
  "day_of_week",
  "trip_time")


In [23]:
// Seeded 70/30 split of the trimmed data into training and evaluation sets.
val Array(traindf, evaldf) = tdf.randomSplit(Array(0.7, 0.3), 5043L)
traindf.show()

In [24]:
// Column the model learns to predict.
val labelColName = "fare_amount"

// Input columns for the model. Declared as a val because the array is never reassigned.
val featureNames = Array(
  "h_distance",
  "pickup_longitude",
  "pickup_latitude",
  "dropoff_longitude",
  "dropoff_latitude",
  "rate_code",
  "hour",
  "day_of_week",
  "trip_time")

// Base XGBoost settings handed to the regressor's constructor. The values mix
// numbers and strings, so the value type is spelled out as Any.
val regressorParam: Map[String, Any] = Map(
  "learning_rate" -> 0.05,
  "gamma" -> 1,
  "objective" -> "reg:gamma",
  "max_depth" -> 8,
  "subsample" -> 0.8,
  "num_round" -> 100,
  "tree_method" -> "gpu_hist"
)


// Estimator: XGBoost regressor reading the label and the named feature columns.
val regressor = new XGBoostRegressor(regressorParam).
  setLabelCol(labelColName).
  setFeaturesCols(featureNames)

// Candidate grid: maxDepth in {3, 8} combined with eta in {0.2, 0.6}.
val paramGrid = new ParamGridBuilder().
  addGrid(regressor.maxDepth, Array(3, 8)).
  addGrid(regressor.eta, Array(0.2, 0.6)).
  build()

// Scores each candidate against the label column.
val evaluator = new RegressionEvaluator().setLabelCol(labelColName)

// 3-fold cross-validation of the regressor over the grid.
val cv = new CrossValidator().
  setEstimator(regressor).
  setEvaluator(evaluator).
  setEstimatorParamMaps(paramGrid).
  setNumFolds(3)

regressor.explainParams()



In [25]:
// Run the cross-validated parameter search on the training split.
val cvmodel = cv.fit(traindf)

In [26]:
// Take the winning model; the cast relies on the cross-validator's estimator being an XGBoostRegressor.
val model = cvmodel.bestModel.asInstanceOf[XGBoostRegressionModel]
// Pair each candidate parameter map with its cross-validation metric.
// NOTE(review): the result is not bound to a name; whether it is displayed depends on the kernel.
cvmodel.getEstimatorParamMaps.zip(cvmodel.avgMetrics)
// Parameters set on the selected model.
model.extractParamMap()

In [27]:

// Score the evaluation split and keep the predictions cached for the cells that follow.
val pdf = model.transform(evaldf).cache()
// Compare the distribution of actual fares with the predicted ones.
pdf.describe("fare_amount", "prediction").show()

In [28]:

// Fresh evaluator left on its default metric; the result is reported below as RMSE.
val evaluator = new RegressionEvaluator().
  setLabelCol(labelColName)
val rmse = evaluator.evaluate(pdf)
println(s"RMSE is ${rmse}")

In [29]:

// R-squared on the evaluation predictions. A dedicated evaluator is used so the shared
// `evaluator` keeps its current metric instead of being switched to "r2" in place.
val r2 = new RegressionEvaluator().
  setLabelCol(labelColName).
  setMetricName("r2").
  evaluate(pdf)

In [30]:

// Baseline: how well does "always predict the mean fare" do on the same rows?
// NOTE(review): the mean is taken over all of tdf, which also contains the rows that
// went into evaldf; confirm whether it should be computed from traindf only.
val avgFare = tdf.select(avg("fare_amount")).first().getDouble(0)

// Predictions plus the per-row error and the constant baseline prediction.
// Built as a single val chain instead of reassigning a var.
val predictions = pdf.
  withColumn("error", $"prediction" - $"fare_amount").
  withColumn("avgPrediction", lit(avgFare))

// RMSE of the constant baseline against the actual fares.
val regressionMeanEvaluator = new RegressionEvaluator().
  setPredictionCol("avgPrediction").
  setLabelCol("fare_amount").
  setMetricName("rmse")

val rmsep = regressionMeanEvaluator.evaluate(predictions)

In [31]:
// Point the "taxi" view at the scored evaluation data so the SQL cells below can read `prediction`.
val df = pdf
df.cache()
df.createOrReplaceTempView("taxi")


In [32]:
// Preview the scored rows.
df.show()

In [33]:
%%sql
-- Average fare for each distinct h_distance value, ordered by distance.
-- No trailing semicolon: it is not needed for a single statement, and some
-- Spark versions reject it when the text is passed to spark.sql.
SELECT h_distance, avg(fare_amount)
FROM taxi
GROUP BY h_distance
ORDER BY h_distance

In [34]:
%%sql
-- Actual fare next to the predicted fare for every scored row.
-- The cell magic was split across two lines ("%" then "%sql"); it is rejoined here,
-- and the trailing semicolon is dropped because some Spark versions reject it.
SELECT h_distance, fare_amount, prediction
FROM taxi