### Sección Spark
#### Daniel Sharp 138176
Ejecutamos el grid search en Spark.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import numpy as np
# Obtain the Spark session (reusing one if it already exists), running in
# local mode with as many worker threads as there are available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Carga de los datos como fueron procesados en el notebook de Dask

In [2]:
# Read the trips partition from CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
trips_df = spark.read.csv(
    '/home/jovyan/work/0.part',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview: convert only the first ten rows to a pandas DataFrame for display.
trips_df.limit(10).toPandas()

Unnamed: 0,_c0,fare_amount,passenger_count,trip_distance,dow_0,dow_1,dow_3,dow_4,dow_6,dow_2,hour_buck_4,hour_buck_3,hour_buck_2,car_type_B,target
0,0,22.0,1,6.9,0,0,0,0,0,0,0,0,0,0,0.209091
1,1,9.0,1,1.81,1,0,0,0,0,0,1,0,0,0,0.0
2,2,7.5,1,0.96,0,1,0,0,0,0,0,1,0,0,0.133333
3,3,8.5,1,1.9,0,0,1,0,0,0,0,0,1,0,0.117647
4,4,7.5,1,1.0,0,0,1,0,0,0,0,0,1,0,0.221333
5,5,9.5,5,1.71,0,0,0,0,0,0,1,0,0,0,0.157895
6,6,8.0,1,1.27,0,0,1,0,0,0,0,0,1,0,0.1875
7,7,7.5,4,1.55,0,0,0,1,0,0,0,0,0,0,0.213333
8,8,6.0,5,0.54,0,1,0,0,0,0,1,0,0,0,0.26
9,9,52.0,1,15.38,0,1,0,0,0,0,1,0,0,0,0.576923


In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import RFormula

In [5]:
# Remove `_c0` before modelling. In the preview it simply mirrors the row
# number, so it looks like an index carried over from the CSV export
# (NOTE(review): confirm) and should not be used as a predictor.
trips_df = trips_df.drop('_c0')

Se crean las columnas `label` y `features`, tal como las requieren los modelos de Spark.

In [6]:
# R-style model formula: `target` is the response and `.` stands for every
# other column of the DataFrame, which are used as predictors.
formula = RFormula(formula = "target ~ .")

In [7]:
# Fit the formula on the trips data, then apply the fitted transformer to the
# same data to append the `features` vector column and the `label` column.
formula_model = formula.fit(trips_df)
df = formula_model.transform(trips_df)

In [8]:
# Preview the first ten transformed rows, including the `features` and
# `label` columns appended by the formula.
df.limit(10).toPandas()

Unnamed: 0,fare_amount,passenger_count,trip_distance,dow_0,dow_1,dow_3,dow_4,dow_6,dow_2,hour_buck_4,hour_buck_3,hour_buck_2,car_type_B,target,features,label
0,22.0,1,6.9,0,0,0,0,0,0,0,0,0,0,0.209091,"(22.0, 1.0, 6.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.209091
1,9.0,1,1.81,1,0,0,0,0,0,1,0,0,0,0.0,"(9.0, 1.0, 1.81, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0
2,7.5,1,0.96,0,1,0,0,0,0,0,1,0,0,0.133333,"(7.5, 1.0, 0.96, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",0.133333
3,8.5,1,1.9,0,0,1,0,0,0,0,0,1,0,0.117647,"(8.5, 1.0, 1.9, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.117647
4,7.5,1,1.0,0,0,1,0,0,0,0,0,1,0,0.221333,"(7.5, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.221333
5,9.5,5,1.71,0,0,0,0,0,0,1,0,0,0,0.157895,"(9.5, 5.0, 1.71, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.157895
6,8.0,1,1.27,0,0,1,0,0,0,0,0,1,0,0.1875,"(8.0, 1.0, 1.27, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,...",0.1875
7,7.5,4,1.55,0,0,0,1,0,0,0,0,0,0,0.213333,"(7.5, 4.0, 1.55, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...",0.213333
8,6.0,5,0.54,0,1,0,0,0,0,1,0,0,0,0.26,"(6.0, 5.0, 0.54, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",0.26
9,52.0,1,15.38,0,1,0,0,0,0,1,0,0,0,0.576923,"(52.0, 1.0, 15.38, 0.0, 1.0, 0.0, 0.0, 0.0, 0....",0.576923


Se definen el estimador y el grid de parámetros que se utilizarán:

In [9]:
# Linear regression with elasticNetParam=1.0, i.e. a pure L1 (lasso) penalty.
lr = LinearRegression(elasticNetParam=1.0)

# Candidate settings: 3 iteration caps x 4 regularization strengths = 12
# parameter combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [50, 100, 200])
    .addGrid(lr.regParam, [0.01, 0.001, 0.1, 1.0])
    .build()
)

# Candidates are scored by root-mean-squared error (lower is better).
evaluator = RegressionEvaluator(metricName='rmse')

Se define el grid search con cross-validation

In [10]:
# 10-fold cross-validated grid search over `paramGrid`, fitting up to four
# candidate models concurrently.
# The fixed seed makes the random assignment of rows to folds repeatable, so
# the averaged metrics and the selected parameters can be compared between
# runs instead of changing on every execution.
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=10,
    parallelism=4,
    seed=42,
)

In [11]:
%%time
# Run the cross-validated grid search on the transformed data; `%%time`
# reports how long the whole fit takes.
cvModel = crossval.fit(df)

CPU times: user 5.49 s, sys: 2.63 s, total: 8.12 s
Wall time: 22.4 s


In [12]:
# The metric is RMSE, so the best parameter combination is the one whose
# average cross-validation metric is the smallest.
best_idx = int(np.argmin(cvModel.avgMetrics))
cvModel.getEstimatorParamMaps()[best_idx]

{Param(parent='LinearRegression_463bb8be94a4199c8ceb', name='maxIter', doc='max number of iterations (>= 0).'): 50,
 Param(parent='LinearRegression_463bb8be94a4199c8ceb', name='regParam', doc='regularization parameter (>= 0).'): 0.001}

In [13]:
# Smallest average cross-validation metric (RMSE) across the parameter grid.
np.min(cvModel.avgMetrics)

0.12711482604725943