# Model Building

#### Starting the Spark Session

In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName('Project 1 model')
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', True)
    .config('spark.executor.memory', '4g')
    .config('spark.driver.memory', '5g')
    .config('spark.sql.session.timeZone', 'Etc/UTC')
    .getOrCreate()
)

22/08/22 07:53:47 WARN Utils: Your hostname, DESKTOP-AKL6QQR resolves to a loopback address: 127.0.1.1; using 192.168.86.177 instead (on interface eth0)
22/08/22 07:53:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/22 07:53:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Read in the Data

In [2]:
full_sdf = spark.read.parquet('../data/curated/processed_data2')

                                                                                

#### Prepare Data for Modelling

In [3]:
# indexing PULocationID
from pyspark.ml.feature import (StringIndexer, OneHotEncoder, VectorAssembler)
indexer = StringIndexer(inputCol='PULocationID', outputCol='PU_num')
index_sdf = indexer.fit(full_sdf).transform(full_sdf)

# indexing DOLocationID
indexer = StringIndexer(inputCol='DOLocationID', outputCol='DO_num')
index_sdf = indexer.fit(index_sdf).transform(index_sdf)

                                                                                

In [4]:
# one hot encoding PULocationID
encoder = OneHotEncoder(inputCol = 'PU_num', outputCol = 'PU_vec')
onehot_sdf = encoder.fit(index_sdf).transform(index_sdf)

# one hot encoding PULocationID
encoder = OneHotEncoder(inputCol = 'DO_num', outputCol = 'DO_vec')
onehot_sdf = encoder.fit(onehot_sdf).transform(onehot_sdf)

In [5]:
# VectorAssembler creates new vectors from existing columns
from pyspark.ml.feature import VectorAssembler

features = 'features'
input_cols = ['trip_distance', 'PU_vec', 'DO_vec', 'tolls_amount', 'time_delta_mins']

assembler = VectorAssembler(inputCols=input_cols, outputCol=features)

assemble_sdf = assembler.transform(onehot_sdf.dropna('any'))


In [6]:
# drop unnecessary features
model_sdf = assemble_sdf.select(['tip_amount', 'features'])
model_sdf = model_sdf.withColumnRenamed('tip_amount', 'label')
model_sdf.show(5)

[Stage 7:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.7|(522,[0,21,260,52...|
| 1.26|(522,[0,4,278,521...|
| 1.59|(522,[0,35,260,52...|
| 1.56|(522,[0,4,271,521...|
| 1.96|(522,[0,4,305,521...|
+-----+--------------------+
only showing top 5 rows



                                                                                

In [7]:
# split data for training and testing
split = model_sdf.randomSplit([0.7, 0.3])
train_sdf = split[0]
test_sdf = split[1]

#### Hyperparameter Tuning of Linear Regression

In [9]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

# adapted from https://spark.apache.org/docs/latest/ml-tuning.html

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
lr = LinearRegression(maxIter=10)
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01, 0.3]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator().setLabelCol('label'),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)
model = tvs.fit(train_sdf)



22/08/22 03:05:49 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/08/22 03:05:49 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS




22/08/22 03:05:50 WARN MemoryStore: Not enough space to cache rdd_26_0 in memory! (computed 151.0 MiB so far)
22/08/22 03:05:51 WARN BlockManager: Persisting block rdd_26_0 to disk instead.
22/08/22 03:05:51 WARN MemoryStore: Not enough space to cache rdd_26_1 in memory! (computed 236.5 MiB so far)
22/08/22 03:05:51 WARN BlockManager: Persisting block rdd_26_1 to disk instead.
22/08/22 03:05:58 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 151.1 MiB so far)
22/08/22 03:05:58 WARN BlockManager: Persisting block rdd_26_3 to disk instead.




22/08/22 03:05:58 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 236.7 MiB so far)
22/08/22 03:05:58 WARN BlockManager: Persisting block rdd_26_4 to disk instead.




22/08/22 03:06:01 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 236.7 MiB so far)
22/08/22 03:06:03 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 236.6 MiB so far)


                                                                                

22/08/22 03:06:06 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
22/08/22 03:06:06 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 236.6 MiB so far)
22/08/22 03:06:06 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 236.7 MiB so far)




22/08/22 03:07:33 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 236.7 MiB so far)
22/08/22 03:07:33 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 236.7 MiB so far)
22/08/22 03:07:33 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)


                                                                                

22/08/22 03:07:38 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/08/22 03:07:38 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/08/22 03:07:38 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 236.7 MiB so far)




22/08/22 03:07:39 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:07:39 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:07:48 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:07:48 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:07:48 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:07:48 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:07:53 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:07:53 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:07:53 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:07:53 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:08:02 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:02 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:02 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:02 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:08:07 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:07 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:07 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:07 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)




22/08/22 03:08:15 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:15 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:15 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:15 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:08:19 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:19 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:19 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:19 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)




22/08/22 03:08:28 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:28 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:28 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:28 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:08:33 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:33 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:33 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:33 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:08:41 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:41 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:41 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:41 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:08:46 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:46 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:46 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:46 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)




22/08/22 03:08:53 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:54 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:54 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:54 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)




22/08/22 03:08:59 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:08:59 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:08:59 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:08:59 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)




22/08/22 03:09:06 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:06 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:06 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:06 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:09:11 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:11 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:11 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:11 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)


                                                                                

22/08/22 03:09:18 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:18 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:18 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:18 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:09:23 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:09:23 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:23 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:23 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:09:31 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:31 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:31 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:09:31 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)




22/08/22 03:09:35 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:35 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:35 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:09:35 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:09:43 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:43 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:43 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:43 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:09:48 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:48 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:09:48 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:48 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:09:55 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:09:55 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:55 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:09:55 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)




22/08/22 03:10:01 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:01 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:01 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:01 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:10:09 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:09 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:09 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:09 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)




22/08/22 03:10:13 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:13 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:13 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:13 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:10:21 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:21 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:21 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:21 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)




22/08/22 03:10:26 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:26 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:26 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:26 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)




22/08/22 03:10:34 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:34 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:34 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:34 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)




22/08/22 03:10:38 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:38 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:38 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:38 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:10:45 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:45 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:45 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:46 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)




22/08/22 03:10:51 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:51 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:51 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:51 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)




22/08/22 03:10:58 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:10:58 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)
22/08/22 03:10:58 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:10:58 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)




22/08/22 03:11:03 WARN MemoryStore: Not enough space to cache rdd_26_3 in memory! (computed 364.9 MiB so far)
22/08/22 03:11:03 WARN MemoryStore: Not enough space to cache rdd_26_5 in memory! (computed 365.4 MiB so far)
22/08/22 03:11:03 WARN MemoryStore: Not enough space to cache rdd_26_4 in memory! (computed 364.9 MiB so far)
22/08/22 03:11:03 WARN MemoryStore: Not enough space to cache rdd_26_2 in memory! (computed 364.8 MiB so far)


                                                                                

#### Model Testing of Linear Regression

In [14]:
# RMSE results for all hyperperam tuning tests
list(zip(model.validationMetrics, model.getEstimatorParamMaps()))

[(1.4669672532445595,
  {Param(parent='LinearRegression_54d1588a4616', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
   Param(parent='LinearRegression_54d1588a4616', name='fitIntercept', doc='whether to fit an intercept term.'): False,
   Param(parent='LinearRegression_54d1588a4616', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}),
 (1.5694412658754782,
  {Param(parent='LinearRegression_54d1588a4616', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
   Param(parent='LinearRegression_54d1588a4616', name='fitIntercept', doc='whether to fit an intercept term.'): False,
   Param(parent='LinearRegression_54d1588a4616', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5}),
 (1.5972153785161136,
  {Param(parent='LinearRegression_

In [10]:
best_model = model.bestModel
print('R Squared on the training set is: %f' % best_model.summary.r2)

R Squared on the training set is: 0.835646


In [16]:
print('RMSE on the training set is: %f' % best_model.summary.rootMeanSquaredError)

RMSE on the training set is: 1.465548


In [11]:
best_predictions = best_model.transform(test_sdf)
best_predictions.select('prediction','label','features')

                                                                                

prediction,label,features
1.20872259186389,0.0,"(522,[0,1,260,521..."
1.123984468090041,0.0,"(522,[0,1,260,521..."
1.1041985112631263,0.0,"(522,[0,1,260,521..."
1.129292713702004,0.0,"(522,[0,1,260,521..."
1.0631590731357672,0.0,"(522,[0,1,260,521..."
1.1016277810778743,0.0,"(522,[0,1,260,521..."
1.164826372697051,0.0,"(522,[0,1,260,521..."
1.2197816697572044,0.0,"(522,[0,1,260,521..."
1.466804705099964,0.0,"(522,[0,1,260,521..."
1.2058555613981137,0.0,"(522,[0,1,260,521..."


In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
best_evaluator = RegressionEvaluator(predictionCol='prediction', \
    labelCol='label',metricName='r2')
print('R Squared on test data = %g' % best_evaluator.evaluate(best_predictions))



R Squared on test data = 0.645656


                                                                                

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator
best_evaluator = RegressionEvaluator(predictionCol='prediction', \
    labelCol='label',metricName='rmse')
print('RMSE on test data = %g' % best_evaluator.evaluate(best_predictions))



RMSE on test data = 1.47223


                                                                                

#### Hyperparameter Tuning of Random Forest Regression

In [None]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

rfr = RandomForestRegressor(labelCol='label', featuresCol='features')
paramGrid = ParamGridBuilder()\
  .addGrid(rfr.maxDepth, [2, 10])\
  .addGrid(rfr.maxBins, [5, 20])\
  .build()

tvs2 = TrainValidationSplit(estimator=rfr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator().setLabelCol('label'),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)
model2 = tvs2.fit(train_sdf)

#### Model Testing of Random Forest Regression

In [None]:
best_model2 = model2.bestModel
print('R Squared on the training set is: %f' % best_model2.summary.r2)

In [None]:
print('RMSE on the training set is: %f' % best_model2.summary.rootMeanSquaredError)

In [None]:
best_predictions2 = best_model2.transform(test_sdf)
best_predictions2.select('prediction','label','features')

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
best_evaluator2 = RegressionEvaluator(predictionCol='prediction', \
    labelCol='label',metricName='r2')
print('R Squared on test data = %g' % best_evaluator2.evaluate(best_predictions2))

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
best_evaluator2 = RegressionEvaluator(predictionCol='prediction', \
    labelCol='label',metricName='rmse')
print('RMSE on test data = %g' % best_evaluator2.evaluate(best_predictions2))