In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 500)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exercise3")
    .getOrCreate()
)

# Read the CSV using its first row as column names. No schema is supplied,
# so every column is loaded as a string (see the dtypes listing below).
df = spark.read.csv('exampleData.csv', header=True)

df.dtypes

[('_c0', 'string'),
 ('dateTime', 'string'),
 ('indicator_rain', 'string'),
 ('precipitation', 'string'),
 ('indicator_temp', 'string'),
 ('air_temperature', 'string'),
 ('indicator_wetb', 'string'),
 ('wetb', 'string'),
 ('dewpt', 'string'),
 ('vappr', 'string'),
 ('relative_humidity', 'string'),
 ('msl', 'string'),
 ('indicator_wdsp', 'string'),
 ('wind_speed', 'string'),
 ('indicator_wddir', 'string'),
 ('wind_from_direction', 'string')]

In [3]:
from pyspark.ml.feature import StringIndexer

# Map every distinct dateTime string to a numeric index stored in "indexedTime".
# NOTE(review): StringIndexer orders labels by frequency by default, so the
# resulting index is not chronological (the first timestamp in the output
# below is mapped to 2667.0) -- confirm this is the intended time feature.
indexer = StringIndexer(inputCol="dateTime", outputCol="indexedTime")
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
indexed.show(vertical=False)

+---+-----------------+--------------+-------------+--------------+---------------+--------------+----+-----+-----+-----------------+------+--------------+----------+---------------+-------------------+-----------+
|_c0|         dateTime|indicator_rain|precipitation|indicator_temp|air_temperature|indicator_wetb|wetb|dewpt|vappr|relative_humidity|   msl|indicator_wdsp|wind_speed|indicator_wddir|wind_from_direction|indexedTime|
+---+-----------------+--------------+-------------+--------------+---------------+--------------+----+-----+-----+-----------------+------+--------------+----------+---------------+-------------------+-----------+
| 14|01-jan-1990 00:00|             0|          0.3|             0|            9.1|             0| 9.0|  8.9| 11.4|               99|1006.7|             2|         7|              2|                190|     2667.0|
| 15|01-jan-1990 01:00|             0|          0.2|             0|            8.2|             0| 7.4|  6.4|  9.7|               89|1006.7|

In [4]:
 #Change the data types of parameters to correct ones to vectorize
# One SQL cast expression per column kept for modelling.
# air_temperature is cast to float and renamed to "label" (the regression target).
cast_expressions = [
    "cast(dateTime as string)dateTime",
    "cast(relative_humidity as float)relative_humidity",
    "cast(wind_speed as float) wind_speed",
    "cast(indexedTime as double)indexedTime",
    "cast(air_temperature as float) label",
    "cast(msl as float) msl",
]
parsed_data = indexed.selectExpr(*cast_expressions)

In [5]:
from pyspark.ml.feature import VectorAssembler

# Columns combined into the single "features" vector consumed by the regressors.
feature_columns = ['relative_humidity', 'wind_speed', 'indexedTime', 'msl']

# handleInvalid="skip" filters out rows whose inputs contain invalid values
# instead of raising an error.
vectorAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
    handleInvalid="skip",
)
vectorized_df = vectorAssembler.transform(parsed_data)

# Keep only what the model needs: the assembled vector and the target.
dataset = vectorized_df.select("features", "label")


In [6]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the partition the same from run to run, so the metrics
# computed downstream can be compared between executions of the notebook.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=42)


In [7]:
# Baseline: fit a gradient-boosted tree regressor directly on the assembled
# "features" vector column.
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol="features", maxIter=30, maxDepth = 11)
# Train the model on the training split. No indexer or pipeline stage is
# involved in this cell; the regressor is fitted on its own.
model = gbt.fit(trainingData)
# Make predictions on the test split.
prediction = model.transform(testData)

In [8]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Fit a VectorIndexer on the assembled "features" column; its output column
# "indexedFeatures" is what the regressor below trains on.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    handleInvalid="skip",
).fit(dataset)

# Split the data into training and test sets (30% held out for testing).
# The split must come from `dataset`: it carries the "features" vector column
# the indexer stage reads, whereas `parsed_data` only holds the raw scalar
# columns and would make the pipeline fail. The seed makes the split repeatable.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=42)

# Train a GBT model on the indexed features.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=30, maxDepth=11)

# Chain indexer and GBT in a Pipeline.
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train model. This also runs the indexer.
model_gbt = pipeline.fit(trainingData)

# Make predictions.
preds = model_gbt.transform(testData)

# Compare predictions against the true label using R squared.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

# The metric is R squared, so name the variable accordingly (it was `rmse`).
r2 = evaluator.evaluate(preds)
print("R Squared on test data = %g" % r2)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 480, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.3-src.zip/py4j/clientserver.py", line 503, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3369, in run_code
    exec(code_obj, self.user_global_ns, self.

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# Mean absolute error of the pipeline predictions on the test split.
evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="label",
    predictionCol="prediction",
)
mae = evaluator.evaluate(preds)
print("MAE on test data = %g" % mae)

# Root mean squared error of the same predictions.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(preds)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


In [None]:
import seaborn
# NOTE(review): nothing from scipy.stats is used in this cell; the star import
# is kept only in case later cells rely on the names it brings in.
from scipy.stats import *

# Collect the predictions to pandas for plotting.
pred_pd = preds.toPandas()

# Configure seaborn in ONE call, before the figure exists. Each seaborn.set()
# call re-applies defaults for the arguments it is not given, so the original
# sequence of three calls discarded style="whitegrid" and font_scale=1.8, and
# the figure size was set only after the figure had already been created.
seaborn.set(
    style="whitegrid",
    font_scale=1.8,
    color_codes=True,
    rc={'figure.figsize': (20, 10)},
)

# Pass the size explicitly so this figure does not depend on global rc state.
fig, ax = plt.subplots(figsize=(20, 10))

# Scatter of true label vs. prediction (no fitted line) ...
seaborn.regplot(x="label", y="prediction", data=pred_pd, fit_reg=False, ax=ax,
                scatter_kws={"color": "b"})
# ... overlaid with the regression line only, drawn on the same axes.
seaborn.regplot(x="label", y="prediction", data=pred_pd, scatter=False, ax=ax,
                line_kws={"color": "red"})

# Title and axis labels so the figure can be read on its own.
ax.set(title="GBT predictions vs. true label (test data)",
       xlabel="label", ylabel="prediction");
