In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import sys
import seaborn
from scipy.stats import *
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, IndexToString
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [None]:
# Start (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Exercise3")
    .getOrCreate()
)

In [None]:
df = spark.read.csv('exampleData.csv', header=True)

In [None]:
df.schema

In [None]:
df.dtypes

In [None]:
df.show(1, vertical=True)

### String Indexer Example

In [None]:
indexer = StringIndexer(inputCol="dateTime" ,outputCol="indexedTime")

In [None]:
indexed = indexer.fit(df).transform(df) # return a df

In [None]:
indexed.show(2, vertical=True) 

In [None]:
indexed.dtypes

### Task - Predict air_temp from date, humidity, wind speed and msl

In [None]:
# Cast the columns used for modelling to explicit types.
# The target column air_temperature is exposed under the name "label".
cast_exprs = [
    "cast(dateTime as string) dateTime",
    "cast(relative_humidity as float) relative_humidity",
    "cast(wind_speed as float) wind_speed",
    "cast(indexedTime as double) indexedTime",
    "cast(msl as float) msl",
    "cast(air_temperature as float) label",
]
parsed_df = indexed.selectExpr(*cast_exprs)  # DataFrame with typed columns

In [None]:
# Put all discreted data/features into 1D vector (as 1 column in the df)
vectorAssembler = VectorAssembler(inputCols = ['relative_humidity', 'wind_speed', 'indexedTime', 'msl'], handleInvalid="skip",outputCol ='features')

In [None]:
vectorized_df = vectorAssembler.transform(parsed_df)

In [None]:
vectorized_df.show(1)

In [None]:
dataset = vectorized_df.select("features", "label") # remove unnecessary cols

In [None]:
dataset.show(1)

### Model - Gradient-Boosted Trees

In [None]:
# Split data
(trainData, testData) = dataset.randomSplit([0.8, 0.2])

In [None]:
print("Train sample: {}".format(trainData.count()))
print("Test sample: {}".format(testData.count()))

In [None]:
testData.show(1)

In [None]:
#### DIRECT FIT (NO PIPELINE)

In [None]:
gbt = GBTRegressor(featuresCol="features", maxIter=30, maxDepth = 11)
s = time.time()
model_direct = gbt.fit(trainData)
print(time.time() - s)

In [None]:
pred_direct = model_direct.transform(testData)

In [None]:
pred_direct.show(5)

In [None]:
#### WITH PIPELINE

In [None]:
# Feature indexing (if needed): VectorIndexer detects categorical entries of
# the "features" vector and re-indexes them into "indexedFeatures".
# Fitted on the full dataset (train and test rows).
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    handleInvalid="skip",
).fit(dataset)

# Label indexing (if needed): map each distinct label value to an index.
# Fitted on the full dataset as well.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(dataset)

# Inverse of the label indexing: turn a predicted index back into the
# original label value (only meaningful when the labels were indexed).
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

In [None]:
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=30, maxDepth = 11)

In [None]:
# No idea what the param meaning?
# gbt.explainParam("maxIter") # explain specific param
# gbt.explainParams() # explain all params

In [None]:
pipeline = Pipeline(stages=[featureIndexer, gbt])
# pipeline = Pipeline(stages=[featureIndexer, labelIndexer, gbt])
# pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])

In [None]:
s = time.time()
model_gbt = pipeline.fit(trainData)
print(time.time() - s)

In [None]:
pred = model_gbt.transform(testData) # return a df

In [None]:
pred.show(5)

### Model - Random Forest

In [None]:
rforest = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=40, maxDepth=7)

In [None]:
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rforest, labelConverter])

In [None]:
s = time.time()
model_forest = pipeline.fit(trainData)
print(time.time() - s)

In [None]:
pred_forest = model_forest.transform(testData).select("features", "label", "predictedLabel")

In [None]:
pred_forest.show(10)

### Evaluation metrics

In [None]:
rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse") # root mean square err

In [None]:
rmse_result = rmse.evaluate(pred)

In [None]:
rmse_result

In [None]:
rsquare = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2") # r-squared

In [None]:
rsquare_result = rsquare.evaluate(pred)

In [None]:
rsquare_result

In [None]:
mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae") # mean absolute err

In [None]:
mae_result = mae.evaluate(pred)

In [None]:
mae_result

### Plotting

In [None]:
pred_pd = pred.toPandas()

In [None]:
seaborn.set(style="whitegrid", font_scale = 1.9)

In [None]:
# Using Seaborn
fig, ax = plt.subplots()
seaborn.set(color_codes=True)
seaborn.set(rc={'figure.figsize':(15, 10)})
seaborn.regplot(x="label", y="prediction", fit_reg=False, ax=ax,data= pred_pd,scatter_kws={"color": "green"});
seaborn.regplot(x="label", y="prediction",scatter=False, ax=ax, data= pred_pd, line_kws={"color": "red"});

In [None]:
# Using matplotlib: the same actual-vs-predicted view built with the
# object-oriented API only.
fig_, ax_ = plt.subplots(figsize=(15, 10))
ax_.scatter(pred_pd.label, pred_pd.prediction, color='green', marker='o')
# Least-squares straight line through (label, prediction).
m, b = np.polyfit(pred_pd.label, pred_pd.prediction, 1)
# A straight line is fully defined by its two end points; drawing it through
# every unsorted sample would only make the line retrace itself.
line_x = np.array([pred_pd.label.min(), pred_pd.label.max()])
ax_.plot(line_x, m * line_x + b, color='red')
ax_.set_xlabel('label', fontsize=20)
ax_.set_ylabel('prediction', fontsize=20)
ax_.set_title('GBT pipeline: predicted vs. actual air temperature', fontsize=20)
# Size the tick labels on this Axes explicitly instead of relying on
# pyplot's notion of the "current" axes.
ax_.tick_params(axis='both', labelsize=15)
plt.show()