# Load Python Packages

In [0]:
import numpy as np 
import pandas as pd 
import os
import pyspark



# Load dataset for weather prediction

In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every DataFrame in this notebook
# (getOrCreate returns the already-running session when there is one).
spark = (
    SparkSession.builder
    .appName('weather_pred')
    .getOrCreate()
)

In [0]:
# Read the Oslo weather CSV: the first row provides the column names and
# Spark infers each column's data type from the values.
oslo_csv_path = '/FileStore/tables/oslo.csv'
df1 = spark.read.csv(oslo_csv_path, inferSchema=True, header=True)

In [0]:
# Print the column names and the data types that were inferred while reading the CSV.
df1.printSchema()

root
 |-- DATE: integer (nullable = true)
 |-- BBQ_weather: boolean (nullable = true)
 |-- OSLO_cloud_cover: integer (nullable = true)
 |-- OSLO_wind_speed: double (nullable = true)
 |-- OSLO_wind_gust: double (nullable = true)
 |-- OSLO_humidity: double (nullable = true)
 |-- OSLO_pressure: double (nullable = true)
 |-- OSLO_global_radiation: double (nullable = true)
 |-- OSLO_precipitation: double (nullable = true)
 |-- OSLO_sunshine: double (nullable = true)
 |-- OSLO_temp_mean: double (nullable = true)
 |-- OSLO_temp_min: double (nullable = true)
 |-- OSLO_temp_max: double (nullable = true)



In [0]:
# List the first 13 column names: DATE, the BBQ_weather label and the
# OSLO_* measurement columns (this file holds Oslo data).
df1.columns[0:13]

Out[5]: ['DATE',
 'BBQ_weather',
 'OSLO_cloud_cover',
 'OSLO_wind_speed',
 'OSLO_wind_gust',
 'OSLO_humidity',
 'OSLO_pressure',
 'OSLO_global_radiation',
 'OSLO_precipitation',
 'OSLO_sunshine',
 'OSLO_temp_mean',
 'OSLO_temp_min',
 'OSLO_temp_max']

In [0]:
import pyspark.sql.functions as F
from functools import reduce

# Convert the Boolean True/False label to an integer 1/0.
# - The lambda must build on its accumulator `df`; referring to the outer `df1`
#   would discard every column converted earlier if `cols` held several names.
# - Casting to int before comparing keeps the cell idempotent: re-running it on
#   the already converted 0/1 column leaves the values unchanged.
# - As before, anything that is not false/0 (including null) is mapped to 1.
cols = ["BBQ_weather"]
df1 = reduce(
    lambda df, c: df.withColumn(c, F.when(F.col(c).cast("int") == 0, 0).otherwise(1)),
    cols,
    df1,
)


In [0]:
# Preview the first rows to check that BBQ_weather now holds 0/1 values.
df1.show()

+--------+-----------+----------------+---------------+--------------+-------------+-------------+---------------------+------------------+-------------+--------------+-------------+-------------+
|    DATE|BBQ_weather|OSLO_cloud_cover|OSLO_wind_speed|OSLO_wind_gust|OSLO_humidity|OSLO_pressure|OSLO_global_radiation|OSLO_precipitation|OSLO_sunshine|OSLO_temp_mean|OSLO_temp_min|OSLO_temp_max|
+--------+-----------+----------------+---------------+--------------+-------------+-------------+---------------------+------------------+-------------+--------------+-------------+-------------+
|20000101|          0|               7|            0.9|           5.1|         0.94|        1.013|                 0.04|               0.6|          0.0|          -5.0|         -8.6|         -3.2|
|20000102|          0|               6|            1.9|           5.7|         0.94|       1.0076|                 0.11|               0.0|          1.6|          -0.8|         -6.7|          2.4|
|20000103|     

In [0]:
# Print the schema again to confirm the type of BBQ_weather after the conversion.
df1.printSchema()

root
 |-- DATE: integer (nullable = true)
 |-- BBQ_weather: integer (nullable = false)
 |-- OSLO_cloud_cover: integer (nullable = true)
 |-- OSLO_wind_speed: double (nullable = true)
 |-- OSLO_wind_gust: double (nullable = true)
 |-- OSLO_humidity: double (nullable = true)
 |-- OSLO_pressure: double (nullable = true)
 |-- OSLO_global_radiation: double (nullable = true)
 |-- OSLO_precipitation: double (nullable = true)
 |-- OSLO_sunshine: double (nullable = true)
 |-- OSLO_temp_mean: double (nullable = true)
 |-- OSLO_temp_min: double (nullable = true)
 |-- OSLO_temp_max: double (nullable = true)



# Convert data to Spark ML format

In [0]:
#The data format required by spark ML is (features, label) so we need to assemble all features into a feature vector. 
from pyspark.ml.feature import VectorAssembler

In [0]:
# Names of the Oslo measurement columns used as model inputs.
feature_cols = [
    'OSLO_cloud_cover',
    'OSLO_wind_speed',
    'OSLO_wind_gust',
    'OSLO_humidity',
    'OSLO_pressure',
    'OSLO_global_radiation',
    'OSLO_precipitation',
    'OSLO_sunshine',
    'OSLO_temp_mean',
    'OSLO_temp_min',
    'OSLO_temp_max',
]

# The assembler takes the listed columns and condenses them into a single
# vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [0]:
# Calling the 'transform' method on the dataframe returns a new dataframe with the newly created 'features' column.
# Only the columns declared in the VectorAssembler are combined; the remaining columns are kept unchanged.
data = assembler.transform(df1)

In [0]:
# Keep only three columns -- DATE, features and the label (BBQ_weather) -- to form
# the modelling dataset, which is later split into 70% train / 30% test.
selected_cols = ['DATE', 'features', 'BBQ_weather']
final_data = data.select(selected_cols)
final_data.show(150)

+--------+--------------------+-----------+
|    DATE|            features|BBQ_weather|
+--------+--------------------+-----------+
|20000101|[7.0,0.9,5.1,0.94...|          0|
|20000102|[6.0,1.9,5.7,0.94...|          0|
|20000103|[6.0,1.7,8.7,0.88...|          0|
|20000104|[1.0,3.4,11.8,0.5...|          0|
|20000105|[8.0,1.2,5.7,0.94...|          0|
|20000106|[8.0,7.0,17.0,0.9...|          0|
|20000107|[8.0,5.3,15.9,0.7...|          0|
|20000108|[4.0,6.0,17.5,0.7...|          0|
|20000109|[1.0,2.9,10.8,0.9...|          0|
|20000110|[5.0,1.5,6.2,0.89...|          0|
|20000111|[3.0,0.8,9.8,0.94...|          0|
|20000112|[8.0,0.7,9.8,0.96...|          0|
|20000113|[6.0,1.2,5.7,0.79...|          0|
|20000114|[5.0,2.7,9.3,0.65...|          0|
|20000115|[4.0,1.4,3.6,0.84...|          0|
|20000116|[5.0,1.3,5.1,0.96...|          0|
|20000117|[1.0,7.0,18.5,0.4...|          0|
|20000118|[0.0,10.6,23.2,0....|          0|
|20000119|[4.0,2.3,16.5,0.6...|          0|
|20000120|[3.0,2.2,8.2,0.71...| 

# Training and Predicting with ML Models: DecisionTree, RandomForest, Gradient-boosted Tree 

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

In [0]:
# Split the dataset 70% / 30% into train and test sets.
# A fixed seed makes the split (and hence the accuracies reported below)
# repeatable across runs on the same input data.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# One estimator per model family, all predicting BBQ_weather from the assembled
# 'features' vector. The seed is set explicitly so that training does not depend
# on the library's default seed and results stay comparable between runs.
dtc = DecisionTreeClassifier(labelCol='BBQ_weather', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='BBQ_weather', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='BBQ_weather', featuresCol='features', seed=42)

In [0]:
# Fit the three classifiers on the training split, in the order
# decision tree -> random forest -> gradient-boosted trees (this may take a while).
dtc_model, rfc_model, gbt_model = [
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
]

In [0]:
# Apply every fitted model to the test split; each 'transform' call
# produces its own predictions dataframe.
dtc_predictions, rfc_predictions, gbt_predictions = [
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
]

In [0]:
# Let's look at one of the newly created dataframes: the random forest predictions
# on the test split.
rfc_predictions.show()

+--------+--------------------+-----------+--------------------+--------------------+----------+
|    DATE|            features|BBQ_weather|       rawPrediction|         probability|prediction|
+--------+--------------------+-----------+--------------------+--------------------+----------+
|20000101|[7.0,0.9,5.1,0.94...|          0|[19.9907344548696...|[0.99953672274348...|       0.0|
|20000103|[6.0,1.7,8.7,0.88...|          0|[19.9907344548696...|[0.99953672274348...|       0.0|
|20000106|[8.0,7.0,17.0,0.9...|          0|[19.9907344548696...|[0.99953672274348...|       0.0|
|20000108|[4.0,6.0,17.5,0.7...|          0|[19.9832615565293...|[0.99916307782646...|       0.0|
|20000112|[8.0,0.7,9.8,0.96...|          0|[19.9907344548696...|[0.99953672274348...|       0.0|
|20000118|[0.0,10.6,23.2,0....|          0|[19.9953435507342...|[0.99976717753671...|       0.0|
|20000119|[4.0,2.3,16.5,0.6...|          0|[19.9907103461010...|[0.99953551730505...|       0.0|
|20000120|[3.0,2.2,8.2,0.71...

# Accuracy evaluation

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Evaluator that compares the 'prediction' column with the true label
# (BBQ_weather) and reports the accuracy metric.
acc_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="BBQ_weather",
)

In [0]:
# Accuracy of each model on its test-set predictions.
dtc_acc, rfc_acc, gbt_acc = [
    acc_evaluator.evaluate(predictions)
    for predictions in (dtc_predictions, rfc_predictions, gbt_predictions)
]

In [0]:
# Report each model's accuracy as a percentage, with a separator rule
# printed before every line.
accuracy_report = [
    ('A single decision tree had an accuracy of: {0:2.2f}%', dtc_acc),
    ('A random forest ensemble had an accuracy of: {0:2.2f}%', rfc_acc),
    ('A ensemble using GBT had an accuracy of: {0:2.2f}%', gbt_acc),
]
for template, accuracy in accuracy_report:
    print('-'*80)
    print(template.format(accuracy*100))

--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 99.55%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 99.64%
--------------------------------------------------------------------------------
A ensemble using GBT had an accuracy of: 99.46%


All three methods reach an accuracy of roughly 99.5%, i.e. close to 1 (this is in line with the result obtained using RandomForestClassifier from sklearn.ensemble in the other notebook). 

# Save the best model

In [0]:
# Remove the model output folder recursively using dbutils.fs.rm
# (the second argument, True, requests recursive deletion).
dbutils.fs.rm("/tmp/mllib-persistence-example/", True)

Out[23]: True

In [0]:
%scala
// Base directory for the persisted models.
val basePath = "/tmp/mllib-persistence-example"
// Delete any previous contents recursively, then recreate the empty directory.
dbutils.fs.rm(basePath, recurse=true)
dbutils.fs.mkdirs(basePath)

In [0]:
%py
basePath = "/tmp/mllib-persistence-example"
dtc_model.save(basePath + "/dtc_model")
rfc_model.save(basePath + "/rfc_model")
gbt_model.save(basePath + "/gbt_model")