In [0]:
from pyspark.sql.functions import col, unix_timestamp, from_unixtime, udf
from pyspark.sql.types import DateType, TimestampType

In [0]:
# Read the raw bus records (JSON) from the mounted storage path.
bus_file_path = '/mnt/dacoursedatabricksstg/dacoursedatabricksdata/busFile'
df = spark.read.json(bus_file_path)

In [0]:
# Flatten the nested `$oid` / `$numberLong` wrapper fields into plain columns,
# cast `congestion` to an integer, and drop the columns listed below.
# NOTE: this rebinds `df`, so the cell is not idempotent -- re-run the load
# cell before running it a second time.
leading_drops = ["direction", "poiId", "poiId2", "probability", "loc"]
df = (
    df.drop(*leading_drops)
    .withColumn("id", col("_id.$oid"))
    .withColumn("calendar", col("calendar.$numberLong").cast("bigint"))
    .withColumn("timestamp", col("timestamp.$numberLong").cast("bigint"))
    .withColumn("congestion", col("congestion").cast("integer"))
    .drop("_id", "anomaly", "dateTypeEnum")
)

In [0]:
# Scale `timestamp` down by 1000 (presumably epoch milliseconds -> seconds;
# TODO confirm the source unit) and expose it as a proper timestamp column.
df_t = (
    df.withColumn("timestamp", col("timestamp") / 1000)
    .withColumn("date_timestamp", col("timestamp").cast(TimestampType()))
)

In [0]:
def _first_13_chars(value):
    """Return the first 13 characters of ``str(value)``.

    For a value rendered as ``YYYY-MM-DD HH:MM:SS`` this keeps the date and
    the hour, e.g. ``2017-07-03 10``.
    """
    return str(value)[:13]


# UDF used to build the hourly key that the bus and weather data share.
get_hour_date = udf(_first_13_chars)
df_t = df_t.withColumn("hourRounded", get_hour_date(col("date_timestamp")))

In [0]:
# Load the weather CSV (header row, inferred column types) and tag every row
# with the same hourly key that is used on the bus data.
weather_csv_path = "/FileStore/tables/relevant_weather_updated.csv"
weather_reader = spark.read.options(inferSchema=True, header=True)
w = weather_reader.csv(weather_csv_path)
w = w.withColumn("hourRounded", get_hour_date(col("date")))

In [0]:
import pyspark.sql.functions as F

# Preview the hourly keys derived from the weather data.
weather_hour_keys = w.select('hourRounded')
display(weather_hour_keys)

hourRounded
2017-07-03 10
2017-07-03 11
2017-07-03 12
2017-07-03 13
2017-07-03 14
2017-07-03 15
2017-07-03 16
2017-07-03 17
2017-07-03 18
2017-07-03 19


In [0]:
# Average every weather metric per hour. The aggregated columns come back as
# "avg(<name>)", so each one is renamed to its plain metric name afterwards.
weather_metrics = ["rain", "temp", "wetb", "dewpt", "vappr", "rhum", "msl", "vis"]
temp = (
    w.select("hourRounded", *weather_metrics)
    .groupby("hourRounded")
    .agg({metric: "avg" for metric in weather_metrics})
)
for metric in weather_metrics:
    temp = temp.withColumnRenamed(f"avg({metric})", metric)

In [0]:
# Attach the hourly weather averages to every bus record (inner join).
# The join is expressed on the shared column name rather than on
# `w.hourRounded`: `w` is not one of the joined frames (the right side is
# `temp`), so the old condition only resolved through column lineage and left
# two ambiguous `hourRounded` columns in the result. Joining by name keeps a
# single `hourRounded` column.
joint_data = df_t.join(temp, on="hourRounded", how="inner")

In [0]:
# Build the modelling set: every congested row plus a small random sample of
# the non-congested rows (presumably to offset class imbalance), then split
# it 80/20 into train and test.
# A fixed seed is passed to both random steps so that re-running the notebook
# draws the same sample and the same split.
SPLIT_SEED = 42

# `congestion` was cast to an integer column earlier, so compare against 1/0
# instead of Python booleans.
true_values = joint_data.filter(col("congestion") == 1)
sample = joint_data.filter(col("congestion") == 0).sample(
    withReplacement=False, fraction=0.001, seed=SPLIT_SEED
)
balanced_data = true_values.union(sample)
train, test = balanced_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

# Two pipelines that share one random-forest configuration and differ only in
# their input features: `pipeline` uses the bus features alone, `pipeline_w`
# additionally uses three weather columns.
RF_SEED = 42

features = ["currentHour", "dateType", "delay", "latitude", "longitude", "vehicleSpeed"]
# Derived from `features` so the two lists cannot drift apart.
features_with_weather = features + ["rain", "temp", "msl"]

va = VectorAssembler(inputCols=features, outputCol="features")
va_w = VectorAssembler(inputCols=features_with_weather, outputCol="features")

# Explicit seed so repeated runs train the same forest on the same input.
rf = RandomForestClassifier(labelCol="congestion", featuresCol="features", seed=RF_SEED)

pipeline_w = Pipeline(stages=[va_w, rf])
pipeline = Pipeline(stages=[va, rf])

# Both evaluators score the `prediction` column against the `congestion` label.
f1_evaluator = MulticlassClassificationEvaluator(labelCol="congestion", predictionCol="prediction", metricName="f1")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="congestion", predictionCol="prediction", metricName="weightedRecall")

In [0]:
# Baseline: fit on the bus-only features and score the held-out split.
model = pipeline.fit(train)
predictions = model.transform(test)

# Same train/test split, with the weather columns added to the features.
model_w = pipeline_w.fit(train)
predictions_w = model_w.transform(test)

In [0]:
# Score the bus-only model; `recall` comes from `recall_evaluator`.
f1 = f1_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)
baseline_report = f"Regular model performance:\nf1: {f1}, recall: {recall}"
print(baseline_report)

In [0]:
# Score the model trained with the additional weather features.
f1_w = f1_evaluator.evaluate(predictions_w)
recall_w = recall_evaluator.evaluate(predictions_w)
weather_report = f"Joint data model performance:\nf1: {f1_w}, recall: {recall_w}"
print(weather_report)