<a href="https://colab.research.google.com/github/adrian-ja-projects/train-prediction-project/blob/fea_data_analisys/pl_raw_to_uc_train_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

In [None]:
# Create (or reuse) the Spark session used by every cell below.
# Requires `SparkSession` from `pyspark.sql`; without that import this cell
# fails with a NameError.
spark = (
    SparkSession.builder
    .master("local")  # run Spark in local mode
    .appName("pl_data_analysis")
    # Pin the Spark UI to a fixed port (presumably so it can be reached from
    # the hosted notebook environment -- confirm if the port matters).
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

In [None]:
# Load the raw schedule data set (parquet files) into a DataFrame.
raw_file_path = "/content/raw/digitraffic/27_schedule"
df_27_schedule = spark.read.parquet(raw_file_path)

In [None]:
# One window per train and departure date, ordered chronologically, so that
# lag() evaluated on the TPE arrival row yields the HKI departure time of the
# same trip. Partitioning by trainNumber as well keeps trips of different
# trains on the same date from being paired with each other.
dateWindow = Window.partitionBy("trainNumber", "departureDate").orderBy("actualTime")

# Only two kinds of rows are in scope: departures from HKI and arrivals at TPE.
is_hki_departure = (F.col("stationShortCode") == "HKI") & (F.col("type") == "DEPARTURE")
is_tpe_arrival = (F.col("stationShortCode") == "TPE") & (F.col("type") == "ARRIVAL")

dfTrainAvgTime = (
    df_27_schedule
    # Filter out-of-scope rows first, while the "type" column is still part
    # of the frame, then keep only the columns needed downstream.
    .where(is_hki_departure | is_tpe_arrival)
    .select("trainNumber", "departureDate", "stationShortCode", "actualTime")
    # Cast to a timestamp so the rows can be ordered and subtracted.
    .withColumn("actualTime", F.col("actualTime").cast(TimestampType()))
    # Travel duration of each trip in minutes: this row's time minus the
    # previous row's time within the window (seconds / 60).
    .withColumn(
        "travelDurationMinutes",
        (
            F.unix_timestamp(F.col("actualTime"))
            - F.unix_timestamp(F.lag(F.col("actualTime"), 1).over(dateWindow))
        ) / 60,
    )
    # Keep only the arrival rows; they carry the computed duration.
    .where(F.col("stationShortCode") == "TPE")
)

In [None]:
# Persist the travel-duration table using the configured write mode.
# Supported modes: "overwrite" (replace the table) and "append" (add rows).
write_mode = "overwrite"
table_path = "/content/uc_train_prediction/27_travel_duration"
table_name = "27_travel_duration"

# Branch on the configured mode itself; pick the matching log message and
# reject anything unsupported before touching the output path.
if write_mode == "overwrite":
    done_message = f"INFO: table {table_name} has been overwriten on the path {table_path}. Table ready for analysis"
elif write_mode == "append":
    done_message = f"INFO: new data has been appended on the table {table_name} with the path {table_path}. Table ready for analysis"
else:
    raise ValueError(
        f"Unsupported write_mode {write_mode!r}; expected 'overwrite' or 'append'"
    )

# repartition(1) collapses the data into a single partition before writing.
dfTrainAvgTime.repartition(1).write.mode(write_mode).save(table_path)
print(done_message)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()