<a href="https://colab.research.google.com/github/adrian-ja-projects/train-prediction-project/blob/fea_data_analisys/data_analysis_train_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository into the current working directory so that the
# pipeline notebooks it contains can be executed with %run in the cells below.
!git clone https://github.com/adrian-ja-projects/train-prediction-project.git

Cloning into 'train-prediction-project'...
remote: Enumerating objects: 62, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 62 (delta 41), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (62/62), done.


In [2]:
# Execute the extraction notebook from the cloned repository.
# NOTE(review): judging by its name and log output it extracts files into the
# staging area for a fixed date range -- confirm against that notebook.
%run /content/train-prediction-project/pl_extraction_api_to_staging.ipynb

INFO: Creating list for extraction loop...
INFO: Extraction dates are between 2020-01-01 and 2020-04-09
INFO: Starting extraction...
INFO: Extraction completed a total of 100 file were successfully extracted into the staging area


In [3]:
# Execute the Spark set-up notebook from the cloned repository.
# NOTE(review): its log output reports installing Spark dependencies and
# creating environment variables; it must run before any Spark session is built.
%run /content/train-prediction-project/spark_dependencies.ipynb

INFO: Installing Spark dependencies...
INFO: Spark dependencies installed
INFO: env variables created


In [4]:
# Execute the staging-to-raw transformation notebook from the cloned repository.
# NOTE(review): presumably this produces the files under /content/raw that are
# loaded further down -- confirm against that notebook.
%run /content/train-prediction-project/pl_transf_staging_to_raw.ipynb

INFO: Starting transformation of json files...
Data in raw ready to upload to a db


In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

In [10]:
# Create (or reuse) the Spark session used by every cell below.
# SparkSession is imported explicitly in the imports cell so this cell does not
# depend on names leaked into the kernel by the %run notebooks above.
spark = (
    SparkSession.builder
    .master("local")                   # run Spark in local mode inside this runtime
    .appName("pl_data_analysis")
    .config("spark.ui.port", "4050")   # pin the Spark UI port
    .getOrCreate()                     # returns the existing session when one is already active
)

In [11]:
# Location of the raw-layer parquet data consumed by this analysis.
raw_file_path = "/content/raw/digitraffic/27_schedule"

# Read the raw parquet files into a Spark DataFrame.
df_27_schedule = spark.read.parquet(raw_file_path)

In [23]:
# Window over the events of one train run, ordered chronologically.
# Partitioning by trainNumber as well as departureDate keeps lag() from pairing
# events that belong to different trains running on the same date.
# NOTE(review): assumes (trainNumber, departureDate) identifies a single run -- confirm.
dateWindow = Window.partitionBy("trainNumber", "departureDate").orderBy("actualTime")

# Only two kinds of event are in scope: the departure from HKI and the arrival at TPE.
is_hki_departure = (F.col("stationShortCode") == "HKI") & (F.col("type") == "DEPARTURE")
is_tpe_arrival = (F.col("stationShortCode") == "TPE") & (F.col("type") == "ARRIVAL")

# Timestamp of the preceding event in the same window. After the filter below
# this is expected to be the HKI departure that precedes each TPE arrival.
previous_event_time = F.lag(F.col("actualTime"), 1).over(dateWindow)

dfTrainAvgTime = (
    df_27_schedule
    # Filter first, while the "type" column is still present, then project down
    # to the columns needed for the use case.
    .where(is_hki_departure | is_tpe_arrival)
    .select("trainNumber", "departureDate", "stationShortCode", "actualTime")
    # Cast to a timestamp so the window orders by time and unix_timestamp()
    # yields epoch seconds.
    .withColumn("actualTime", F.col("actualTime").cast(TimestampType()))
    # Travel duration in minutes: seconds since the previous event divided by 60.
    # The value is null when there is no previous event in the window.
    .withColumn(
        "travelDurationMinutes",
        (F.unix_timestamp(F.col("actualTime")) - F.unix_timestamp(previous_event_time)) / 60,
    )
    # Keep only the arrival rows, which carry the computed duration.
    .where(F.col("stationShortCode") == "TPE")
)

In [24]:
# Persist the travel-duration table.
# write_mode is the single switch for this cell: "overwrite" replaces the data
# at table_path, "append" adds to it. The original condition tested the names
# `overwrite` / `append`, which are not defined in this notebook.
write_mode = "overwrite"
table_path = "/content/uc_train_prediction/27_travel_duration"
table_name = "27_travel_duration"

# Fail loudly on an unexpected mode instead of silently writing nothing.
if write_mode not in ("overwrite", "append"):
  raise ValueError(f"Unsupported write_mode {write_mode!r}; expected 'overwrite' or 'append'")

# repartition(1) collapses the data to one partition before the write.
# No format is given, so Spark's default data source is used.
dfTrainAvgTime.repartition(1).write.mode(write_mode).save(table_path)

if write_mode == "overwrite":
  print(f"INFO: table {table_name} has been overwriten on the path {table_path}")
else:
  print(f"INFO: new data has been appended on the table {table_name} with the path {table_path}")

INFO: table 27_travel_duration has been overwriten on the path /content/uc_train_prediction/27_travel_duration
