In [64]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import *


spark = SparkSession \
    .builder \
    .appName("Session5") \
    .getOrCreate()

In [128]:
df= spark.read.option("multiline","true").json("./data/train_schedules.json")
df.show(4)

+-------+---+---------+------+------------+-------------------+--------------------+------------+
|arrival|day|departure|    id|station_code|       station_name|          train_name|train_number|
+-------+---+---------+------+------------+-------------------+--------------------+------------+
|   None|  1| 07:55:00|302214|          FM|KACHEGUDA FALAKNUMA|Falaknuma Lingamp...|       47154|
|   None|  1| 18:55:00|281458|         TCR|            THRISUR|Thrissur Guruvayu...|       56044|
|   None|  1| 15:05:00|309335|         PBR|          PORBANDAR|Porbandar Muzaffa...|       19269|
|   None|  1| 13:30:00|283774|           R|          RAIPUR JN|  RAIPUR ITWARI PASS|       58205|
+-------+---+---------+------+------------+-------------------+--------------------+------------+
only showing top 4 rows



In [111]:
df.select("train_number", "station_code", "departure",).show(4)

+------------+------------+---------+
|train_number|station_code|departure|
+------------+------------+---------+
|       47154|          FM| 07:55:00|
|       56044|         TCR| 18:55:00|
|       19269|         PBR| 15:05:00|
|       58205|           R| 13:30:00|
+------------+------------+---------+
only showing top 4 rows



In [112]:
df.select(df.train_number, df.station_code, df.departure).show(4)

+------------+------------+---------+
|train_number|station_code|departure|
+------------+------------+---------+
|       47154|          FM| 07:55:00|
|       56044|         TCR| 18:55:00|
|       19269|         PBR| 15:05:00|
|       58205|           R| 13:30:00|
+------------+------------+---------+
only showing top 4 rows



In [115]:
df.select(col("train_number"), col("station_code"), col("departure")) \
.show(4)

+------------+------------+---------+
|train_number|station_code|departure|
+------------+------------+---------+
|       47154|          FM| 07:55:00|
|       56044|         TCR| 18:55:00|
|       19269|         PBR| 15:05:00|
|       58205|           R| 13:30:00|
+------------+------------+---------+
only showing top 4 rows



In [116]:
df.printSchema()

root
 |-- arrival: string (nullable = true)
 |-- day: long (nullable = true)
 |-- departure: string (nullable = true)
 |-- id: long (nullable = true)
 |-- station_code: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- train_name: string (nullable = true)
 |-- train_number: string (nullable = true)



In [117]:
df.columns

['arrival',
 'day',
 'departure',
 'id',
 'station_code',
 'station_name',
 'train_name',
 'train_number']

In [126]:
df.groupBy("station_name").count().orderBy("station_name").show(6)

+------------------+-----+
|      station_name|count|
+------------------+-----+
|                  |    2|
|A-CABIN BONDAMUNDA|   48|
|             ABADA|  182|
|          ABHAIPUR|   56|
|  ABHAYAPURI ASSAM|   32|
|          ABJUGANJ|   14|
+------------------+-----+
only showing top 6 rows



In [73]:
df.dtypes

[('arrival', 'string'),
 ('day', 'bigint'),
 ('departure', 'timestamp'),
 ('id', 'bigint'),
 ('station_code', 'string'),
 ('station_name', 'string'),
 ('train_name', 'string'),
 ('train_number', 'string')]

In [75]:
df= df.withColumn("departure",to_timestamp("departure"))
df= df.withColumn("arrival",to_timestamp("arrival"))

In [76]:
# Create temporary table called schedules
df.createOrReplaceTempView("schedules")
spark.sql("DESCRIBE schedules").show()

+------------+---------+-------+
|    col_name|data_type|comment|
+------------+---------+-------+
|     arrival|timestamp|   null|
|         day|   bigint|   null|
|   departure|timestamp|   null|
|          id|   bigint|   null|
|station_code|   string|   null|
|station_name|   string|   null|
|  train_name|   string|   null|
|train_number|   string|   null|
+------------+---------+-------+



In [105]:
# Adding row numbers
# Upcoming arrival time
query= """
SELECT train_number, station_code , station_name, departure, ROW_NUMBER() OVER (ORDER BY train_number) AS row_number, 
        LEAD(departure, 1) OVER (ORDER BY train_number) AS upcoming_arrival
        FROM    schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(20)

+------------+------------+-----------------+-------------------+----------+-------------------+
|train_number|station_code|     station_name|          departure|row_number|   upcoming_arrival|
+------------+------------+-----------------+-------------------+----------+-------------------+
|       12301|         HWH|        HOWRAH JN|2021-04-26 16:55:00|         1|2021-04-26 16:58:00|
|       12301|         LLH|           LILUAH|2021-04-26 16:58:00|         2|2021-04-26 17:00:00|
|       12301|         BEQ|            BELUR|2021-04-26 17:00:00|         3|2021-04-26 17:01:00|
|       12301|         BLY|            BALLY|2021-04-26 17:01:00|         4|2021-04-26 17:03:00|
|       12301|         BZL|        BELANAGAR|2021-04-26 17:03:00|         5|2021-04-26 17:05:00|
|       12301|        DKAE|          DANKUNI|2021-04-26 17:05:00|         6|2021-04-26 17:07:00|
|       12301|        GBRA|            GOBRA|2021-04-26 17:07:00|         7|2021-04-26 17:10:00|
|       12301|         JOX|   

In [89]:
# Adding row numbers
# Upcoming arrival time
query= """
SELECT train_number, station_code , (UNIX_TIMESTAMP(departure, 'Yyyy-mm-dd')), 
        LEAD(departure, 1) OVER (ORDER BY train_number) AS upcoming_arrival
        FROM    schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(4)

+------------+------------+-------------------------------------+-------------------+
|train_number|station_code|unix_timestamp(departure, Yyyy-mm-dd)|   upcoming_arrival|
+------------+------------+-------------------------------------+-------------------+
|       12301|         HWH|                           1619436300|2021-04-26 16:58:00|
|       12301|         LLH|                           1619436480|2021-04-26 17:00:00|
|       12301|         BEQ|                           1619436600|2021-04-26 17:01:00|
|       12301|         BLY|                           1619436660|2021-04-26 17:03:00|
+------------+------------+-------------------------------------+-------------------+
only showing top 4 rows



### Window Function

In [130]:
# OVER Clause: Adding row numbers
df.createOrReplaceTempView("schedules")
query= """
SELECT train_number, station_code , departure, ROW_NUMBER() OVER (ORDER BY train_number) AS row_number
        FROM schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(5)

+------------+------------+---------+----------+
|train_number|station_code|departure|row_number|
+------------+------------+---------+----------+
|       12301|         HWH| 16:55:00|         1|
|       12301|         LLH| 16:58:00|         2|
|       12301|         BEQ| 17:00:00|         3|
|       12301|         BLY| 17:01:00|         4|
|       12301|         BZL| 17:03:00|         5|
+------------+------------+---------+----------+
only showing top 5 rows



In [132]:
# LEAD Clause: Upcoming arrival time
query= """
SELECT train_number, station_code , departure, ROW_NUMBER() OVER (ORDER BY train_number) AS row_number,
        LEAD(departure, 1) OVER (ORDER BY train_number) AS upcoming_arrival
        FROM schedules
        WHERE train_number= 12301

"""
spark.sql(query).show(5)

+------------+------------+---------+----------+----------------+
|train_number|station_code|departure|row_number|upcoming_arrival|
+------------+------------+---------+----------+----------------+
|       12301|         HWH| 16:55:00|         1|        16:58:00|
|       12301|         LLH| 16:58:00|         2|        17:00:00|
|       12301|         BEQ| 17:00:00|         3|        17:01:00|
|       12301|         BLY| 17:01:00|         4|        17:03:00|
|       12301|         BZL| 17:03:00|         5|        17:05:00|
+------------+------------+---------+----------+----------------+
only showing top 5 rows



In [96]:
# Adding row numbers
# Upcoming arrival time
query= """
SELECT train_number, station_code , (UNIX_TIMESTAMP(departure, 'Yyyy-mm-dd Hh:mm:ss')- (UNIX_TIMESTAMP(upcoming_arrival, 'Yyyy-mm-dd Hh:mm:ss'))) AS diff 
        FROM    schedules
        WHERE train_number= 12301

"""
schedules_with_upcoming_arrival= spark.sql(query).show(4)

AnalysisException: cannot resolve '`upcoming_arrival`' given input columns: [schedules.arrival, schedules.day, schedules.departure, schedules.id, schedules.station_code, schedules.station_name, schedules.train_name, schedules.train_number]; line 2 pos 103;
'Project [train_number#1285, station_code#1282, (unix_timestamp(departure#1339, Yyyy-mm-dd Hh:mm:ss, Some(Asia/Kolkata), false) - 'UNIX_TIMESTAMP('upcoming_arrival, Yyyy-mm-dd Hh:mm:ss)) AS diff#1619]
+- Filter (cast(train_number#1285 as int) = 12301)
   +- SubqueryAlias schedules
      +- Project [to_timestamp('arrival, None) AS arrival#1348, day#1279L, departure#1339, id#1281L, station_code#1282, station_name#1283, train_name#1284, train_number#1285]
         +- Project [arrival#1278, day#1279L, to_timestamp('departure, None) AS departure#1339, id#1281L, station_code#1282, station_name#1283, train_name#1284, train_number#1285]
            +- Project [arrival#1278, day#1279L, to_timestamp('departure, None) AS departure#1294, id#1281L, station_code#1282, station_name#1283, train_name#1284, train_number#1285]
               +- Relation[arrival#1278,day#1279L,departure#1280,id#1281L,station_code#1282,station_name#1283,train_name#1284,train_number#1285] json


In [85]:
# Create temporary table called schedules
df.createOrReplaceTempView("schedules_with_upcoming_arrival")

query= """
SELECT train_number, station_code , departure, (UNIX_TIMESTAMP(departure, 'Yyyy-mm-dd HH:mm:ss')- UNIX_TIMESTAMP(upcoming_arrival, 'Yyyy-mm-dd HH:mm:ss'))
        FROM schedules_with_upcoming_arrival
        WHERE train_number= 12301

"""
spark.sql(query).show(5)

AnalysisException: cannot resolve '`upcoming_arrival`' given input columns: [schedules_with_upcoming_arrival.arrival, schedules_with_upcoming_arrival.day, schedules_with_upcoming_arrival.departure, schedules_with_upcoming_arrival.id, schedules_with_upcoming_arrival.station_code, schedules_with_upcoming_arrival.station_name, schedules_with_upcoming_arrival.train_name, schedules_with_upcoming_arrival.train_number]; line 2 pos 113;
'Project [train_number#1285, station_code#1282, departure#1339, unresolvedalias((unix_timestamp(departure#1339, Yyyy-mm-dd HH:mm:ss, Some(Asia/Kolkata), false) - 'UNIX_TIMESTAMP('upcoming_arrival, Yyyy-mm-dd HH:mm:ss)), None)]
+- Filter (cast(train_number#1285 as int) = 12301)
   +- SubqueryAlias schedules_with_upcoming_arrival
      +- Project [to_timestamp('arrival, None) AS arrival#1348, day#1279L, departure#1339, id#1281L, station_code#1282, station_name#1283, train_name#1284, train_number#1285]
         +- Project [arrival#1278, day#1279L, to_timestamp('departure, None) AS departure#1339, id#1281L, station_code#1282, station_name#1283, train_name#1284, train_number#1285]
            +- Project [arrival#1278, day#1279L, to_timestamp('departure, None) AS departure#1294, id#1281L, station_code#1282, station_name#1283, train_name#1284, train_number#1285]
               +- Relation[arrival#1278,day#1279L,departure#1280,id#1281L,station_code#1282,station_name#1283,train_name#1284,train_number#1285] json
