In [1]:
# Reference the SparkContext; per the recorded output below, running this cell
# is what starts the Spark application for the notebook session.
sc

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
5,application_1643003322553_0009,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.

In [2]:
import os

# Azure Blob Storage account and container that hold the pipeline data.
blob_account_name = 'springcapital'
blob_container_name = 'springcapitalfiles'

# Do not paste the storage key into the notebook: it is picked up from the
# AZURE_STORAGE_ACCOUNT_KEY environment variable. When the variable is not
# set, the value falls back to the empty string this cell used originally.
account_key = os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', '')

# Glob patterns (relative to the container root) for the trade (T) and
# quote (Q) partitions of the processed data.
trade_blob_relative_path = 'processed_data/*/partition=T/*.parquet'
quote_blob_relative_path = 'processed_data/*/partition=Q/*.parquet'

# wasbs:// URL of the container root; relative paths are appended to it.
filepath = 'wasbs://{}@{}.blob.core.windows.net/'.format(blob_container_name,blob_account_name)

In [3]:
from pyspark.sql import SparkSession

# Get (or create) the session through a builder configured with a local
# master and the application name 'app'.
session_builder = SparkSession.builder.master('local').appName('app')
spark = session_builder.getOrCreate()
# spark.conf.set("spark.sql.adaptive.enabled", True)

# Register the storage account key under the account-specific config name so
# the wasbs:// paths used by later cells can be accessed.
azure_key_setting = 'fs.azure.account.key.{}.blob.core.windows.net'.format(blob_account_name)
spark.conf.set(azure_key_setting, account_key)

In [4]:
# NOTE: the star import puts every Spark SQL function into the notebook
# namespace, where same-named ones (e.g. max) replace the Python builtins.
from pyspark.sql.functions import *

# Trade Data

In [5]:
# Read the trade records (parquet) from blob storage.
trade_source_path = filepath + trade_blob_relative_path
trade_common = spark.read.parquet(trade_source_path)

In [6]:
# Print the column names and types of the loaded trade data.
trade_common.printSchema()

root
 |-- trade_dt: date (nullable = true)
 |-- rec_type: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- event_tm: timestamp (nullable = true)
 |-- event_seq_nb: integer (nullable = true)
 |-- arrival_tm: timestamp (nullable = true)
 |-- trade_pr: decimal(5,2) (nullable = true)
 |-- bid_pr: decimal(5,2) (nullable = true)
 |-- bid_size: integer (nullable = true)
 |-- ask_pr: decimal(5,2) (nullable = true)
 |-- ask_size: integer (nullable = true)

In [7]:
# Narrow the frame to the columns that carry trade information.
trade_columns = [
    'trade_dt',
    'symbol',
    'exchange',
    'event_tm',
    'event_seq_nb',
    'arrival_tm',
    'trade_pr',
]
trade_common = trade_common.select(*trade_columns)

In [8]:
# Print a preview of the selected trade columns.
trade_common.show()

+----------+------+--------+--------------------+------------+-------------------+--------+
|  trade_dt|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|trade_pr|
+----------+------+--------+--------------------+------------+-------------------+--------+
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 10:42:...|          10|2020-08-06 09:30:00|   78.93|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 12:00:...|          20|2020-08-06 09:30:00|   77.10|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 13:09:...|          30|2020-08-06 09:30:00|   78.31|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 14:27:...|          40|2020-08-06 09:30:00|   75.84|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 15:39:...|          50|2020-08-06 09:30:00|   77.63|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 16:43:...|          60|2020-08-06 09:30:00|   77.57|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 18:01:...|          70|2020-08-06 09:30:00|   76.98|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 19:09:...|          80|2020-08-06 09:30:0

In [9]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Apply the latest correction: for every unique trade key keep the complete
# record that arrived last.
#
# A plain groupBy(...).agg(max('arrival_tm')) returns only the key columns and
# the max arrival time, so trade_pr would be missing from the result. Ranking
# the rows inside each key by arrival time (newest first) and keeping rank 1
# preserves every column of the winning record.
trade_key_cols = ["trade_dt", "symbol", "exchange", "event_tm", "event_seq_nb"]
trade_latest_first = Window.partitionBy(*trade_key_cols).orderBy(F.col("arrival_tm").desc())

trade_corrected = (
    trade_common
    .withColumn("_arrival_rank", F.row_number().over(trade_latest_first))
    # If two records of one key share the same arrival time, one of them is kept.
    .filter(F.col("_arrival_rank") == 1)
    .drop("_arrival_rank")
    # Keep the 'arrival_time' column name this cell has always produced.
    .withColumnRenamed("arrival_tm", "arrival_time")
)

In [10]:
# Write the corrected trades as parquet, one directory per trade_dt,
# replacing whatever is already stored under trade/.
trade_output_path = filepath + 'trade/'
trade_writer = trade_corrected.write.partitionBy("trade_dt").mode("overwrite")
trade_writer.parquet(trade_output_path)

# Quote Data

In [11]:
# Read the quote records (parquet) from blob storage.
quote_source_path = filepath + quote_blob_relative_path
quote_common = spark.read.parquet(quote_source_path)

In [12]:
# Print the column names and types of the loaded quote data.
quote_common.printSchema()

root
 |-- trade_dt: date (nullable = true)
 |-- rec_type: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- event_tm: timestamp (nullable = true)
 |-- event_seq_nb: integer (nullable = true)
 |-- arrival_tm: timestamp (nullable = true)
 |-- trade_pr: decimal(5,2) (nullable = true)
 |-- bid_pr: decimal(5,2) (nullable = true)
 |-- bid_size: integer (nullable = true)
 |-- ask_pr: decimal(5,2) (nullable = true)
 |-- ask_size: integer (nullable = true)

In [13]:
# Narrow the frame to the columns that carry quote information.
quote_columns = [
    'trade_dt',
    'symbol',
    'exchange',
    'event_tm',
    'event_seq_nb',
    'arrival_tm',
    'bid_pr',
    'bid_size',
    'ask_pr',
    'ask_size',
]
quote_common = quote_common.select(*quote_columns)

In [14]:
# Print a preview of the selected quote columns.
quote_common.show()

+----------+------+--------+--------------------+------------+-------------------+------+--------+------+--------+
|  trade_dt|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|bid_pr|bid_size|ask_pr|ask_size|
+----------+------+--------+--------------------+------------+-------------------+------+--------+------+--------+
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 09:38:...|           1|2020-08-06 09:30:00| 78.13|     100| 79.83|     100|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 09:46:...|           2|2020-08-06 09:30:00| 76.52|     100| 76.57|     100|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 09:52:...|           3|2020-08-06 09:30:00| 78.75|     100| 79.09|     100|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 09:58:...|           4|2020-08-06 09:30:00| 75.61|     100| 76.95|     100|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 10:07:...|           5|2020-08-06 09:30:00| 77.45|     100| 78.73|     100|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 10:15:...|           6|2020-08-06 09:30:0

In [15]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Apply the latest correction: for every unique quote key keep the complete
# record that arrived last.
#
# A plain groupBy(...).agg(max('arrival_tm')) returns only the key columns and
# the max arrival time, so bid_pr, bid_size, ask_pr and ask_size would be
# missing from the result. Ranking the rows inside each key by arrival time
# (newest first) and keeping rank 1 preserves every column of the winning record.
quote_key_cols = ["trade_dt", "symbol", "exchange", "event_tm", "event_seq_nb"]
quote_latest_first = Window.partitionBy(*quote_key_cols).orderBy(F.col("arrival_tm").desc())

quote_corrected = (
    quote_common
    .withColumn("_arrival_rank", F.row_number().over(quote_latest_first))
    # If two records of one key share the same arrival time, one of them is kept.
    .filter(F.col("_arrival_rank") == 1)
    .drop("_arrival_rank")
    # Keep the 'arrival_time' column name this cell has always produced.
    .withColumnRenamed("arrival_tm", "arrival_time")
)

In [16]:
# Write the corrected quotes as parquet, one directory per trade_dt,
# replacing whatever is already stored under quote/.
quote_output_path = filepath + 'quote/'
quote_writer = quote_corrected.write.partitionBy("trade_dt").mode("overwrite")
quote_writer.parquet(quote_output_path)