In [0]:
import os

# Azure Blob Storage account and container that hold the input files.
blob_account_name = 'springcapital'
blob_container_name = 'springcapitalfiles'
# Storage account access key. Read from the environment so the secret does not
# have to be saved inside the notebook; falls back to the original empty string
# when AZURE_STORAGE_ACCOUNT_KEY is not set.
account_key = os.environ.get('AZURE_STORAGE_ACCOUNT_KEY', '')
# Glob patterns, relative to the container root, of the raw part files to ingest.
csv_blob_relative_path = 'springcapital_data/csv/*/NYSE/part-*.txt'
json_blob_relative_path = 'springcapital_data/json/*/NASDAQ/part-*.txt'

### Create Spark Session + Spark DataFrame Schema

In [0]:
from pyspark.sql import SparkSession
# Hadoop-style property name under which the storage account key is registered.
_account_key_conf = f"fs.azure.account.key.{blob_account_name}.blob.core.windows.net"
spark = (
    SparkSession.builder
    .master('local')
    .appName('app')
    # Properties prefixed with `spark.hadoop.` are copied into the Hadoop
    # configuration when the context is created; the RDD reads later in this
    # notebook (`sparkContext.textFile`) resolve `wasbs://` credentials from there.
    .config(f"spark.hadoop.{_account_key_conf}", account_key)
    .getOrCreate()
)
# Keep the key on the session runtime conf as well, as the notebook did originally.
spark.conf.set(_account_key_conf, account_key)

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType,StructField, DateType, TimestampType,DecimalType
 
# Column name / Spark type pairs, in output order, for the unified event layout
# shared by trade and quote records. Price columns are DECIMAL(5,2).
_event_columns = [
    ('trade_dt', DateType()),
    ('rec_type', StringType()),
    ('symbol', StringType()),
    ('exchange', StringType()),
    ('event_tm', TimestampType()),
    ('event_seq_nb', IntegerType()),
    ('arrival_tm', TimestampType()),
    ('trade_pr', DecimalType(5,2)),
    ('bid_pr', DecimalType(5,2)),
    ('bid_size', IntegerType()),
    ('ask_pr', DecimalType(5,2)),
    ('ask_size', IntegerType()),
    ('partition', StringType()),
]

# Every column is nullable: the parsers fill the fields that do not apply to a
# given record type with None.
common_event_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _event_columns]
)



### Load and Parse CSV files

In [0]:
from datetime import datetime
from decimal import Decimal
def parse_csv(line:str):
    """Parse one comma-separated record into a row of the common event layout.

    Field positions read from the line: 0 trade date, 1 file (arrival)
    timestamp, 2 record type, 3 symbol, 4 event timestamp, 5 event sequence
    number, 6 exchange. For "T" records field 7 is the trade price; for "Q"
    records fields 7-10 are bid price, bid size, ask price and ask size.

    Args:
        line: A single raw text line.

    Returns:
        A 13-element list: trade date, record type, symbol, exchange, event
        time, event sequence number, arrival time, trade price, bid price,
        bid size, ask price, ask size, partition key. Lines that cannot be
        parsed, and lines whose record type is neither "T" nor "Q", yield the
        placeholder row with record type and partition "B" and every other
        field ``None`` -- the function never raises and never returns ``None``.
    """
    record_type_pos = 2
    bad_event = [None,'B',None,None,None,None,None,None,None,None,None,None,'B']
    try:
        record = line.split(",")
        trade_dt = datetime.strptime(record[0], '%Y-%m-%d')
        record_type = record[record_type_pos]
        symbol = record[3]
        exchange = record[6]
        event_tm = datetime.strptime(record[4],'%Y-%m-%d %H:%M:%S.%f')
        event_seq = int(record[5])
        file_tm = datetime.strptime(record[1],'%Y-%m-%d %H:%M:%S.%f')
        if record_type == "T":
            price = Decimal(record[7])
            return [trade_dt,record_type,symbol,exchange,event_tm,event_seq,file_tm,
                    price,None,None,None,None,'T']
        if record_type == "Q":
            bid_price = Decimal(record[7])
            bid_size = int(record[8])
            ask_price = Decimal(record[9])
            ask_size = int(record[10])
            return [trade_dt,record_type,symbol,exchange,event_tm,event_seq,file_tm,
                    None,bid_price,bid_size,ask_price,ask_size,'Q']
    except Exception:
        # Deliberate best-effort: any malformed field (missing column, bad
        # date/number) routes the line to the bad partition instead of failing.
        return bad_event
    # Well-formed common fields but an unrecognised record type: previously this
    # fell through and returned None, which cannot be turned into a DataFrame row.
    return bad_event

In [0]:
# Read every raw CSV part file as lines of text, parse each line into a
# common-event row, and build a DataFrame with the shared schema.
csv_source_uri = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{csv_blob_relative_path}"
raw = spark.sparkContext.textFile(csv_source_uri)
parsed = raw.map(parse_csv)
data = spark.createDataFrame(parsed, schema=common_event_schema)
# Preview the first ten rows.
data.show(10)


### Write processed CSV file to Blob Storage

In [0]:
# Write the parsed CSV events as Parquet, split by the `partition` column,
# overwriting whatever is already at the destination.
csv_output_uri = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/processed_data/csv/"
csv_writer = data.write.partitionBy("partition").mode("overwrite")
csv_writer.parquet(csv_output_uri)

### Load and Parse JSON files

In [0]:
import json
def parse_json(line):
    """Parse one JSON-encoded record into a row of the common event layout.

    Keys read from the decoded object: ``trade_dt``, ``event_type``,
    ``symbol``, ``exchange``, ``event_tm``, ``event_seq_nb`` and ``file_tm``;
    plus ``price`` for "T" records, or ``bid_pr``, ``bid_size``, ``ask_pr``
    and ``ask_size`` for "Q" records.

    Args:
        line: A single raw text line holding one JSON document.

    Returns:
        A 13-element list: trade date, record type, symbol, exchange, event
        time, event sequence number, arrival time, trade price, bid price,
        bid size, ask price, ask size, partition key. Lines that are not valid
        JSON, lack a required key, hold an unparsable value, or carry an event
        type other than "T"/"Q" yield the placeholder row with record type and
        partition "B" and every other field ``None`` -- the function never
        raises and never returns ``None``.
    """
    bad_event = [None,'B',None,None,None,None,None,None,None,None,None,None,'B']
    try:
        # Decoding and the event_type lookup live inside the try so that a
        # single corrupt line cannot abort the whole Spark job.
        record = json.loads(line)
        record_type = record['event_type']
        trade_dt = datetime.strptime(record['trade_dt'], '%Y-%m-%d')
        symbol = record['symbol']
        exchange = record['exchange']
        event_tm = datetime.strptime(record['event_tm'],'%Y-%m-%d %H:%M:%S.%f')
        event_seq = int(record['event_seq_nb'])
        file_tm = datetime.strptime(record['file_tm'],'%Y-%m-%d %H:%M:%S.%f')
        if record_type == "T":
            price = Decimal(record['price'])
            return [trade_dt,record_type,symbol,exchange,event_tm,event_seq,file_tm,
                    price,None,None,None,None,'T']
        if record_type == "Q":
            bid_price = Decimal(record['bid_pr'])
            bid_size = int(record['bid_size'])
            ask_price = Decimal(record['ask_pr'])
            ask_size = int(record['ask_size'])
            return [trade_dt,record_type,symbol,exchange,event_tm,event_seq,file_tm,
                    None,bid_price,bid_size,ask_price,ask_size,'Q']
    except Exception:
        # Deliberate best-effort: any malformed record goes to the bad partition.
        return bad_event
    # Well-formed common fields but an unrecognised event type: previously this
    # fell through and returned None, which cannot be turned into a DataFrame row.
    return bad_event

In [0]:
# Read every raw JSON part file as lines of text, parse each line into a
# common-event row, and build a DataFrame with the shared schema.
json_source_uri = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{json_blob_relative_path}"
raw = spark.sparkContext.textFile(json_source_uri)
parsed = raw.map(parse_json)
data = spark.createDataFrame(parsed, schema=common_event_schema)
# Preview the first ten rows.
data.show(10)

### Write processed JSON file to Blob Storage

In [0]:
# Write the parsed JSON events as Parquet, split by the `partition` column,
# overwriting whatever is already at the destination.
json_output_uri = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/processed_data/json/"
json_writer = data.write.partitionBy("partition").mode("overwrite")
json_writer.parquet(json_output_uri)