In [0]:
import sys

# Put the DBFS script directory at the front of the import path (once), so
# modules stored there can be imported by later cells.
module_path = '/dbfs/spark/stocksETL/script/'
if module_path not in sys.path:
  sys.path.insert(0, module_path)

In [0]:
import os
import json
from datetime import datetime
from decimal import Decimal
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType,StructField,DateType
from pyspark.sql.types import TimestampType,StringType,IntegerType,DecimalType
import psycopg2
from random import randint
import jobTracker as t

In [0]:
def _read_widget(name):
    """Declare the text widget ``name`` with an empty default and label, then return its value."""
    dbutils.widgets.text(name, "", "")
    return dbutils.widgets.get(name)


# NOTE(review): DB_PWD and blob_key are read as ordinary widget values here;
# confirm whether they should come from a secret scope instead.

# Database connection details
dbName = _read_widget("DB_NAME")
dbHost = _read_widget("DB_HOST")
dbUser = _read_widget("DB_USER")
dbPwd = _read_widget("DB_PWD")
dbPort = _read_widget("DB_PORT")

# Blob storage connection details
storage_acct = _read_widget("storage_acct")
container_name = _read_widget("container_name")
blob_key = _read_widget("blob_key")

In [0]:
# Mount the blob container under /mnt/<mountName> unless it is already mounted.
mountName = "stocksETL"
mount_point = "/mnt/" + mountName

# Ask DBFS for its mount table instead of matching the repr() of a directory
# listing: the repr text is not a stable contract, and listing /mnt/ fails
# when the directory does not exist yet.
mounts = [m.mountPoint for m in dbutils.fs.mounts()]

if mount_point in mounts:
  print("mount already created")
else:
  dbutils.fs.mount(
    source = "wasbs://" + container_name + "@" + storage_acct + ".blob.core.windows.net",
    mount_point = mount_point,
    extra_configs = {"fs.azure.account.key." + storage_acct + ".blob.core.windows.net": blob_key})

In [0]:
def parse_csv(line):
    """Parse one comma-separated event line into a 12-element row.

    The row layout is: the first five fields as strings, field 5 as int,
    field 6 as string, fields 7 and 9 as Decimal, fields 8 and 10 as int,
    followed by a partition label.

    Args:
        line: One text line; fields are split on ",".

    Returns:
        list: For record type "T" or "Q" (field index 2) with convertible
        fields, the parsed row with the record type as the partition label.
        Otherwise a rejected row: every value is None except the record type
        (None when the line has fewer than three fields) and the partition
        label "B".
    """
    record_type_pos = 2
    record = line.split(",")

    # Short lines have no record-type field at all; reading it unguarded
    # would raise again while building the rejected row.
    rec_type = record[record_type_pos] if len(record) > record_type_pos else None
    bad_record = [None, None, rec_type] + [None] * 8 + ["B"]

    # Unknown record types are rejected explicitly rather than returning None,
    # which cannot be turned into a DataFrame row.
    if rec_type not in ("T", "Q"):
        return bad_record

    try:
        return [record[0], record[1], rec_type, record[3], record[4], int(record[5]), record[6],
                Decimal(record[7]), int(record[8]), Decimal(record[9]), int(record[10]), rec_type]
    except Exception:
        # A missing or unconvertible field rejects the whole line.
        return bad_record
    
def parse_json(line):
    """Parse one JSON event line into a 12-element row.

    The row layout is: trade_dt, file_tm, event_type, symbol, event_tm,
    event_seq_nb (int), exchange, bid_pr (Decimal), bid_size (int),
    ask_pr (Decimal), ask_size (int), followed by a partition label.

    Args:
        line: One text line expected to hold a JSON object.

    Returns:
        list: For event_type "T" or "Q" with convertible fields, the parsed
        row with the event type as the partition label. Otherwise a rejected
        row: every value is None except the event type (None when it could
        not be read or is not a string) and the partition label "B".
    """
    rec_type = None
    try:
        # parse_float=Decimal keeps numeric prices exactly as written instead
        # of routing them through a binary float first.
        record = json.loads(line, parse_float=Decimal)
        rec_type = record['event_type']
        if rec_type in ("T", "Q"):
            return [record['trade_dt'], record['file_tm'], rec_type, record['symbol'],
                    record['event_tm'], int(record['event_seq_nb']), record['exchange'],
                    Decimal(record['bid_pr']), int(record['bid_size']),
                    Decimal(record['ask_pr']), int(record['ask_size']), rec_type]
    except Exception:
        # Malformed JSON, a missing key or an unconvertible value: the line
        # is rejected below instead of failing the caller.
        pass

    # The event type is reported on the rejected row only when it is a string.
    if not isinstance(rec_type, str):
        rec_type = None
    return [None, None, rec_type] + [None] * 8 + ["B"]

In [0]:
def main():
    """Run the ingestion job: parse the CSV and JSON input files and append them as parquet.

    Each input line is parsed with parse_csv / parse_json, turned into a
    DataFrame with the common event schema, and appended to the output folder
    partitioned by the ``partition`` column. Progress is reported through the
    job tracker using the database connection built from the notebook's
    widget values.

    Raises:
        Exception: Any failure is printed, recorded as a "failed" job status
            when possible, and then re-raised so the notebook run does not
            appear successful. The database connection is closed either way.
    """
    csv_input_file_path = "/mnt/stocksETL/spark/inputfiles/data/csv/"
    json_input_file_path = "/mnt/stocksETL/spark/inputfiles/data/json/"
    output_path = "/mnt/stocksETL/spark/outputfiles"

    jobTrack = None
    jobId = None
    dbConn = None

    try:
        jobTrack = t.Tracker("stockETL1")
        jobId = jobTrack.assign_job_id()
        dbConn = jobTrack.get_db_connection(dbName, dbHost, dbUser, dbPwd, dbPort)

        sc = spark.sparkContext

        # Common schema for both CSV and JSON files.
        # NOTE(review): DecimalType() without arguments is decimal(10,0) in
        # PySpark, so fractional prices are not preserved. Confirm the intended
        # precision/scale before changing it, because existing parquet output
        # was written with this schema.
        common_event = StructType([
            StructField("trade_dt", StringType(), True),
            StructField("arrival_tm", StringType(), True),
            StructField("rec_type", StringType(), True),
            StructField("symbol", StringType(), True),
            StructField("event_tm", StringType(), True),
            StructField("event_seq_nb", IntegerType(), True),
            StructField("exchange", StringType(), True),
            StructField("bid_pr", DecimalType(), True),
            StructField("bid_size", IntegerType(), True),
            StructField("ask_pr", DecimalType(), True),
            StructField("ask_size", IntegerType(), True),
            StructField("partition", StringType(), True),
        ])

        # Processing CSV input files
        jobTrack.update_job_status(jobId, "processing csv files", dbConn)
        parsed = sc.textFile(csv_input_file_path).map(parse_csv)
        data = spark.createDataFrame(parsed, common_event)
        data.write.partitionBy("partition").mode("append").parquet(output_path)

        # Processing JSON input files
        jobTrack.update_job_status(jobId, "processing JSON files", dbConn)
        parsed = sc.textFile(json_input_file_path).map(parse_json)

        jobTrack.update_job_status(jobId, "writing output to blob folder", dbConn)
        data = spark.createDataFrame(parsed, common_event)
        data.write.partitionBy("partition").mode("append").parquet(output_path)

        dbConn.commit()

    except Exception as e:
        print(str(e))
        if dbConn is not None:
            # Best effort only: the failure being reported may itself be a
            # database problem, so never let this mask the original error.
            try:
                jobTrack.update_job_status(jobId, "failed", dbConn)
                dbConn.commit()
            except Exception as status_err:
                print("could not record failure status: " + str(status_err))
        # Re-raise so the run is marked as failed instead of silently passing.
        raise

    finally:
        # Release the connection on success and failure.
        if dbConn is not None:
            dbConn.close()

In [0]:
# Run the ETL job when this notebook is executed as the top-level program.
if __name__ == "__main__":
    main()

In [0]:
# Intentionally no second call here: the guarded `main()` in the previous cell
# already runs the job. The job writes with mode("append"), so calling main()
# again would write every record to the output folder twice.