In [1]:
%run "../Common/Enums"

In [2]:
# from pyspark.sql.functions import lit 
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.dataframe import DataFrame

def AppendCustomColumns(
  dataframe,
  customColumns: list = []
):
  """Return ``dataframe`` with a NULL string column added for every missing custom column.

  Args:
    dataframe: Object exposing ``columns`` (list of column names) and
      ``select``; the callers in this notebook pass a Spark DataFrame.
    customColumns: Names that must exist in the result. Names already present
      in ``dataframe.columns`` (exact, case-sensitive match) are left as they
      are. The default list is only read, never mutated. ``None`` is treated
      as "no custom columns".

  Returns:
    The result of ``dataframe.select(...)`` over the original columns followed
    by one ``NULL`` literal cast to string and aliased to the name for each
    missing custom column.
  """
  print('Adding custom columns...')

  if not customColumns:
    print('There is no custom columns provided.')

  # Keep the known names apart from the select list. Once the select list
  # contains Column objects, `name in selectList` compares a Column with a str;
  # that comparison yields a Column, which cannot be converted to bool, so the
  # membership test raises as soon as a second missing column is processed.
  knownNames = set(dataframe.columns)
  outcols = list(dataframe.columns)
  for column in customColumns or []:
    if column in knownNames:
      continue
    print(f'Custom column added: {column}')
    knownNames.add(column)  # a name repeated in customColumns is added once
    outcols.append(F.lit(None).cast(StringType()).alias(str(column)))

  return dataframe.select(outcols)


In [3]:
from datetime import datetime
from enum import Enum

def PrepareDataBronze( 
      lakeName,    
      instance: Enum, 
      entity: BronzeTable,
      entityName,
      startDate: datetime, #date from which data will be read  
      customColumns: list = []
    ):
  """Load recent bronze rows for an instance-scoped table and register a de-duplicated temp view.

  Reads parquet from
  ``wasbs://bronze@<lakeName>.blob.core.windows.net/<dataSource>/<instance.name>/<tableName>``,
  keeps rows whose ``Year``/``Month``/``Day`` columns are on or after
  ``startDate`` and whose modification-time column is greater than
  ``startDate``, appends the requested custom columns, and registers two temp
  views:

  * ``Raw<entityName>`` - the filtered rows;
  * ``<entityName>`` - one row per ``pkColumns`` group (the one with the
    latest modification time), with an extra ``RowNum`` column equal to 1.

  Args:
    lakeName: Storage account name used in the ``wasbs://`` URL.
    instance: Enum member whose ``name`` is used as a path segment.
    entity: Enum member whose ``value`` is indexed by ``TableParams`` to get
      the data source, table name, primary-key columns and modification-time
      column.
    entityName: Name of the resulting temp view.
    startDate: Lower bound for the data to read.
    customColumns: Column names passed to ``AppendCustomColumns``.

  Returns:
    None. The results are exposed through the registered temp views.
  """
  fileSystem = 'bronze'

  dataSource = entity.value[TableParams.DataSource]
  tableName = entity.value[TableParams.TableName]
  # pkColumns is spliced verbatim into `partition by`, so it is presumably a
  # comma-separated column list - verify against the Enums notebook.
  pkColumns = entity.value[TableParams.PkColumns]
  modificationTimeColumn = entity.value[TableParams.ModificationTimeColumn]

  year = str(startDate.year)
  month = str(startDate.month)
  day = str(startDate.day)

  # The 'T' replacement only matters when startDate is passed as an ISO string.
  startDateStr = str(startDate).replace('T', ' ')

  sourcePath = f"wasbs://{fileSystem}@{lakeName}.blob.core.windows.net/{dataSource}/{instance.name}/{tableName}"

  # Read only the required partitions starting from startDate.
  partitionFilter = f"""
             ( Year > {year} )
          or ( Year = {year} and Month > {month} )
          or ( Year = {year} and Month = {month} and Day >= {day} )
    """
  # TODO: probably should compare against the startDate object instead of its string form.
  modificationFilter = f'{modificationTimeColumn} > "{startDateStr}"'

  rawData = spark.read \
    .format('parquet') \
    .load(sourcePath) \
    .where(partitionFilter) \
    .where(modificationFilter)

  # TODO: add an additional filter by partition column modificationTimeColumn >= startDateStr

  rawData = AppendCustomColumns(rawData, customColumns)

  # Use one name for both registering and querying the raw view so the lookup
  # does not depend on Spark resolving identifiers case-insensitively.
  # createOrReplaceTempView supersedes the deprecated registerTempTable.
  rawViewName = "Raw" + entityName
  rawData.createOrReplaceTempView(rawViewName)

  # Remove duplicates from the raw dataset: keep the latest row per primary key.
  data = spark.sql(f""" 
    with cteData
    as (
      select 
          row_number() over(partition by {pkColumns} order by {modificationTimeColumn} desc) as RowNum
        , *
      from {rawViewName}
    )
    select * 
    from cteData
    where RowNum = 1
  """)

  data.createOrReplaceTempView(entityName)

In [4]:

def PrepareDataBronzeWithoutInstance( 
      lakeName,    
      entity: BronzeTable,
      entityName,
      startDate: datetime, #date from which data will be read 
      customColumns: list = []
    ):
  """Load recent bronze rows for a table without an instance folder and register a de-duplicated temp view.

  Reads parquet from
  ``wasbs://bronze@<lakeName>.blob.core.windows.net/<dataSource>/<tableName>``,
  keeps rows whose ``Year``/``Month``/``Day`` columns are on or after
  ``startDate`` and whose modification-time column is greater than
  ``startDate``, appends the requested custom columns, and registers two temp
  views:

  * ``Raw<entityName>`` - the filtered rows;
  * ``<entityName>`` - one row per ``pkColumns`` group (the one with the
    latest modification time), with an extra ``RowNum`` column equal to 1.

  Args:
    lakeName: Storage account name used in the ``wasbs://`` URL.
    entity: Enum member whose ``value`` is indexed by ``TableParams`` to get
      the data source, table name, primary-key columns and modification-time
      column.
    entityName: Name of the resulting temp view.
    startDate: Lower bound for the data to read.
    customColumns: Column names passed to ``AppendCustomColumns``.

  Returns:
    None. The results are exposed through the registered temp views.
  """
  fileSystem = 'bronze'

  dataSource = entity.value[TableParams.DataSource]
  tableName = entity.value[TableParams.TableName]
  # pkColumns is spliced verbatim into `partition by`, so it is presumably a
  # comma-separated column list - verify against the Enums notebook.
  pkColumns = entity.value[TableParams.PkColumns]
  modificationTimeColumn = entity.value[TableParams.ModificationTimeColumn]

  year = str(startDate.year)
  month = str(startDate.month)
  day = str(startDate.day)

  # The 'T' replacement only matters when startDate is passed as an ISO string.
  startDateStr = str(startDate).replace('T', ' ')

  sourcePath = f"wasbs://{fileSystem}@{lakeName}.blob.core.windows.net/{dataSource}/{tableName}"

  # Read only the required partitions starting from startDate.
  partitionFilter = f"""
             ( Year > {year} )
          or ( Year = {year} and Month > {month} )
          or ( Year = {year} and Month = {month} and Day >= {day} )
    """
  # TODO: probably should compare against the startDate object instead of its string form.
  modificationFilter = f'{modificationTimeColumn} > "{startDateStr}"'

  rawData = spark.read \
    .format('parquet') \
    .load(sourcePath) \
    .where(partitionFilter) \
    .where(modificationFilter)

  # TODO: add an additional filter by partition column modificationTimeColumn >= startDateStr

  rawData = AppendCustomColumns(rawData, customColumns)

  # Use one name for both registering and querying the raw view so the lookup
  # does not depend on Spark resolving identifiers case-insensitively.
  # createOrReplaceTempView supersedes the deprecated registerTempTable.
  rawViewName = "Raw" + entityName
  rawData.createOrReplaceTempView(rawViewName)

  # Remove duplicates from the raw dataset: keep the latest row per primary key.
  data = spark.sql(f""" 
    with cteData
    as (
      select 
          row_number() over(partition by {pkColumns} order by {modificationTimeColumn} desc) as RowNum
        , *
      from {rawViewName}
    )
    select * 
    from cteData
    where RowNum = 1
  """)

  data.createOrReplaceTempView(entityName)

In [5]:
def PrepareDataBronzeSimple( 
      lakeName, 
      instance: Enum, 
      entity: BronzeTable, 
      entityName, 
      customColumns: list = [] 
    ) :
  """Register the full bronze table of one instance as temp view ``entityName``.

  Reads parquet from
  ``wasbs://bronze@<lakeName>.blob.core.windows.net/<dataSource>/<instance.name>/<tableName>``
  without any date filtering or de-duplication, appends the requested custom
  columns and registers the result as a temp view.

  Args:
    lakeName: Storage account name used in the ``wasbs://`` URL.
    instance: Enum member whose ``name`` is used as a path segment.
    entity: Enum member whose ``value`` is indexed by ``TableParams`` to get
      the data source and table name.
    entityName: Name of the resulting temp view.
    customColumns: Column names passed to ``AppendCustomColumns``.

  Returns:
    None. The result is exposed through the registered temp view.
  """
  fileSystem = 'bronze'

  dataSource = entity.value[TableParams.DataSource]
  tableName = entity.value[TableParams.TableName]

  sourcePath = "wasbs://"+ fileSystem +"@"+ lakeName +".blob.core.windows.net/"+ dataSource + "/" + instance.name + "/" + tableName
  data = spark.read.format('parquet').load(sourcePath)
  data = AppendCustomColumns(data, customColumns)
  # createOrReplaceTempView supersedes the deprecated registerTempTable.
  data.createOrReplaceTempView(entityName)
  

In [6]:
def PrepareDataBronzeSimpleWithoutInstance( 
      lakeName, 
      entity: BronzeTable, 
      entityName,
      customColumns: list = []
  ) :
  """Register a full bronze table that has no instance folder as temp view ``entityName``.

  Reads parquet from
  ``wasbs://bronze@<lakeName>.blob.core.windows.net/<dataSource>/<tableName>``
  without any date filtering or de-duplication, appends the requested custom
  columns and registers the result as a temp view.

  Args:
    lakeName: Storage account name used in the ``wasbs://`` URL.
    entity: Enum member whose ``value`` is indexed by ``TableParams`` to get
      the data source and table name.
    entityName: Name of the resulting temp view.
    customColumns: Column names passed to ``AppendCustomColumns``.

  Returns:
    None. The result is exposed through the registered temp view.
  """
  fileSystem = 'bronze'

  dataSource = entity.value[TableParams.DataSource]
  tableName = entity.value[TableParams.TableName]

  sourcePath = "wasbs://"+ fileSystem +"@"+ lakeName +".blob.core.windows.net/"+ dataSource + "/" + tableName
  data = spark.read.format('parquet').load(sourcePath)
  data = AppendCustomColumns(data, customColumns)
  # createOrReplaceTempView supersedes the deprecated registerTempTable.
  data.createOrReplaceTempView(entityName)

In [7]:
# def GetLastSyncDate(Entity, Source):
#   FileName = "wasbs://system@"+ lakeName +".blob.core.windows.net/"+ Entity + "/" + Entity + "_" + Source + ".json"
#   try:
#     df = spark.read.format('json').load(FileName)    
#     LastSyncDate = str(df.first().LastSyncDate).replace(' ','T')
#     if LastSyncDate.count('.')==0:
#       LastSyncDate += ".0"
    
# #     print(df)
# #     print(str(df.first().LastSyncDate))
# #     print(LastSyncDate)
#   except:
#       LastSyncDate = "2000-01-01T00:00:00.0"
#   return LastSyncDate

In [8]:
def GetLastSyncDate(Entity, Source):
  """Return the last sync timestamp stored for ``Entity``/``Source``.

  Reads the JSON file
  ``wasbs://system@<lakeName>.blob.core.windows.net/silver/<Entity>/<Entity>_<Source>.json``
  (``lakeName`` is a notebook-level global) and parses the ``LastSyncDate``
  field of its first row.

  Args:
    Entity: Entity name; used as folder name and file-name prefix.
    Source: Source name; used as file-name suffix.

  Returns:
    A ``datetime``. When the file cannot be read or its value cannot be
    parsed, ``datetime(2000, 1, 1)`` is returned so the caller performs a
    full load.
  """
  timestampFormat = '%Y-%m-%dT%H:%M:%S.%f'
  FileName = "wasbs://system@"+ lakeName +".blob.core.windows.net/silver/"+ Entity + "/" + Entity + "_" + Source + ".json"
  try:
    df = spark.read.format('json').load(FileName)
    LastSyncDate = str(df.first().LastSyncDate).replace(' ','T')
    # The format requires a fractional part; add one when the value has none.
    if '.' not in LastSyncDate:
      LastSyncDate += ".0"

    return datetime.strptime(LastSyncDate, timestampFormat)

  except Exception as error:
    # Best-effort fallback is kept, but only for ordinary errors:
    # KeyboardInterrupt/SystemExit now propagate, and the reason is logged
    # instead of being silently discarded.
    print(f'Last sync date for {Entity}/{Source} is unavailable ({type(error).__name__}: {error}); using 2000-01-01.')
    return datetime(2000, 1, 1)

In [9]:
def UpdateLastSyncDate(LastSyncDate, Entity, Source):
  """Persist ``LastSyncDate`` for ``Entity``/``Source`` as a one-field JSON file.

  The value is stringified, given a ``.0`` fractional part when it has none,
  and its spaces are replaced by ``T``. The resulting document
  ``{"LastSyncDate":"<value>"}`` overwrites
  ``wasbs://system@<lakeName>.blob.core.windows.net/silver/<Entity>/<Entity>_<Source>.json``
  (``lakeName`` is a notebook-level global) via ``dbutils.fs.put``.
  """
  stamp = str(LastSyncDate)
  if '.' not in stamp:
    stamp = stamp + ".0"
  stamp = stamp.replace(" ", "T")

  Content = "".join(['{"LastSyncDate":"', stamp, '"}'])
  targetPath = "".join([
    "wasbs://system@", lakeName, ".blob.core.windows.net/silver/",
    Entity, "/", Entity, "_", Source, ".json",
  ])

  print("Save ", Content, sep="")
  dbutils.fs.put(file=targetPath, contents=Content, overwrite=True)

In [10]:
# Static lookup (CompanyStatusId, CompanyStatusName), exposed as temp view tmpCompanyStatuses.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Active'
      union all select 1,	'Inactive'
      union all select 2,	'Suspended'
      union all select 3,	'OperationalPending'
      union all select 4,	'Deleted'
      union all select 100,	'Initializing'
      union all select 200,	'Migrating'
      union all select 201,	'Migrated'
    ) t ( CompanyStatusId, CompanyStatusName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpCompanyStatuses")

In [11]:
# Static lookup (FeatureId, FeatureName), exposed as temp view tmpFeatures.
data = spark.sql(""" 
    select *
    from(
                select 1, 'Data Collection > Build New Module'
      union all select 2, 'Data Collection > Edit Module'
      union all select 3, 'Data Collection > Edit Charts/Dashboard'
      union all select 4, 'Scheduled Reports'
      union all select 5, 'Dispatching > Orders Screen Filter Infographics'
      union all select 6, 'Dispatching > Orders Screen User List/Drag and Drop'
      union all select 7, 'Dispatching > Edit Dispatch Module'
      union all select 8, 'Timekeeping > Create Clock In/Out Forms'
      union all select 9, 'Auto Email Conditional'
      union all select 10, 'Tracking Included by default'
      union all select 11, 'Enable Groups'
      union all select 12, 'Dispatching > Add Dispatch Module From Library'
      union all select 13, 'Add Messaging Module'
      union all select 14, 'Enable Push-to-talk'
      union all select 15, 'Data Collection > Edit Submitted Data'
      union all select 16, 'Enable File import'
      union all select 17, 'Enable Ad hoc reports'
      union all select 18, 'Timekeeping > Add Timekeeping'
      union all select 19, 'VIN Lookup'
      union all select 20, 'Edit Submitted Data'
      union all select 21, 'API Access'
      union all select 22, 'Data Collection > Form Routing'
      union all select 23, 'Data Collection > Form Sharing'
      union all select 24, 'Emails > Standalone forms'
      union all select 25, 'Emails > Dispatching orders'
      union all select 26, 'Emails > Alerts'
      union all select 27, 'Data Collection > GetFormDefinitions'
      union all select 28, 'Data Collection > Edit address'
      union all select 29, 'Custom tracking interval'
      union all select 30, 'Dispatching > Scheduler'
      union all select 31, 'Custom lists'
    ) t ( FeatureId, FeatureName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpFeatures")

In [12]:
# Static lookup (DeviceTypeId, DeviceTypeName), exposed as temp view tmpDeviceTypes.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Handsets'
      union all select 1,	'CalAmp'
      union all select 2,	'GeoTab'
      union all select 3,	'Xirgo'
      union all select 4,	'Suntech'
    ) t ( DeviceTypeId, DeviceTypeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpDeviceTypes")

In [13]:
# Static lookup (StandaloneFormStatusId, StandaloneFormStatusName), exposed as temp view tmpStandaloneFormStatus.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Completed'
      union all select 1,	'Assigned'
      union all select 2,	'Unassigned'
      union all select 3,	'Declined'
      union all select 4,	'Cancelled'
    ) t ( StandaloneFormStatusId, StandaloneFormStatusName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpStandaloneFormStatus")

In [14]:
# Static lookup (TimekeepingStatusId, TimekeepingStatusName), exposed as temp view tmpTimekeepingStatusName.
data = spark.sql(""" 
    select *
    from(
                select 1,	'ClockIn'
      union all select 2,	'Break'
      union all select 3,	'Lunch'
    ) t ( TimekeepingStatusId, TimekeepingStatusName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpTimekeepingStatusName")

In [15]:
# Static lookup (OperationId, OperationName), exposed as temp view tmpAuditOperation.
data = spark.sql("""
    select *
    from(
                select 1, 'Create'
      union all select 2, 'Update'
      union all select 3, 'Delete'
      union all select 4, 'Access'
  )
  t ( OperationId, OperationName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpAuditOperation")

In [16]:
# Static lookup (ActionId, ActionName), exposed as temp view tmpAuditAction.
data = spark.sql("""
    select *
    from(
                select 0, 'Unknown'
      union all select 1, 'Create'
      union all select 2, 'Update'
      union all select 3, 'Delete'
      union all select 4, 'Activate'
      union all select 5, 'Deactivate'
      union all select 11, 'Cascade'
      union all select 12, 'Merge'
      union all select 13, 'Assign'
      union all select 14, 'Share'
      union all select 15, 'Retrieve' 
      union all select 16, 'Close'
      union all select 17, 'Cancel'
      union all select 18, 'Complete'
      union all select 20, 'Resolve'
      union all select 21, 'Reopen'
      union all select 22, 'Fulfill'
      union all select 23, 'Paid'
      union all select 24, 'Qualify'
      union all select 25, 'Disqualify'
      union all select 26, 'Submit'
      union all select 27, 'Reject'
      union all select 28, 'Approve'
      union all select 29, 'Invoice'
      union all select 30, 'Hold'
      union all select 31, 'Add Member'
      union all select 32, 'Remove Member'
      union all select 33, 'Associate Entities'
      union all select 34, 'Disassociate Entities'
      union all select 35, 'Add Members'
      union all select 36, 'Remove Members'
      union all select 37, 'Add Item'
      union all select 38, 'Remove Item'
      union all select 39, 'Add Substitute'
      union all select 40, 'Remove Substitute'
      union all select 41, 'Set State'
      union all select 42, 'Renew'
      union all select 43, 'Revise'
      union all select 44, 'Win'
      union all select 45, 'Lose'
      union all select 46, 'Internal Processing'
      union all select 47, 'Reschedule'
      union all select 48, 'Modify Share'
      union all select 49, 'Unshare'
      union all select 50, 'Book'
      union all select 51, 'Generate Quote From Opportunity'
      union all select 52, 'Add To Queue'
      union all select 53, 'Assign Role To Team'
      union all select 54, 'Remove Role From Team'
      union all select 55, 'Assign Role To User'
      union all select 56, 'Remove Role From User'
      union all select 57, 'Add Privileges to Role'
      union all select 58, 'Remove Privileges From Role'
      union all select 59, 'Replace Privileges In Role'
      union all select 60, 'Import Mappings'
      union all select 61, 'Clone'
      union all select 62, 'Send Direct Email'
      union all select 63, 'Enabled for organization'
      union all select 64, 'User Access via Web'
      union all select 65, 'User Access via Web Services'
      union all select 100, 'Delete Entity'
      union all select 101, 'Delete Attribute'
      union all select 102, 'Audit Change at Entity Level'
      union all select 103, 'Audit Change at Attribute Level'
      union all select 104, 'Audit Change at Org Level'
      union all select 105, 'Entity Audit Started'
      union all select 106, 'Attribute Audit Started'
      union all select 107, 'Audit Enabled'
      union all select 108, 'Entity Audit Stopped'
      union all select 109, 'Attribute Audit Stopped'
      union all select 110, 'Audit Disabled'
      union all select 111, 'Audit Log Deletion'
      union all select 112, 'User Access Audit Started'
      union all select 113, 'User Access Audit Stopped'
  ) t ( ActionId, ActionName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpAuditAction")

In [17]:
# Static lookup (UserRoleId, UserRoleName), exposed as temp view tmpUserRole.
data = spark.sql(""" 
    select *
    from(
                select 1,	'Administrator'
      union all select 3,	'User'
      union all select 5,	'Power User'
    ) t ( UserRoleId, UserRoleName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpUserRole")

In [18]:
# Static lookup (UserLicenseTypeId, UserLicenseTypeName), exposed as temp view tmpUserLicenseType.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Undefined'
      union all select 1,	'Regular'
      union all select 2,	'WebOnly'
      union all select 3,	'Employee'
    ) t ( UserLicenseTypeId, UserLicenseTypeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpUserLicenseType")

In [19]:
# Static lookup (StateCode, StateCodeName) for incidents, exposed as temp view tmpIncidentStateCode.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Active'
      union all select 1,	'Resolved'
      union all select 2,	'Canceled'
    ) t ( StateCode, StateCodeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentStateCode")

In [20]:
# Static lookup (StatusCode, StatusCodeName) for incidents, exposed as temp view tmpIncidentStatusCode.
data = spark.sql(""" 
    select *
    from(
                select 1,	'In Progress'
      union all select 2,	'On Hold'
      union all select 3,	'Waiting for Details'
      union all select 4,	'Researching'
      union all select 5,	'Problem Solved'
      union all select 1000,	'Information Provided'
      union all select 100000000, 'Non-Responsive'
      union all select 6, 'Canceled'
      union all select 2000, 'Merged'
    ) t ( StatusCode, StatusCodeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentStatusCode")

In [21]:
# Static lookup (StateCode, StateCodeName) for opportunities, exposed as temp view tmpOpportunityStateCode.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Open'
      union all select 1,	'Won'
      union all select 2,	'Lost'
    ) t ( StateCode, StateCodeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpOpportunityStateCode")

In [22]:
# Static lookup (StatusCode, StatusCodeName) for opportunities, exposed as temp view tmpOpportunityStatusCode.
data = spark.sql(""" 
    select *
    from(
                select 1,	'In Progress'
      union all select 2,	'On Hold'
      union all select 3,	'Won'
      union all select 100000001,	'Missing Information'
      union all select 100000004,	'Duplicate'
      union all select 100000000,	'Competitor'
      union all select 4, 'Canceled'
      union all select 5, 'Out-Sold'
      union all select 100000002, 'Deact User/users'
      union all select 100000003, 'Deact Account'
      union all select 100000005, 'User Already Active'
      union all select 100000006, 'Billing Not Verified'
      union all select 100000007, 'Account Suspended'
    ) t ( StatusCode, StatusCodeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpOpportunityStatusCode")

In [23]:
# Static lookup (LicenseStatusId, LicenseStatusName), exposed as temp view tmpLicenseStatus.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Active'
      union all select 1,	'Inactive'
      union all select 2,	'Suspended'
    ) t ( LicenseStatusId, LicenseStatusName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpLicenseStatus")

In [24]:
# Static lookup (LicenseDeactivationReasonId, LicenseDeactivationReasonName), exposed as temp view tmpLicenseDeactivationReason.
data = spark.sql(""" 
    select *
    from(
                select 0, 'Unexpected'
      union all select 1, 'MultipleLicensesWithPtn'
      union all select 2, 'LimitedWhenCarrierReceived'
      union all select 3, 'ChangeTier'
      union all select 4, 'ChangeBan'
      union all select 5, 'ChangePtn'
      union all select 6, 'ReplaceLimited'
      union all select 7, 'CarrierCancel'
      union all select 8, 'UserCancel'
      union all select 9, 'DeactivateOldPromo'
      union all select 10, 'ChangeBanAndPtn'
      union all select 11, 'NoActiveNonPromoLicenses'
      union all select 12, 'Expired'
      union all select 13, 'CompanyAbandoned'
      union all select 14, 'Suspended'
      union all select 15, 'SuspendedCompanyDeactivated'
      union all select 16, 'CompanyMerged'
      union all select 17, 'CompanyCreationFailed'
      union all select 18, 'ChangeLicenseParams'
      union all select 19, 'CarrierTierChange'
      union all select 20, 'CarrierPriceLevelChange'
      union all select 21, 'CarrierReactivation'
      union all select 22, 'Transfer'
    ) t ( LicenseDeactivationReasonId, LicenseDeactivationReasonName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpLicenseDeactivationReason")

In [25]:
# Static lookup (LicenseActivationReasonId, LicenseActivationReasonName), exposed as temp view tmpLicenseActivationReason.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Unexpected'
      union all select 1,	'CarrierInitiated'
      union all select 2,	'ChangeTier'
      union all select 3,	'LicenseChangeRequest'
      union all select 4,	'ReplaceLimited'
      union all select 5,	'ChangeBan'
      union all select 6,	'ChangePtn'
      union all select 7,	'Renew'
      union all select 8,	'CarrierNotification'
      union all select 9,	'DeactivateOldPromo'
      union all select 10,	'AddPromo'
      union all select 11,	'ChangeBanAndPtn'
      union all select 12,	'AppDirectNotification'
      union all select 13,	'OrphanModify'
      union all select 14,	'CompanyMerged'
      union all select 15,	'CarrierTierChange'
      union all select 16,	'CarrierPriceLevelChange'
      union all select 17,	'CarrierReactivation'
      union all select 18,	'LicenseTransfer'
      union all select 19,	'ResumeNotification'
      union all select 20,	'CompanyMigration'
    ) t ( LicenseActivationReasonId, LicenseActivationReasonName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpLicenseActivationReason")

In [26]:
# Static lookup (OrderStatusTypeId, OrderStatusType), exposed as temp view tmpOrderStatusTypes.
data = spark.sql(""" 
    select *
    from(
                select 0,	'New'
      union all select 1,	'Dispatched'
      union all select 2,	'InProgress'
      union all select 3,	'Complete'
      union all select 4,	'Cancelled'
    ) t ( OrderStatusTypeId, OrderStatusType )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpOrderStatusTypes")

In [27]:
# Static lookup (ShapeTypeId, ShapeTypeName), exposed as temp view tmpShapeTypes.
data = spark.sql(""" 
    select *
    from(
                select 0,	'Circle'
      union all select 1,	'Polygon'
    ) t ( ShapeTypeId, ShapeTypeName )
  """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpShapeTypes")

In [28]:
# Static lookup (SourceId, Source) for incidents, exposed as temp view tmpIncidentSource.
data = spark.sql("""
  select * from (
              select 1, 'Phone'
    union all select 2, 'Email'
    union all select 3, 'Web'
    union all select 100000000, 'After Hours'
    union all select 100000001, 'Qwest'
    union all select 100000002, 'Customer Portal'
    union all select 2483, 'Facebook'
    union all select 3986, 'Twitter')
    t (SourceId, Source) """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentSource")

In [29]:
# Static lookup (ProductId, Product) for incidents, exposed as temp view tmpIncidentProduct.
data = spark.sql("""
  select * from (
            select 100000003, 'Bell WFM'
  union all select 100000005, 'CAB Manager'
  union all select 100000000, 'Comet Tracker'
  union all select 100000002, 'Encore'
  union all select 100000007, 'Geotab'
  union all select 100000010, 'Maxis mWorkforce'
  union all select 100000008, 'Mobile Warrior ELD'
  union all select 100000006, 'Mobilise IT'
  union all select 100000009, 'VisTracks ELD'
  union all select 100000004, 'Vodafone MWE'
  union all select 100000001, 'Workforce Manager (AT&T)')
  t (ProductId, Product) """)

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentProduct")

In [30]:
# Static lookup (CaseTypeCode, CaseType) for incidents, exposed as temp view tmpIncidentCaseType.
data = spark.sql("""
  select * from (
            select 1, 'Question'
  union all select 2, 'Problem'
  union all select 3, 'Request')
  t (CaseTypeCode, CaseType)
""")

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentCaseType")

In [31]:
# Static lookup (PriorityCode, Priority) for incidents, exposed as temp view tmpIncidentPriority.
data = spark.sql("""
  select * from (
            select 1, 'High'
  union all select 2, 'Normal'
  union all select 3, 'Low')
  t (PriorityCode, Priority)
""")

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentPriority")

In [32]:
# Static lookup (SeverityCode, Severity) for incidents, exposed as temp view tmpIncidentSeverity.
data = spark.sql("""
  select * from (
            select 100000002, 'Normal'
  union all select 100000001, 'High'
  union all select 100000000, 'Urgent')
  t (SeverityCode, Severity)
""")

# createOrReplaceTempView supersedes the deprecated registerTempTable.
data.createOrReplaceTempView("tmpIncidentSeverity")