In [1]:
from pyspark.sql.functions import *

def CopyClientData(lakeName, instance: Enum):
  """Incrementally copy one client instance's sent messages into silver.SentMessages.

  Steps, in order:
    1. Read the stored sync date for ('sent_messages', 'client.<instance.name>').
    2. Call PrepareDataBronze for SentMessages and Users starting at that date
       (presumably this registers the temp views `tmpSentMessages` and
       `tmpUsers` that the query below reads -- confirm against PrepareDataBronze).
    3. Join messages to users to resolve FromUserName and expose the result
       as the temp view `SourceData`.
    4. MERGE `SourceData` into `silver.SentMessages`.
    5. Store max(ModifiedDateUtc) of the merged batch as the new sync date.

  Args:
    lakeName: Passed through unchanged to PrepareDataBronze.
    instance: Enum member; `.name` builds the sync-date key and `.value`
      is written to the `InstanceId` column.

  Returns:
    None. Returns early, without writing or updating the sync date, when
    the source batch has no non-null ModifiedDateUtc.
  """
  print("LakeName:", lakeName)
  print("Instance:", instance.name)
  print("InstanceId:", instance.value)

  # Key under which this instance's watermark is stored; used for both the
  # read at the start and the update at the end.
  syncKey = 'client.' + instance.name

  # get last date from the loaded dataset
  LastSyncDate = GetLastSyncDate('sent_messages', syncKey)
  print("Reading data since", LastSyncDate)

  print("Load SentMessages data")
  PrepareDataBronze (
    lakeName = lakeName,
    instance = instance,
    entity = BronzeTable.SentMessages,
    entityName = 'tmpSentMessages',
    startDate = LastSyncDate
  )

  print("Load Users data")
  PrepareDataBronze (
    lakeName = lakeName,
    instance = instance,
    entity = BronzeTable.Users,
    entityName = 'tmpUsers',
    startDate = LastSyncDate
  )

  print("Preparing data to write...")

  # FromUserName is null when the message has no sender, 'BAD DATA' when the
  # sender id has no matching user row, otherwise the user's full name.
  # `!s` keeps the exact str() rendering of the enum value.
  sourceData = spark.sql(f"""
    select
          sm.CompanyId
        , {instance.value!s} as InstanceId
        , sm.FromUserId
        , if(sm.FromUserId is null, null, if(u.FullName is null, 'BAD DATA', u.FullName)) as FromUserName
        , sm.MessageId
        , sm.SentTimeUtc
        , year(sm.SentTimeUtc) as Year
        , sm.SentTimeLocal
        , sm.DeletedChangeTimeUtc
        , sm.Deleted
        , sm.MessageSubject
        , sm.MessageContent
        , sm.ToUsers
        , sm.ModifiedDateUtc
        , sm.CreatedDateUtc
        , current_timestamp() as SilverModifiedUtc
    from tmpSentMessages sm
      left join tmpUsers u on u.UserId = sm.FromUserId and u.CompanyId = sm.CompanyId
""")

  # registerTempTable is deprecated; createOrReplaceTempView is its replacement.
  sourceData.createOrReplaceTempView("SourceData")

  # An aggregate without GROUP BY always yields exactly one row.
  aggRow = spark.sql("""
   select min(Year), max(Year), max(ModifiedDateUtc)
   from SourceData
  """).collect()[0]

  MinYear, MaxYear, NewSyncDate = aggRow

  if NewSyncDate is None:
    print('Source data is empty, nothing to write.')
    return

  print("Writing data...")

  print('MinYear', MinYear)
  print('MaxYear', MaxYear)
  print('LastSyncDate', NewSyncDate)

  # NOTE(review): if every source row has a null SentTimeUtc, MinYear/MaxYear
  # are None and the predicates below render as `t.Year >= None`; that case
  # is not handled here -- confirm SentTimeUtc is never null upstream.
  # The literal Year bounds presumably exist to limit how much of the target
  # table the merge has to scan -- verify against the table's partitioning.
  spark.sql(f"""
    merge into silver.SentMessages as t
    using SourceData as s
      on t.CompanyId = s.CompanyId
     and t.InstanceId = s.InstanceId
     and t.MessageId = s.MessageId
     and t.Year = s.Year
     and t.Year >= {MinYear}
     and t.Year <= {MaxYear}
    when matched then update set
          FromUserId = s.FromUserId
        , FromUserName = s.FromUserName
        , MessageId = s.MessageId
        , SentTimeUtc = s.SentTimeUtc
        , SentTimeLocal = s.SentTimeLocal
        , DeletedChangeTimeUtc = s.DeletedChangeTimeUtc
        , Deleted = s.Deleted
        , MessageSubject = s.MessageSubject
        , MessageContent = s.MessageContent
        , ToUsers = s.ToUsers
        , ModifiedDateUtc = s.ModifiedDateUtc
        , CreatedDateUtc = s.CreatedDateUtc
        , SilverModifiedUtc = s.SilverModifiedUtc
    when not matched then
      insert ( Year, CompanyId, InstanceId, MessageId, FromUserId, FromUserName, SentTimeUtc, SentTimeLocal, DeletedChangeTimeUtc
              , Deleted, MessageSubject, MessageContent, ToUsers, ModifiedDateUtc, CreatedDateUtc, SilverModifiedUtc )
      values ( Year, CompanyId, InstanceId, MessageId, FromUserId, FromUserName, SentTimeUtc, SentTimeLocal, DeletedChangeTimeUtc
              , Deleted, MessageSubject, MessageContent, ToUsers, ModifiedDateUtc, CreatedDateUtc, SilverModifiedUtc )
  """)

  print("Write completed.")

  print("Updating last sync date.")

  # Advance the watermark only after the merge has completed.
  UpdateLastSyncDate(NewSyncDate, 'sent_messages', syncKey)

  print("Done.")
  print("")