In [1]:
from pyspark.sql import SparkSession
from delta.tables import DeltaTable

In [2]:
# Build (or reuse) the Spark session for this job, registering the Delta Lake
# SQL extension and the Delta catalog implementation.
spark = (
    SparkSession.builder
    .appName('User Raw to Trusted')
    .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
    .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog')
    .getOrCreate()
)

In [3]:
# Display the session object as the cell output to confirm it was created.
spark

In [4]:
# Storage locations: `raw` is read as JSON input, `trusted` is the path of the
# Delta table that receives the merged users.
raw, trusted = (
    's3a://raw/user',
    's3a://trusted/user',
)

In [5]:
# Load the JSON files under the raw path; no explicit schema is supplied here.
df_raw = spark.read.format('json').load(raw)

In [6]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Keep exactly one record per `id`: rows sharing an `id` are numbered by
# descending `uid`, and only the row numbered 1 is retained.
dedup_window = Window.partitionBy('id').orderBy(col('uid').desc())

# The helper column "rank" stays on `df_raw` after the filter.
df_raw = (
    df_raw
    .withColumn("rank", row_number().over(dedup_window))
    .where(col("rank") == 1)
)

In [7]:
# Project the five user columns used downstream, keeping their names as-is
# (this also leaves the helper "rank" column behind).
df_merge = df_raw.select(
    'username',
    'id',
    'first_name',
    'last_name',
    'email',
)

In [8]:
# Preview the projected rows (up to 20, long values truncated).
# NOTE(review): the rendered output contains names and e-mail addresses;
# clear it before sharing the notebook if this is real user data.
df_merge.show(n=20, truncate=True)

+------------------+---+----------+-----------+--------------------+
|          username| id|first_name|  last_name|               email|
+------------------+---+----------+-----------+--------------------+
|       keven.bogan|  1|     Keven|      Bogan|keven.bogan@email...|
|       adan.reilly|  2|      Adan|     Reilly|adan.reilly@email...|
|       otha.legros|  4|      Otha|     Legros|otha.legros@email...|
|     gladis.zemlak|  5|    Gladis|     Zemlak|gladis.zemlak@ema...|
|  roseanna.carroll|  8|  Roseanna|    Carroll|roseanna.carroll@...|
|    linwood.miller| 10|   Linwood|     Miller|linwood.miller@em...|
|   sunny.lueilwitz| 11|     Sunny|  Lueilwitz|sunny.lueilwitz@e...|
|     andrea.harris| 12|    Andrea|     Harris|andrea.harris@ema...|
|   johnetta.carter| 13|  Johnetta|     Carter|johnetta.carter@e...|
|  eusebio.cummings| 14|   Eusebio|   Cummings|eusebio.cummings@...|
|   evita.mcdermott| 15|     Evita|  McDermott|evita.mcdermott@e...|
|    scott.schimmel| 16|     Scott

In [9]:
# First run: when no Delta table exists at the trusted path yet, create it
# from the current batch.
if not DeltaTable.isDeltaTable(spark, trusted):
    df_merge.write.format('delta').save(trusted)

# In either case, obtain a handle to the Delta table at the trusted path.
deltaTable = DeltaTable.forPath(spark, trusted)

In [10]:
## Verification
# NOTE(review): the 'binaryFile' reader is pointed at the trusted path, so this
# count is presumably the number of files read, not the number of user rows —
# confirm this is the intended check.
df_verificacao = (
    spark.read
    .format('binaryFile')
    .load(trusted)
)
df_verificacao.count()

15

In [11]:
# Upsert the batch into the trusted table, matching rows on `id`.
# Matched rows get their descriptive columns overwritten from the batch;
# unmatched rows are inserted with the same columns plus `id`.
shared_columns = ['first_name', 'last_name', 'email', 'username']
update_assignments = {name: f'raw.{name}' for name in shared_columns}
insert_assignments = {**update_assignments, 'id': 'raw.id'}

(
    deltaTable.alias('trusted')
    .merge(df_merge.alias('raw'), 'trusted.id = raw.id')
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)