In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window

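# OpenLineage Spark integration on Maven Central (lists the versions available for `spark.jars.packages`):
# https://mvnrepository.com/artifact/io.openlineage/openlineage-spark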

# URL that the OpenLineage HTTP transport is pointed at.
marquez_host = 'http://marquez.marquez:80'

# Spark options that register the OpenLineage listener and tell it where to
# send events. Kept as a mapping so every setting is visible at a glance;
# insertion order matches the order in which they are applied to the builder.
openlineage_conf = {
    # Listener class that Spark instantiates for the session.
    'spark.extraListeners': 'io.openlineage.spark.agent.OpenLineageSparkListener',
    # Maven coordinates of the OpenLineage Spark integration package.
    'spark.jars.packages': 'io.openlineage:openlineage-spark:1.4.1',
    # Namespace passed to OpenLineage for this session.
    'spark.openlineage.namespace': 'andreyolv',
    # Send events over HTTP to the host configured above.
    'spark.openlineage.transport.type': 'http',
    'spark.openlineage.transport.url': marquez_host,
}

session_builder = SparkSession.builder.appName('sample_spark')
for conf_key, conf_value in openlineage_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()



In [11]:
# Load the products table from its Parquet directory and preview its rows.
df_produtos = spark.read.parquet('tables/produtos')

df_produtos.show()

+----------+------------+
|produto_id|produto_nome|
+----------+------------+
|         1|       Arroz|
|         2|      Feijão|
|         4|        Óleo|
+----------+------------+



In [10]:
# Load the sales table from its Parquet directory and preview its rows.
df_vendas = spark.read.parquet('tables/vendas')

df_vendas.show()

+----------+----------+-----+
|      data|produto_id|valor|
+----------+----------+-----+
|2022-01-01|         1|100.0|
|2022-01-02|         2| 50.0|
|2022-01-03|         3| 25.0|
+----------+----------+-----+



In [12]:
# Enrich each sale with its product name. The left join keeps every row of
# df_vendas; a sale whose produto_id is absent from df_produtos ends up with
# a null produto_nome.
df_final = df_vendas.join(
    df_produtos,
    on="produto_id",
    how="left",
)

df_final.show()

+----------+----------+-----+------------+
|produto_id|      data|valor|produto_nome|
+----------+----------+-----+------------+
|         1|2022-01-01|100.0|       Arroz|
|         2|2022-01-02| 50.0|      Feijão|
|         3|2022-01-03| 25.0|        null|
+----------+----------+-----+------------+



In [13]:
# Persist the joined result as Parquet, replacing whatever a previous run
# left at the destination path.
df_final.write.parquet("tables/final", mode='overwrite')