In [0]:
# importação de funções para manipulação de dados no DataFrame do Spark
from pyspark.sql.functions import to_date, first, col, round

In [0]:
# Preview the records found in every sub-directory and file under the 'bronze' folder
# (the three wildcards match the nested directory levels and the files inside them).
spark.read.format("parquet").load("dbfs:/databricks-results/bronze/*/*/*")

+-----+----------+----------+
|moeda|      taxa|      data|
+-----+----------+----------+
|  USD|  0.201077|2024-02-05|
|  GBP|  0.160427|2024-02-05|
|  EUR|  0.187198|2024-02-05|
|  JPY| 29.889123|2024-02-05|
|  CNY|  1.431367|2024-02-05|
|  CAD|  0.272331|2024-02-05|
|  ZAR|  3.833315|2024-02-05|
|  ARS|166.702274|2024-02-05|
|  EUR|  0.187288|2024-02-06|
|  JPY| 29.779429|2024-02-06|
|  CNY|  1.432392|2024-02-06|
|  CAD|  0.271765|2024-02-06|
|  ZAR|  3.793992|2024-02-06|
|  ARS|167.106885|2024-02-06|
|  USD|  0.201175|2024-02-07|
|  GBP|  0.159326|2024-02-07|
|  EUR|  0.186701|2024-02-07|
|  JPY| 29.792147|2024-02-07|
|  CNY|  1.430573|2024-02-07|
|  CAD|  0.270912|2024-02-07|
+-----+----------+----------+
only showing top 20 rows



In [0]:
# Load every parquet file under the 'bronze' folder into a single DataFrame.
df_dados_juntos = spark.read.load(
    "dbfs:/databricks-results/bronze/*/*/*",
    format="parquet",
)

+-----+---------+----------+
|moeda|     taxa|      data|
+-----+---------+----------+
|  USD| 0.201077|2024-02-05|
|  GBP| 0.160427|2024-02-05|
|  EUR| 0.187198|2024-02-05|
|  JPY|29.889123|2024-02-05|
|  CNY| 1.431367|2024-02-05|
+-----+---------+----------+
only showing top 5 rows



In [0]:
# Keep only the rows for the three currencies of interest.
moedas = ['USD', 'EUR', 'GBP']

df_moedas = df_dados_juntos.where(col("moeda").isin(*moedas))

+-----+--------+----------+
|moeda|    taxa|      data|
+-----+--------+----------+
|  USD|0.201077|2024-02-05|
|  GBP|0.160427|2024-02-05|
|  EUR|0.187198|2024-02-05|
|  EUR|0.187288|2024-02-06|
|  USD|0.201175|2024-02-07|
+-----+--------+----------+
only showing top 5 rows



In [0]:
# Convert the 'data' column from string to a proper date type.
df_moedas = df_moedas.withColumn(
    "data",
    to_date(col("data")),
)

In [0]:
# Build one row per date with one column per currency holding that day's rate,
# ordered from the most recent date to the oldest.
#
# The pivot values are passed explicitly so Spark does not have to run an extra
# job to discover the distinct currencies, and so the output schema always
# contains the selected currencies. sorted() keeps the alphabetical column
# order (EUR, GBP, USD) that the implicit pivot produced.
#
# NOTE(review): first() keeps an arbitrary rate if a (date, currency) pair has
# more than one row -- presumably there is a single rate per day; confirm
# against the bronze data.
resultado_taxas_conversao = (
    df_moedas
    .groupBy('data')
    .pivot('moeda', sorted(moedas))
    .agg(first('taxa'))
    .orderBy('data', ascending=False)
)

+----------+--------+--------+--------+
|      data|     EUR|     GBP|     USD|
+----------+--------+--------+--------+
|2024-04-04| 0.18374|0.157575|0.199664|
|2024-04-03|0.183063|0.156812|0.198409|
|2024-04-02|0.183317|0.156959|0.197398|
|2024-04-01|0.184242|0.157692| 0.19782|
|2024-03-31|0.184771|0.157804|0.199414|
|2024-03-30|0.184629|0.157923|0.199394|
|2024-03-27|0.185265|0.158791|0.200329|
|2024-03-25|0.185512|0.159099|0.201058|
|2024-03-24|0.184969|0.158678|0.199904|
|2024-03-23|0.185875|0.160247|0.201927|
|2024-03-22|0.183994|0.158625|0.199884|
|2024-03-21|0.185018|0.158734|0.200966|
|2024-03-20|0.184056|0.157292|  0.2013|
|2024-03-19|0.182958|0.156266|0.198748|
|2024-03-18|0.182815|0.156195|0.198784|
|2024-03-17|0.184115|0.157381|0.200361|
|2024-03-16|0.183642| 0.15721|0.200144|
|2024-03-15|0.183642|0.157185|0.200144|
|2024-03-14|0.183984|0.157103|0.200232|
|2024-03-12|0.184126|0.157247|0.201203|
+----------+--------+--------+--------+
only showing top 20 rows



In [0]:
# Create a new DataFrame holding the same columns and data as the rates table.
resultado_valores_em_reais = resultado_taxas_conversao.select(col('*'))

+----------+--------+--------+--------+
|      data|     EUR|     GBP|     USD|
+----------+--------+--------+--------+
|2024-04-04| 0.18374|0.157575|0.199664|
|2024-04-03|0.183063|0.156812|0.198409|
|2024-04-02|0.183317|0.156959|0.197398|
|2024-04-01|0.184242|0.157692| 0.19782|
|2024-03-31|0.184771|0.157804|0.199414|
+----------+--------+--------+--------+
only showing top 5 rows



In [0]:
# Compute the equivalent value in R$ for each selected currency: the reciprocal
# of its rate, rounded to 4 decimal places. Columns that are not in `moedas`
# (e.g. 'data') pass through unchanged and the column order is preserved.
#
# The result is rebuilt from `resultado_taxas_conversao` rather than from
# `resultado_valores_em_reais` itself, so re-running this cell always yields
# the same values instead of inverting numbers that were already inverted.
# A single select() replaces withColumn() in a loop, which adds one projection
# per call.
#
# NOTE: `round` is pyspark.sql.functions.round (imported at the top of the
# notebook), not the Python builtin. A currency listed in `moedas` that has no
# column in the rates table is skipped rather than raising.
resultado_valores_em_reais = resultado_taxas_conversao.select(
    *[
        round(1 / col(coluna), 4).alias(coluna) if coluna in moedas else col(coluna)
        for coluna in resultado_taxas_conversao.columns
    ]
)

In [0]:
# Reduce both DataFrames to a single partition; in this project that keeps things
# simple and ensures each one is written out as a single output file.
resultado_taxas_conversao, resultado_valores_em_reais = [
    df_resultado.coalesce(1)
    for df_resultado in (resultado_taxas_conversao, resultado_valores_em_reais)
]

In [0]:
# Save the transformed data to the 'prata' (silver) layer: each DataFrame is
# written as CSV with a header row, overwriting any previous output at its path.
saidas_prata = [
    (resultado_taxas_conversao, "dbfs:/databricks-results/prata/taxas_conversao"),
    (resultado_valores_em_reais, "dbfs:/databricks-results/prata/valores_reais"),
]

for df_saida, destino in saidas_prata:
    df_saida.write.save(destino, format="csv", mode="overwrite", header="true")


In [0]:
# List the files stored in the bronze directory for 2024/04/04.
arquivos_bronze = dbutils.fs.ls("dbfs:/databricks-results/bronze/2024/04/04")
display(arquivos_bronze)

path,name,size,modificationTime
dbfs:/databricks-results/bronze/2024/04/04/_SUCCESS,_SUCCESS,0,1712263055000
dbfs:/databricks-results/bronze/2024/04/04/_committed_1206669460558963195,_committed_1206669460558963195,424,1712246715000
dbfs:/databricks-results/bronze/2024/04/04/_committed_920614296692032253,_committed_920614296692032253,830,1712263054000
dbfs:/databricks-results/bronze/2024/04/04/_committed_vacuum3431864782692298856,_committed_vacuum3431864782692298856,96,1712263055000
dbfs:/databricks-results/bronze/2024/04/04/_started_920614296692032253,_started_920614296692032253,0,1712263054000
dbfs:/databricks-results/bronze/2024/04/04/part-00000-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-823-1-c000.snappy.parquet,part-00000-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-823-1-c000.snappy.parquet,1039,1712263054000
dbfs:/databricks-results/bronze/2024/04/04/part-00001-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-824-1-c000.snappy.parquet,part-00001-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-824-1-c000.snappy.parquet,1039,1712263054000
dbfs:/databricks-results/bronze/2024/04/04/part-00002-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-825-1-c000.snappy.parquet,part-00002-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-825-1-c000.snappy.parquet,1039,1712263054000
dbfs:/databricks-results/bronze/2024/04/04/part-00003-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-826-1-c000.snappy.parquet,part-00003-tid-920614296692032253-85e15ddc-e0f0-4dec-914e-1978cf4ba32c-826-1-c000.snappy.parquet,1039,1712263054000
