In [None]:
import os

from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, when

# Azure Data Lake settings. Both values are mandatory: a missing environment
# variable raises KeyError right here instead of failing later.
account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
sas_token = os.environ["AZURE_STORAGE_SAS_TOKEN"]
account_url = "https://{}.blob.core.windows.net".format(account_name)

# HTTPS (Blob endpoint) base URLs of the Bronze and Silver containers.
bronze_path = account_url + "/bronze"
silver_path = account_url + "/silver"

# Create (or reuse) the Spark session used for the Bronze -> Silver job.
# NOTE(review): the reads/writes further down use abfss:// URIs, but no ABFS
# credentials are configured in this cell — confirm they are supplied elsewhere.
# NOTE(review): the log4j properties path looks like a placeholder — confirm.
spark = (
    SparkSession.builder
    .appName("Transformar Bronze para Silver")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.4")
    .config("spark.hadoop.fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
    .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/path/to/log4j.properties")
    .getOrCreate()
)

# List every .parquet blob in the Bronze container.
# The SAS token is handed to the SDK through `credential` rather than being
# concatenated into the URL by hand: manual concatenation produces a malformed
# query string ("??...") when the token already starts with "?". Surrounding
# whitespace and a leading "?" are stripped so both token spellings work.
blob_service_client = BlobServiceClient(
    account_url=account_url,
    credential=sas_token.strip().lstrip("?"),
)
container_client = blob_service_client.get_container_client("bronze")

try:
    # Names (full blob paths) of the Bronze tables to process.
    bronze_tables = [
        blob.name
        for blob in container_client.list_blobs()
        if blob.name.endswith(".parquet")
    ]
except Exception as e:
    print(f"Erro ao acessar o container ou listar blobs: {e}")
    # Re-raise: without the listing `bronze_tables` is undefined, so continuing
    # would only fail later with a NameError that hides the real cause.
    raise
else:
    print(f"Tabelas encontradas no container 'bronze': {bronze_tables}")

# Transform each Bronze table and write it to the Silver layer. The stage runs
# inside try/finally so the Spark session is stopped even when a table fails
# to read or write.
try:
    if not bronze_tables:
        print("Nenhum arquivo .parquet encontrado no container.")
    else:
        for table_path in bronze_tables:
            # Table name = last path segment without its trailing ".parquet".
            # Only the suffix is removed, so a ".parquet" occurring earlier in
            # the file name is kept.
            # NOTE(review): blobs in different folders that share a file name
            # map to the same Silver path and overwrite each other — confirm
            # the Bronze layout makes file names unique.
            file_name = table_path.rsplit("/", 1)[-1]
            if file_name.endswith(".parquet"):
                table_name = file_name[: -len(".parquet")]
            else:
                table_name = file_name

            # Read the Bronze table.
            df_bronze = spark.read.parquet(
                f"abfss://bronze@{account_name}.dfs.core.windows.net/{table_path}"
            )

            # Normalization: lowercase the values of every string column.
            # This runs before de-duplication so rows that differ only by
            # letter case collapse into one, and before the null fill so the
            # "N/A" placeholder is written exactly as spelled.
            df_silver = df_bronze
            for field in df_bronze.schema.fields:
                if field.dataType.simpleString() == "string":
                    df_silver = df_silver.withColumn(field.name, lower(col(field.name)))

            # Basic cleaning: drop duplicate rows, then replace nulls.
            # fill("N/A") only touches string columns and fill(0) only numeric
            # ones, so the two calls do not interfere with each other.
            df_silver = df_silver.dropDuplicates().na.fill("N/A").na.fill(0)

            # Destination of the transformed table in the Silver container.
            # Built directly as an abfss:// URI; nesting the https container
            # URL inside an abfss:// URI would produce an invalid path.
            silver_table_path = (
                f"abfss://silver@{account_name}.dfs.core.windows.net/{table_name}.parquet"
            )

            # Write to the data lake as Parquet, replacing any previous version.
            df_silver.write.mode("overwrite").parquet(silver_table_path)

            print(f"Tabela {table_name} transformada e salva em {silver_table_path}")
finally:
    # Always release the Spark session.
    spark.stop()


Erro ao acessar o container ou listar blobs: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:be49afd1-d01e-00ab-0f79-408585000000
Time:2024-11-27T03:05:51.0707790Z
ErrorCode:AuthenticationFailed
authenticationerrordetail:Signature did not match. String to sign used was datalakef87b4e367e786396
rwdlacupyx
bfqt
sco
2024-11-21T23:29:55Z
2024-12-22T07:29:55Z

https
2022-11-02


Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>AuthenticationFailed</Code><Message>Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:be49afd1-d01e-00ab-0f79-408585000000
Time:2024-11-27T03:05:51.0707790Z</Message><AuthenticationErrorDetail>Signature did not match. String to sign used was datalakef87b4e367e786396
rwdlacupyx
bfqt
sco
2024-11-21T23:29:55Z
2024-12-22T07:29:55Z

https
2022-11-02

</AuthenticationErrorDetail><