In [None]:
import os

from azure.storage.blob import BlobServiceClient
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower

# Azure Data Lake settings, read from the environment so that no secret is
# hard-coded in the notebook.
account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
sas_token = os.getenv("AZURE_STORAGE_SAS_TOKEN")

# Fail fast: os.getenv returns None for an unset variable, and that None would
# otherwise be formatted as the text "None" into the account URL and into the
# Spark configuration key built below.
if not account_name or not sas_token:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_SAS_TOKEN must be set"
    )

# Tolerate a token that was pasted together with its leading "?" so that the
# value stored here is always the bare query string.
sas_token = sas_token.lstrip("?")
account_url = f"https://{account_name}.blob.core.windows.net"

# Build a Delta-enabled Spark session. The fs.azure.sas.<container>.<account>
# key carries the SAS token used for wasbs:// access to the "silver" container.
try:
    builder = (
        SparkSession.builder
        .appName("Transform Bronze To Silver")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        .config(
            f"fs.azure.sas.silver.{account_name}.blob.core.windows.net",
            sas_token,
        )
    )

    # configure_spark_with_delta_pip takes the builder and returns a builder;
    # the session itself is only created by getOrCreate().
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
except Exception as e:
    # Nothing below can run without a session: report and propagate.
    print(f"Erro ao acessar configurar a sessão do spark: {e}")
    raise

# List every .parquet blob that will be processed.
# NOTE(review): the original comment describes this as the Bronze layer, but
# the container that is actually listed is "silver" - confirm which is intended.
#
# The SAS token is handed to the SDK as a credential instead of being spliced
# into the URL by hand, so the client is responsible for attaching it to each
# request (a token with or without a leading "?" is accepted this way).
blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)
container_client = blob_service_client.get_container_client("silver")

try:
    # Blob names (paths inside the container) that end in ".parquet".
    bronze_tables = [
        blob.name
        for blob in container_client.list_blobs()
        if blob.name.endswith(".parquet")
    ]
    print(f"Tabelas encontradas no container 'silver': {bronze_tables}")
except Exception as e:
    print(f"Erro ao acessar o container ou listar blobs: {e}")
    # Re-raise: without the listing `bronze_tables` would be undefined and the
    # code that consumes it would fail with a NameError that hides this error.
    raise

# Transform every listed table and persist the result in Delta format.
if not bronze_tables:
    print("Nenhum arquivo .parquet encontrado no container.")
else:
    container_root = f"wasbs://silver@{account_name}.blob.core.windows.net"

    # The table name is the blob name up to its first ".". Several blobs can
    # resolve to the same name, which would reprocess the same path, so the
    # names are de-duplicated while keeping their listing order.
    table_names = list(dict.fromkeys(path.split(".")[0] for path in bronze_tables))

    for table_name in table_names:
        try:
            # NOTE(review): source and destination are the same location, so
            # the Delta output is written over the path that was just read.
            # Confirm whether the input should come from a separate container.
            bronze_path = f"{container_root}/{table_name}.parquet"
            silver_path = f"{container_root}/{table_name}.parquet"

            df_bronze = spark.read.parquet(bronze_path)

            # Basic cleaning, step 1: drop fully duplicated rows.
            df_deduped = df_bronze.dropDuplicates()

            # Normalisation: lowercase the values of every string column.
            # This runs before the null fill so the "N/A" placeholder keeps
            # its casing; lower() leaves null values as null.
            string_columns = [
                field.name
                for field in df_deduped.schema.fields
                if field.dataType.simpleString() == "string"
            ]
            df_normalized = df_deduped
            for column in string_columns:
                df_normalized = df_normalized.withColumn(column, lower(col(column)))

            # Basic cleaning, step 2: null strings become "N/A" and null
            # numeric values become 0.
            df_silver = df_normalized.na.fill("N/A").na.fill(0)

            # Save the data in Delta format on Azure Blob Storage.
            df_silver.write.format("delta").mode("overwrite").save(silver_path)

            print(f"Tabela {table_name} transformada e salva em {silver_path}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next table.
            print(f"Erro ao processar a coleção {table_name}: {e}")


Erro ao acessar o container ou listar blobs: Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:be49afd1-d01e-00ab-0f79-408585000000
Time:2024-11-27T03:05:51.0707790Z
ErrorCode:AuthenticationFailed
authenticationerrordetail:Signature did not match. String to sign used was datalakef87b4e367e786396
rwdlacupyx
bfqt
sco
2024-11-21T23:29:55Z
2024-12-22T07:29:55Z

https
2022-11-02


Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>AuthenticationFailed</Code><Message>Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature.
RequestId:be49afd1-d01e-00ab-0f79-408585000000
Time:2024-11-27T03:05:51.0707790Z</Message><AuthenticationErrorDetail>Signature did not match. String to sign used was datalakef87b4e367e786396
rwdlacupyx
bfqt
sco
2024-11-21T23:29:55Z
2024-12-22T07:29:55Z

https
2022-11-02

</AuthenticationErrorDetail><