In [0]:
# Target ADLS Gen2 account and container; later cells build abfss:// paths from these.
storage_account = "oliststorageaccountbth74"
container_name = "data"

# Service-principal credentials are read from the "kv-olist" secret scope,
# so no secret value appears in the notebook itself.
_secret_scope = "kv-olist"
client_id     = dbutils.secrets.get(scope=_secret_scope, key="secret-client-id")
tenant_id     = dbutils.secrets.get(scope=_secret_scope, key="secret-tenant-id")
client_secret = dbutils.secrets.get(scope=_secret_scope, key="secret-client-mdp")

# Every ABFS OAuth setting is keyed by "<setting>.<account>.dfs.core.windows.net",
# so the settings are listed once by prefix and the account host is appended below.
_account_host = f"{storage_account}.dfs.core.windows.net"
_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Apply the settings to the current Spark session, in the order listed above.
for _setting_prefix, _setting_value in _oauth_settings.items():
    spark.conf.set(f"{_setting_prefix}.{_account_host}", _setting_value)

### 1) olist_order_reviews_dataset cleaning (bronze CSV to silver Delta)

In [0]:
# Source (raw CSV in the bronze folder) and destination (Delta table in the silver folder).
reviews_csv_path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net/bronze/olist_order_reviews_dataset.csv"
reviews_delta_path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net/silver/olist_order_reviews_dataset"

# CSV parsing options:
#   header      - first line holds the column names
#   inferSchema - let Spark infer column types from the data
#   multiLine   - a quoted field may contain line breaks
#   quote/escape both set to '"' - a doubled quote inside a quoted field is a literal quote
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .load(reviews_csv_path)
)

# Replace whatever is at the silver path, including its schema, with the parsed data.
reviews_writer = (
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
reviews_writer.save(reviews_delta_path)




Databricks data profile. Run in Databricks to view.

### 2) olist_products_dataset cleaning

In [0]:
from pyspark.sql.functions import col


# The silver products table is read, patched, and written back to the same path.
products_path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net/silver/olist_products_dataset"

df = spark.read.format("delta").load(products_path)

# Columns whose nulls are replaced by 0.
# NOTE(review): "lenght" is spelled this way in the column names used here;
# presumably it matches the table schema - confirm before "fixing" the spelling.
zero_filled_columns = [
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

# Per-column replacement values: a placeholder category name plus 0 for the columns above.
null_replacements = {"product_category_name": "Unknown"}
null_replacements.update(dict.fromkeys(zero_filled_columns, 0))

df_filtered = df.fillna(null_replacements)

# Overwrite the table that was just read with the null-filled version.
products_writer = df_filtered.write.format("delta").mode("overwrite")
products_writer.save(products_path)


Databricks data profile. Run in Databricks to view.

### 3) olist_order_payments_dataset cleaning

In [0]:
from pyspark.sql.functions import col,when


# The silver payments table is read, filtered, and written back to the same path.
payments_path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net/silver/olist_order_payments_dataset"

df = spark.read.format("delta").load(payments_path)

# Keep only rows whose payment_type differs from the "not_defined" placeholder.
# NOTE(review): a NULL payment_type makes the comparison evaluate to NULL, so such
# rows are dropped as well - confirm that this is intended.
has_defined_payment_type = col("payment_type") != "not_defined"
df_filtered = df.where(has_defined_payment_type)

# Overwrite the table that was just read with the filtered rows.
payments_writer = df_filtered.write.format("delta").mode("overwrite")
payments_writer.save(payments_path)



### 4) olist_orders_dataset cleaning

In [0]:
from pyspark.sql.functions import col

# The silver orders table is read, filtered, and written back to the same path.
orders_path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net/silver/olist_orders_dataset"

df = spark.read.format("delta").load(orders_path)

# Keep only orders whose status is exactly "delivered".
is_delivered = col("order_status") == "delivered"
df_filtered = df.where(is_delivered)

# Overwriting the same path means orders with any other status are no longer
# in the silver table after this cell has run.
orders_writer = df_filtered.write.format("delta").mode("overwrite")
orders_writer.save(orders_path)


Databricks data profile. Run in Databricks to view.