### DLT Wymagane biblioteki



In [0]:
import dlt
from pyspark.sql.functions import current_date, expr, col, lit, hash, regexp_extract, current_timestamp
from pyspark.sql import functions as F

## Parametry

In [0]:
# Pipeline parameters, all read from the Spark configuration.
# Falls back to "dev" when the key is not set; not referenced elsewhere in the visible code.
param_environment = spark.conf.get("param_environment", "dev")
# Source name used in both the bronze table reference and the silver table name; falls back to "".
param_source_name = spark.conf.get("param_source_name", "")
# Schema part of the silver table name. No fallback value is passed, so the key
# presumably has to be set in the pipeline configuration - TODO confirm.
schema = spark.conf.get("silver_schema")
# Selects the SCD variant; the silver table is only populated when this equals "scd1".
param_scd_type = spark.conf.get("param_scd_type", "scd1")
# Schema part of the bronze table that is read as the source. No fallback value is passed.
bronze_schema = spark.conf.get("bronze_schema")

## Pobieranie danych z bronze

- Definiuję tabelę bronze
- Pobieram dane ze schematu bronze.dbo przy użyciu spark.readStream


## Jak działa funkcja `silver_table_scd1` w Delta Live Tables

Funkcja dekorowana `@dlt.table` tworzy w potoku **Delta Live Tables** (DLT) tabelę warstwy *silver* o nazwie `silver.<schema>.<param_source_name>` i oznacza ją metadanymi:

* `quality = "silver"` – poziom jakości danych  
* `scd_type = "scd1"` – uproszczona implementacja Slowly Changing Dimension Type 1 

### Logika w funkcji

1. **Sprawdzenie parametru**  
   Funkcja działa tylko, gdy zewnętrzny parametr `param_scd_type` ma wartość `"scd1"`. W innym przypadku zwraca pusty DataFrame, dzięki czemu pipeline się nie wywraca.

2. **Odczyt danych z warstwy *bronze***  
   ```python
   df_bronze = dlt.read_stream(f"{bronze_schema}.{param_source_name}")
   ```


In [0]:
@dlt.table(
    name=f"{schema}.{param_source_name}",
    comment=f"Silver table for {param_source_name} with SCD Typ 1",
    table_properties={"quality": "silver", "scd_type": "scd1"},
)
def silver_table_scd1():
    """Build the silver-layer table for the configured source (SCD type 1).

    When ``param_scd_type`` equals ``"scd1"`` the bronze table
    ``{bronze_schema}.{param_source_name}`` is read as a stream, duplicate
    rows are dropped and the following columns are added:

    * ``hash_value`` - Spark ``hash`` over every bronze column except
      ``_rescued_data``.
    * ``rating_value`` - first run of digits (with optional decimal part)
      found in ``rating``, cast to double.
    * ``item_weight`` - replaced by the first run of digits (with optional
      decimal part) found in its own value, cast to double.
    * ``inserted_at`` / ``updated_at`` - current timestamp.
    * ``scd_is_current`` - literal ``True``.
    * ``scd_start_date`` - current date.
    * ``scd_end_date`` - the literal ``'9999-12-31'``.

    Returns:
        The enriched DataFrame described above, or - for any other value of
        ``param_scd_type`` - an empty DataFrame with a single ``dummy``
        string column.
    """
    # Any other SCD variant: hand back an empty placeholder instead of reading bronze.
    if param_scd_type != "scd1":
        return spark.createDataFrame([], schema="dummy STRING")

    # Source table
    bronze_stream = dlt.read_stream(f"{bronze_schema}.{param_source_name}")

    # Column names feeding the row hash; `_rescued_data` is left out.
    hash_inputs = [name for name in bronze_stream.columns if name != '_rescued_data']
    # Digits, optionally followed by a dot and more digits.
    number_pattern = r'(\d+\.?\d*)'

    deduplicated = bronze_stream.dropDuplicates()

    # `hash` is the pyspark.sql.functions import, not the Python builtin. It is
    # applied before `item_weight` is overwritten, so it sees the raw bronze value.
    with_hash = deduplicated.withColumn("hash_value", hash(*hash_inputs))

    # NOTE(review): rows whose text holds no digits - confirm how the cast to
    # double behaves for them in this pipeline's Spark configuration.
    with_numbers = (
        with_hash
        .withColumn("rating_value", regexp_extract('rating', number_pattern, 1).cast("double"))
        .withColumn("item_weight", regexp_extract('item_weight', number_pattern, 1).cast("double"))
    )

    with_audit = (
        with_numbers
        .withColumn("inserted_at", current_timestamp())
        .withColumn("updated_at", current_timestamp())
    )

    # NOTE(review): scd_end_date is a plain string literal while scd_start_date
    # comes from current_date() - confirm the type mismatch is intended.
    with_scd = (
        with_audit
        .withColumn("scd_is_current", lit(True))
        .withColumn("scd_start_date", current_date())
        .withColumn("scd_end_date", lit('9999-12-31'))
    )

    return with_scd