In [0]:

%run "/Workspace/my_databricks_learning/databricks_learning/Usecases/4_logistics_usecase_generic/generic_func_nb/generic_functions_nb"

In [0]:

def bonus_calculator(role:str,age:int):
    """Return the projected bonus rate for a staff member.

    Rules (the role comparison is case-insensitive):
      * driver older than 50   -> 0.15
      * driver younger than 30 -> 0.05
      * anyone else, including drivers aged 30 to 50 -> 0

    A missing role or a missing age yields 0 instead of raising, so a single
    null cell cannot fail the whole job when this function runs as a UDF.

    Args:
        role: Staff role name, or None.
        age: Staff age in years, or None.

    Returns:
        The bonus rate as 0.15, 0.05 or 0.
    """
    # Null role, non-driver role, or unknown age: no bonus can be derived.
    if role is None or role.upper() != "DRIVER" or age is None:
        return 0
    if age > 50:
        return 0.15
    if age < 30:
        return 0.05
    return 0
# Wrap bonus_calculator so it can be applied to DataFrame columns.
# NOTE(review): no return type is passed to udf(); assuming this is
# pyspark.sql.functions.udf, the resulting column defaults to string - confirm
# that a string-typed bonus is what downstream consumers expect.
bonus_udf = udf(bonus_calculator)



In [0]:
def mask_string(name:str):
    """Mask the middle of a string, keeping its first two and last characters.

    Every character between the second and the last one is replaced by "*",
    so the result has the same length as the input. None and strings of at
    most two characters are returned unchanged; a three-character string has
    nothing between its kept characters and therefore also comes back as-is.

    Args:
        name: Text to mask, or None.

    Returns:
        The masked text, or the input itself when there is nothing to mask.
    """
    # Nothing to hide for missing or very short values.
    if name is None or len(name) <= 2:
        return name

    hidden_count = len(name) - 3
    stars = "*" * hidden_count
    return name[:2] + stars + name[-1]
# Wrap mask_string so it can be applied to DataFrame columns.
# NOTE(review): no return type is passed to udf(); assuming this is
# pyspark.sql.functions.udf, the result column defaults to string, which
# matches what mask_string returns.
mask_string_udf=udf(mask_string)

In [0]:
%python
from pyspark.sql.functions import *
def staff_data_standardisation_func(df):
    """Standardise the staff DataFrame's values and column names.

    Steps, in order:
      1. lower-case ``role``;
      2. apply ``initcap`` to ``hub_location``;
      3. pass ``age`` and ``shipment_id`` through ``word_to_num_udf`` (defined
         outside this cell) and cast the result to int;
      4. rename ``first_name`` -> ``staff_first_name``,
         ``last_name`` -> ``staff_last_name`` and
         ``hub_location`` -> ``origin_hub_city``.

    Args:
        df: Staff DataFrame containing the columns named above.

    Returns:
        A new DataFrame with the standardised columns.
    """
    standardised = df.withColumn("role", lower(col("role")))
    standardised = standardised.withColumn(
        "hub_location", initcap(col("hub_location"))
    )

    # Both columns get the same conversion followed by an int cast.
    for numeric_column in ("age", "shipment_id"):
        standardised = standardised.withColumn(
            numeric_column,
            word_to_num_udf(col(numeric_column)).cast("int"),
        )

    column_renames = (
        ("first_name", "staff_first_name"),
        ("last_name", "staff_last_name"),
        ("hub_location", "origin_hub_city"),
    )
    for old_name, new_name in column_renames:
        standardised = standardised.withColumnRenamed(old_name, new_name)

    return standardised

In [0]:
def logistics_shipment_data_standarisation_func(df):
    """Standardise the logistics shipment DataFrame.

    Adds the constant columns ``domain`` ("Logistics") and ``is_expedited``
    (False), stamps ``ingestion_timestamp`` with the current timestamp,
    upper-cases ``vehicle_type``, parses ``shipment_date`` with the pattern
    ``yy-MM-dd``, rounds ``shipment_cost`` to two decimals, and casts
    ``shipment_weight_kg`` to double and ``is_expedited`` to boolean.

    Args:
        df: Shipment DataFrame containing the source columns named above.

    Returns:
        A new DataFrame with the added and standardised columns.
    """
    # (column name, expression) pairs, applied strictly in this order; the
    # final entry reads the is_expedited column created by the second one.
    column_steps = [
        ("domain", lit("Logistics")),
        ("is_expedited", lit(False)),
        ("ingestion_timestamp", lit(current_timestamp())),
        ("vehicle_type", upper(col("vehicle_type"))),
        ("shipment_date", to_date("shipment_date", "yy-MM-dd")),
        ("shipment_cost", round(col("shipment_cost"), 2)),
        ("shipment_weight_kg", col("shipment_weight_kg").cast("double")),
        ("is_expedited", col("is_expedited").cast("boolean")),
    ]

    standardised = df
    for column_name, expression in column_steps:
        standardised = standardised.withColumn(column_name, expression)
    return standardised

In [0]:
def staff_data_enrichedment_func(df):
    """Add load metadata and a full name, then keep the curated staff columns.

    ``load_dt`` is set to the current timestamp and ``full_name`` is
    ``staff_first_name`` and ``staff_last_name`` joined by a single space.
    The result is projected down to a fixed list of columns.

    Args:
        df: Staff DataFrame that already has the ``staff_*`` name columns plus
            the other selected columns.

    Returns:
        A new DataFrame holding only the selected columns.
    """
    full_name_expr = concat(
        col("staff_first_name"), lit(" "), col("staff_last_name")
    )
    output_columns = [
        'shipment_id',
        'full_name',
        'age',
        'role',
        'origin_hub_city',
        'vehicle_type',
        'load_dt',
        'data_source',
    ]

    enriched = df.withColumn("load_dt", lit(current_timestamp()))
    enriched = enriched.withColumn("full_name", full_name_expr)
    return enriched.select(*output_columns)

In [0]:
def logistics_shipment_data_enrichment_func(df):
    """Derive route, calendar, cost and order attributes for shipments.

    Added or overwritten columns:
      * ``route_segment`` / ``route_lane``: source and destination city joined
        by "-" and "->" respectively;
      * ``vehicle_identifier``: ``vehicle_type`` + "_" + ``shipment_id``;
      * ``shipment_year`` / ``shipment_month`` and ``ship_year`` /
        ``ship_month`` / ``ship_day``: parts of ``shipment_date``;
      * ``is_weekend``: True when ``dayofweek(shipment_date)`` is 1 or 7,
        otherwise False (also False when the date is null);
      * ``is_expedited``: True when ``shipment_status`` is ``IN_TRANSIT`` or
        ``DELIVERED``, otherwise False;
      * ``cost_per_kg``: ``shipment_cost`` / ``shipment_weight_kg`` using
        ``try_divide``;
      * ``days_since_shipment``: days between the current date and
        ``shipment_date``;
      * ``tax_amount``: ``shipment_cost`` multiplied by 0.18;
      * ``order_prefix``: first three characters of ``order_id``;
      * ``order_sequence``: ``order_id`` from the fourth character onwards.

    Args:
        df: Standardised shipment DataFrame containing the source columns
            referenced above.

    Returns:
        A new DataFrame with the derived columns.
    """
    shipment_date = col("shipment_date")
    shipment_cost = col("shipment_cost")
    shipment_status = col("shipment_status")
    order_id = col("order_id")
    source_city = col("source_city")
    destination_city = col("destination_city")
    weekday = dayofweek(shipment_date)

    return (
        df.withColumn("route_segment", concat(source_city, lit("-"), destination_city))
        .withColumn("vehicle_identifier", concat(col("vehicle_type"), lit("_"), col("shipment_id")))
        .withColumn("shipment_year", year(shipment_date))
        .withColumn("shipment_month", month(shipment_date))
        # when/otherwise (rather than the bare condition) keeps null dates as False.
        .withColumn("is_weekend", when((weekday == 1) | (weekday == 7), True).otherwise(False))
        .withColumn(
            "is_expedited",
            when((shipment_status == 'IN_TRANSIT') | (shipment_status == 'DELIVERED'), True).otherwise(False),
        )
        .withColumn("cost_per_kg", try_divide(shipment_cost, col("shipment_weight_kg")))
        .withColumn("days_since_shipment", datediff(current_date(), shipment_date))
        .withColumn("tax_amount", shipment_cost * 0.18)
        .withColumn("order_prefix", substring(order_id, 1, 3))
        # Column.substr accepts Column arguments on every PySpark version,
        # whereas functions.substring only accepts a Column length from 4.0.
        # Both arguments must be Columns, hence lit(4).
        .withColumn("order_sequence", order_id.substr(lit(4), length(order_id)))
        .withColumn("ship_year", year(shipment_date))
        .withColumn("ship_month", month(shipment_date))
        .withColumn("ship_day", day(shipment_date))
        .withColumn("route_lane", concat(source_city, lit("->"), destination_city))
    )

In [0]:
def staff_data_customize_func(df):
    """Add the projected bonus and mask the staff member's full name.

    ``projected_bonus`` is computed by ``bonus_udf`` from ``role`` and
    ``age``; ``full_name`` is then overwritten with the output of
    ``mask_string_udf``.

    Args:
        df: Staff DataFrame with ``role``, ``age`` and ``full_name`` columns.

    Returns:
        A new DataFrame with ``projected_bonus`` added and ``full_name`` masked.
    """
    bonus_expr = bonus_udf(col("role"), col("age"))
    with_bonus = df.withColumn("projected_bonus", bonus_expr)

    masked_name_expr = mask_string_udf(col("full_name"))
    return with_bonus.withColumn("full_name", masked_name_expr)

In [0]:
# Qualified name prefix for the silver tables; interpolated into the query below.
# NOTE(review): presumably "<catalog>.<schema>" - confirm against the workspace.
silver_db='logistics_proj.shipment_logistics_data'
# Ad-hoc read of the silver shipment table. The DataFrame is not assigned to a
# name, so it is only available as this cell's output.
spark.sql(f'select * from {silver_db}.logistics_shipment_silver_tbl')

In [0]:
def logistics_shipment_gold_curation(table):
    """Build the gold-layer projection of the shipment table.

    Runs one SELECT over ``table`` that renames ``shipment_id`` to
    ``log_shipment_id`` and ``vehicle_type`` to ``shipment_vehicle_type``,
    upper-cases ``source_city``, adds ``shipment_cost_inr`` (the cost as a
    string prefixed with the rupee sign) and ``is_high_value`` (True when
    ``shipment_cost`` exceeds 50000, otherwise False), and passes the other
    listed columns through unchanged.

    Args:
        table: Name of the table or view to read. It is interpolated into the
            SQL text as-is, so it must come from trusted code, not user input.

    Returns:
        The DataFrame produced by ``spark.sql`` for the query.
    """
    # Indian rupee sign (U+20B9), written as an escape so the literal cannot
    # be corrupted again if the notebook source is re-encoded.
    rupee_sign = "\u20b9"

    return spark.sql(f'''select
        shipment_id as log_shipment_id,
        order_id ,
        upper(source_city) as source_city ,
        destination_city ,
        shipment_status ,
        cargo_type ,
        vehicle_type as shipment_vehicle_type,
        payment_mode ,
        shipment_weight_kg,
        shipment_cost,
        concat('{rupee_sign}',cast(shipment_cost as string)) as shipment_cost_inr ,
        shipment_date,
        domain, 
        ingestion_timestamp,
        is_expedited,
        route_segment,
        vehicle_identifier,
        shipment_year,
        shipment_month,
        is_weekend,
        cost_per_kg,
        days_since_shipment,
        tax_amount,
        order_prefix,
        order_sequence,
        ship_year,
        ship_month,
        ship_day,
        route_lane,
        case when shipment_cost > 50000 then True else False end as is_high_value
        from {table}''')