In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count ,date_format,to_date,to_timestamp


from pyspark.sql import (
    DataFrame,
    SparkSession,
)
from pyspark.sql.functions import (
    col,
    datediff,
    lit,
    regexp_replace,
    round,
    to_date,
    trim,
    upper,
    when,
)
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)



# Create the Spark session used by the cells below
spark = SparkSession.builder \
    .appName("Orderlines  DataProcessing") \
    .getOrCreate()
# Order lines schema: every column is nullable. ORDER_QTY is read as a float;
# both delivery dates and DELIVERY_QTY are read as raw strings.
order_lines_schema = StructType(
        [
            StructField("ORDER_ID", StringType(), True),
            StructField("PRODUCT_ID", StringType(), True),
            StructField("ORDER_QTY", FloatType(), True),
            StructField("AGREED_DELIVERY_DATE", StringType(), True),
            StructField("ACTUAL_DELIVERY_DATE", StringType(), True),
            StructField("DELIVERY_QTY", StringType(), True),
        ]) 

# Read the order lines CSV (first row is the header) with the explicit schema
df = spark.read \
    .format("csv") \
    .option("header", True) \
    .schema(order_lines_schema) \
    .load("../data/order_lines.csv")




In [10]:
# Check the schema of the frame loaded from order_lines.csv
# NOTE(review): the printed label says "Customers" although `df` holds order
# lines — the message looks copy-pasted from another notebook; confirm and rename.
print("Customers DataFrame Schema:")
df.printSchema()

Customers DataFrame Schema:
root
 |-- ORDER_ID: string (nullable = true)
 |-- PRODUCT_ID: string (nullable = true)
 |-- ORDER_QTY: float (nullable = true)
 |-- AGREED_DELIVERY_DATE: string (nullable = true)
 |-- ACTUAL_DELIVERY_DATE: string (nullable = true)
 |-- DELIVERY_QTY: string (nullable = true)



In [11]:
# Preview the loaded order lines. The read cell binds the frame to `df`;
# `order_lines_df` is not assigned by any cell that runs before this one, so
# using it here fails with NameError after Restart & Run All.
df.show()

+-----------+----------+---------+--------------------+--------------------+------------+
|   ORDER_ID|PRODUCT_ID|ORDER_QTY|AGREED_DELIVERY_DATE|ACTUAL_DELIVERY_DATE|DELIVERY_QTY|
+-----------+----------+---------+--------------------+--------------------+------------+
|FMR34203601|  25891601|    110.0|Friday, March 4, ...|Friday, March 4, ...|         110|
|FMR32320302|  25891203|    347.0|Wednesday, March ...|Wednesday, March ...|         347|
|FMR33320501|  25891203|    187.0|Thursday, March 3...|Thursday, March 3...|         150|
|FMR34220601|  25891203|    235.0|Friday, March 4, ...|Friday, March 4, ...|         235|
|FMR33703603|  25891203|    176.0|Thursday, March 3...|Thursday, March 3...|         176|
|FMR33721603|  25891203|    345.0|Thursday, March 3...|Thursday, March 3...|         345|
|FMR33420203|  25891203|    138.0|Thursday, March 3...|Sunday, March 6, ...|         138|
|FMR34420402|  25891203|    381.0|Friday, March 4, ...|Saturday, March 5...|         381|
|FMR324034

In [12]:

def clean_order_qty_and_delivery_qty(df: DataFrame) -> DataFrame:
    """Cast ORDER_QTY and DELIVERY_QTY to integers.

    ORDER_QTY is cast directly. DELIVERY_QTY is stripped of every character
    that cannot belong to a signed decimal number and then converted through
    double to integer, so "150.0" becomes 150 (removing the decimal point
    would turn it into 1500) and a negative value keeps its sign instead of
    silently becoming positive.

    Args:
        df: DataFrame containing ORDER_QTY and DELIVERY_QTY columns.

    Returns:
        A new DataFrame with both columns cast to IntegerType.
    """
    # Keep digits, the decimal point and the minus sign; drop units, spaces,
    # thousands separators and any other noise.
    delivery_qty_text = regexp_replace(col("DELIVERY_QTY"), r"[^0-9.\-]", "")
    return (
        df.withColumn("ORDER_QTY", col("ORDER_QTY").cast(IntegerType()))
        # Cast through double first so a fractional part is truncated rather
        # than concatenated onto the integer part.
        .withColumn(
            "DELIVERY_QTY", delivery_qty_text.cast("double").cast(IntegerType())
        )
    )

In [13]:
def filter_invalid_quantities(df: DataFrame) -> DataFrame:
    """Keep only rows whose ORDER_QTY and DELIVERY_QTY are both greater than zero."""
    ordered_is_positive = col("ORDER_QTY") > 0
    delivered_is_positive = col("DELIVERY_QTY") > 0
    both_positive = ordered_is_positive & delivered_is_positive
    return df.filter(both_positive)


In [14]:
def clean_agreed_delivery_date(df: DataFrame) -> DataFrame:
    """Clean and parse AGREED_DELIVERY_DATE into a date with the year set to 2024.

    Three input layouts are recognised, tried in this order:

    1. Long form with a month name, e.g. "Friday, March 4, 2022".
    2. "MM/dd/yyyy".
    3. "yyyy-MM-dd".

    Args:
        df: DataFrame with a string AGREED_DELIVERY_DATE column.

    Returns:
        A new DataFrame whose AGREED_DELIVERY_DATE is a date. Values matching
        none of the layouts receive the default 2024-01-01.
    """
    raw_value = col("AGREED_DELIVERY_DATE")

    # Long form: take "<Month> <day>, <year>" from the raw text, replace the
    # year with 2024, then parse. This has to read the raw text because the
    # clean-up below removes the spaces, after which "March 4" can no longer
    # be parsed. The weekday is skipped so no day-of-week parsing is needed.
    long_form_text = regexp_replace(
        regexp_extract(raw_value, r"([A-Za-z]+ \d{1,2}, \d{4})", 1),
        r"\d{4}",
        "2024",
    )
    long_form_date = to_date(long_form_text, "MMMM d, yyyy")

    # Numeric forms: remove special characters, then replace any year with 2024
    numeric_text = regexp_replace(
        regexp_replace(raw_value, r"[^a-zA-Z0-9/,-]", ""), r"\d{4}", "2024"
    )
    us_date = to_date(numeric_text, "MM/dd/yyyy")
    iso_date = to_date(numeric_text, "yyyy-MM-dd")

    return df.withColumn(
        "AGREED_DELIVERY_DATE",
        when(long_form_date.isNotNull(), long_form_date)
        .when(us_date.isNotNull(), us_date)
        .when(iso_date.isNotNull(), iso_date)
        .otherwise(
            to_date(lit("2024-01-01"), "yyyy-MM-dd")
        ),  # Default value for invalid dates
    )


In [15]:
def clean_actual_delivery_date(df: DataFrame) -> DataFrame:
    """Clean and parse ACTUAL_DELIVERY_DATE into a date with the year set to 2024.

    Three input layouts are recognised, tried in this order:

    1. Long form with a month name, e.g. "Friday, March 4, 2022".
    2. "MM/dd/yyyy".
    3. "yyyy-MM-dd".

    Args:
        df: DataFrame with a string ACTUAL_DELIVERY_DATE column.

    Returns:
        A new DataFrame whose ACTUAL_DELIVERY_DATE is a date. Values matching
        none of the layouts receive the default 2024-01-01.
    """
    raw_value = col("ACTUAL_DELIVERY_DATE")

    # Long form: take "<Month> <day>, <year>" from the raw text, replace the
    # year with 2024, then parse. This has to read the raw text because the
    # clean-up below removes the spaces, after which "March 6" can no longer
    # be parsed. The weekday is skipped so no day-of-week parsing is needed.
    long_form_text = regexp_replace(
        regexp_extract(raw_value, r"([A-Za-z]+ \d{1,2}, \d{4})", 1),
        r"\d{4}",
        "2024",
    )
    long_form_date = to_date(long_form_text, "MMMM d, yyyy")

    # Numeric forms: remove special characters, then replace any year with 2024
    numeric_text = regexp_replace(
        regexp_replace(raw_value, r"[^a-zA-Z0-9/,-]", ""), r"\d{4}", "2024"
    )
    us_date = to_date(numeric_text, "MM/dd/yyyy")
    iso_date = to_date(numeric_text, "yyyy-MM-dd")

    return df.withColumn(
        "ACTUAL_DELIVERY_DATE",
        when(long_form_date.isNotNull(), long_form_date)
        .when(us_date.isNotNull(), us_date)
        .when(iso_date.isNotNull(), iso_date)
        .otherwise(
            to_date(lit("2024-01-01"), "yyyy-MM-dd")
        ),  # Default value for invalid dates
    )

In [16]:
def filter_unwanted_values(df: DataFrame, unwanted_values: list) -> DataFrame:
    """Drop every row in which any column, once trimmed, equals an unwanted value.

    Args:
        df: DataFrame to filter.
        unwanted_values: Placeholder values to reject (matched exactly).

    Returns:
        The DataFrame with one exclusion filter applied per column.
    """
    remaining = df
    for name in df.columns:
        is_placeholder = trim(col(name)).isin(unwanted_values)
        remaining = remaining.filter(~is_placeholder)
    return remaining


def drop_null_values(df: DataFrame) -> DataFrame:
    """Return the DataFrame without any row that contains a null."""
    # `df.na.drop()` is the DataFrameNaFunctions spelling of `df.dropna()`
    return df.na.drop()


def convert_column_names_to_lowercase(df: DataFrame) -> DataFrame:
    """Return the DataFrame with every column re-aliased to its lower-case name."""
    lowered = []
    for name in df.columns:
        lowered.append(col(name).alias(name.lower()))
    return df.select(lowered)


def add_derived_columns(df: DataFrame) -> DataFrame:
    """Append delay, completion-rate and Yes/No flag columns for analysis.

    Adds, in order: delivery_delay_days, delivery_completion_rate,
    is_on_time and is_complete_delivery. Expects lower-case column names.
    """
    # Days between the actual and the agreed delivery date
    delay_days = datediff(col("actual_delivery_date"), col("agreed_delivery_date"))
    # Delivered quantity as a percentage of the ordered quantity, 2 decimals
    completion_pct = round(col("delivery_qty") / col("order_qty") * 100, 2)
    # The flags read the two columns added just before them
    on_time_flag = when(col("delivery_delay_days") <= 0, "Yes").otherwise("No")
    complete_flag = when(col("delivery_completion_rate") >= 100, "Yes").otherwise("No")

    enriched = df.withColumn("delivery_delay_days", delay_days)
    enriched = enriched.withColumn("delivery_completion_rate", completion_pct)
    enriched = enriched.withColumn("is_on_time", on_time_flag)
    enriched = enriched.withColumn("is_complete_delivery", complete_flag)
    return enriched


def clean_order_lines_data(df: DataFrame) -> DataFrame:
    """Run the full order-lines cleaning pipeline and return the result.

    Steps, in order: id clean-up, quantity casts, quantity filter, agreed and
    actual date parsing, placeholder filter, null drop, lower-case column
    names, derived columns.
    """
    # Placeholder strings treated as missing data
    unwanted_values = ["NULL", "null", "NA", "none", "N/A"]

    # NOTE(review): the placeholder filter runs after the column clean-up
    # steps, so it sees the cleaned values rather than the raw text — confirm
    # this ordering is intended.
    column_cleaners = (
        clean_order_id_and_product_id,
        clean_order_qty_and_delivery_qty,
        filter_invalid_quantities,
        clean_agreed_delivery_date,
        clean_actual_delivery_date,
    )
    for step in column_cleaners:
        df = step(df)

    df = filter_unwanted_values(df, unwanted_values)

    finishing_steps = (
        drop_null_values,
        convert_column_names_to_lowercase,
        add_derived_columns,
    )
    for step in finishing_steps:
        df = step(df)

    return df

In [None]:
if __name__ == "__main__":
    # NOTE(review): this cell has no execution count and uses names that are
    # not defined anywhere in the visible notebook (load_order_lines_data,
    # glue_context, s3_input_path, s3_output_path, job) — presumably left over
    # from an AWS Glue job script; confirm before running.

    # Load and clean data
    order_lines_df = load_order_lines_data(glue_context, s3_input_path)
    cleaned_order_lines = clean_order_lines_data(order_lines_df)

    # Save the cleaned data to S3 as a CSV file
    cleaned_order_lines.write.mode("overwrite").format("csv").option(
        "header", "true"
    ).save(s3_output_path)

    # Commit the Glue job
    job.commit()

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count, date_format, to_date, to_timestamp
from pyspark.sql import (
    DataFrame,
    SparkSession,
)
from pyspark.sql.functions import (
    col,
    datediff,
    lit,
    regexp_replace,
    round,
    to_date,
    trim,
    upper,
    when,
)
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

def clean_order_id_and_product_id(df: DataFrame) -> DataFrame:
    """Normalise ORDER_ID to upper-case alphanumerics and PRODUCT_ID to an integer."""
    # ORDER_ID: trim, drop everything except letters and digits, then upper-case
    order_id_alnum = regexp_replace(trim(col("ORDER_ID")), r"[^a-zA-Z0-9]", "")
    # PRODUCT_ID: keep the digits only
    product_id_digits = regexp_replace(col("PRODUCT_ID"), r"[^0-9]", "")

    normalised = df.withColumn("ORDER_ID", upper(order_id_alnum))
    normalised = normalised.withColumn("PRODUCT_ID", product_id_digits)
    return normalised.withColumn("PRODUCT_ID", col("PRODUCT_ID").cast(IntegerType()))


def clean_order_qty_and_delivery_qty(df: DataFrame) -> DataFrame:
    """Cast ORDER_QTY and DELIVERY_QTY to integers.

    ORDER_QTY is cast directly. DELIVERY_QTY is stripped of every character
    that cannot belong to a signed decimal number and then converted through
    double to integer, so "150.0" becomes 150 (removing the decimal point
    would turn it into 1500) and a negative value keeps its sign instead of
    silently becoming positive.

    Args:
        df: DataFrame containing ORDER_QTY and DELIVERY_QTY columns.

    Returns:
        A new DataFrame with both columns cast to IntegerType.
    """
    # Keep digits, the decimal point and the minus sign; drop units, spaces,
    # thousands separators and any other noise.
    delivery_qty_text = regexp_replace(col("DELIVERY_QTY"), r"[^0-9.\-]", "")
    return (
        df.withColumn("ORDER_QTY", col("ORDER_QTY").cast(IntegerType()))
        # Cast through double first so a fractional part is truncated rather
        # than concatenated onto the integer part.
        .withColumn(
            "DELIVERY_QTY", delivery_qty_text.cast("double").cast(IntegerType())
        )
    )


def filter_invalid_quantities(df: DataFrame) -> DataFrame:
    """Keep only rows whose ORDER_QTY and DELIVERY_QTY are both greater than zero."""
    ordered_is_positive = col("ORDER_QTY") > 0
    delivered_is_positive = col("DELIVERY_QTY") > 0
    both_positive = ordered_is_positive & delivered_is_positive
    return df.filter(both_positive)


def clean_agreed_delivery_date(df: DataFrame) -> DataFrame:
    """Clean and parse AGREED_DELIVERY_DATE into a date with the year set to 2024.

    Three input layouts are recognised, tried in this order:

    1. Long form with a month name, e.g. "Friday, March 4, 2022".
    2. "MM/dd/yyyy".
    3. "yyyy-MM-dd".

    Args:
        df: DataFrame with a string AGREED_DELIVERY_DATE column.

    Returns:
        A new DataFrame whose AGREED_DELIVERY_DATE is a date. Values matching
        none of the layouts receive the default 2024-01-01.
    """
    raw_value = col("AGREED_DELIVERY_DATE")

    # Long form: take "<Month> <day>, <year>" from the raw text, replace the
    # year with 2024, then parse. This has to read the raw text because the
    # clean-up below cuts the value at the first comma, which leaves only the
    # weekday name of a long-form date.
    long_form_text = regexp_replace(
        regexp_extract(raw_value, r"([A-Za-z]+ \d{1,2}, \d{4})", 1),
        r"\d{4}",
        "2024",
    )
    long_form_date = to_date(long_form_text, "MMMM d, yyyy")

    # Numeric forms: remove the comma and anything after it, remove special
    # characters except / and -, then replace any year with 2024
    numeric_text = regexp_replace(
        regexp_replace(
            regexp_replace(raw_value, r",.*$", ""), r"[^a-zA-Z0-9/\-]", ""
        ),
        r"\d{4}",
        "2024",
    )
    us_date = to_date(numeric_text, "MM/dd/yyyy")
    iso_date = to_date(numeric_text, "yyyy-MM-dd")

    return df.withColumn(
        "AGREED_DELIVERY_DATE",
        when(long_form_date.isNotNull(), long_form_date)
        .when(us_date.isNotNull(), us_date)
        .when(iso_date.isNotNull(), iso_date)
        .otherwise(
            to_date(lit("2024-01-01"), "yyyy-MM-dd")
        ),  # Default value for invalid dates
    )


def clean_actual_delivery_date(df: DataFrame) -> DataFrame:
    """Clean and parse ACTUAL_DELIVERY_DATE into a date with the year set to 2024.

    Three input layouts are recognised, tried in this order:

    1. Long form with a month name, e.g. "Friday, March 4, 2022".
    2. "MM/dd/yyyy".
    3. "yyyy-MM-dd".

    Args:
        df: DataFrame with a string ACTUAL_DELIVERY_DATE column.

    Returns:
        A new DataFrame whose ACTUAL_DELIVERY_DATE is a date. Values matching
        none of the layouts receive the default 2024-01-01.
    """
    raw_value = col("ACTUAL_DELIVERY_DATE")

    # Long form: take "<Month> <day>, <year>" from the raw text, replace the
    # year with 2024, then parse. This has to read the raw text because the
    # clean-up below cuts the value at the first comma, which leaves only the
    # weekday name of a long-form date.
    long_form_text = regexp_replace(
        regexp_extract(raw_value, r"([A-Za-z]+ \d{1,2}, \d{4})", 1),
        r"\d{4}",
        "2024",
    )
    long_form_date = to_date(long_form_text, "MMMM d, yyyy")

    # Numeric forms: remove the comma and anything after it, remove special
    # characters except / and -, then replace any year with 2024
    numeric_text = regexp_replace(
        regexp_replace(
            regexp_replace(raw_value, r",.*$", ""), r"[^a-zA-Z0-9/\-]", ""
        ),
        r"\d{4}",
        "2024",
    )
    us_date = to_date(numeric_text, "MM/dd/yyyy")
    iso_date = to_date(numeric_text, "yyyy-MM-dd")

    return df.withColumn(
        "ACTUAL_DELIVERY_DATE",
        when(long_form_date.isNotNull(), long_form_date)
        .when(us_date.isNotNull(), us_date)
        .when(iso_date.isNotNull(), iso_date)
        .otherwise(
            to_date(lit("2024-01-01"), "yyyy-MM-dd")
        ),  # Default value for invalid dates
    )


def filter_unwanted_values(df: DataFrame, unwanted_values: list) -> DataFrame:
    """Drop every row in which any column, once trimmed, equals an unwanted value.

    Args:
        df: DataFrame to filter.
        unwanted_values: Placeholder values to reject (matched exactly).

    Returns:
        The DataFrame with one exclusion filter applied per column.
    """
    remaining = df
    for name in df.columns:
        is_placeholder = trim(col(name)).isin(unwanted_values)
        remaining = remaining.filter(~is_placeholder)
    return remaining


def drop_null_values(df: DataFrame) -> DataFrame:
    """Return the DataFrame without any row that contains a null."""
    # `df.na.drop()` is the DataFrameNaFunctions spelling of `df.dropna()`
    return df.na.drop()


def convert_column_names_to_lowercase(df: DataFrame) -> DataFrame:
    """Return the DataFrame with every column re-aliased to its lower-case name."""
    lowered = []
    for name in df.columns:
        lowered.append(col(name).alias(name.lower()))
    return df.select(lowered)


def add_derived_columns(df: DataFrame) -> DataFrame:
    """Append delay, completion-rate and Yes/No flag columns for analysis.

    Adds, in order: delivery_delay_days, delivery_completion_rate,
    is_on_time and is_complete_delivery. Expects lower-case column names.
    """
    # Days between the actual and the agreed delivery date
    delay_days = datediff(col("actual_delivery_date"), col("agreed_delivery_date"))
    # Delivered quantity as a percentage of the ordered quantity, 2 decimals
    completion_pct = round(col("delivery_qty") / col("order_qty") * 100, 2)
    # The flags read the two columns added just before them
    on_time_flag = when(col("delivery_delay_days") <= 0, "Yes").otherwise("No")
    complete_flag = when(col("delivery_completion_rate") >= 100, "Yes").otherwise("No")

    enriched = df.withColumn("delivery_delay_days", delay_days)
    enriched = enriched.withColumn("delivery_completion_rate", completion_pct)
    enriched = enriched.withColumn("is_on_time", on_time_flag)
    enriched = enriched.withColumn("is_complete_delivery", complete_flag)
    return enriched


def clean_order_lines_data(df: DataFrame) -> DataFrame:
    """Run the full order-lines cleaning pipeline and return the result.

    Steps, in order: id clean-up, quantity casts, quantity filter, agreed and
    actual date parsing, placeholder filter, null drop, lower-case column
    names, derived columns.
    """
    # Placeholder strings treated as missing data
    unwanted_values = ["NULL", "null", "NA", "none", "N/A"]

    # NOTE(review): the placeholder filter runs after ORDER_ID has been
    # upper-cased and stripped, so it sees the cleaned values rather than the
    # raw text (a raw "none" would arrive as "NONE") — confirm this is intended.
    column_cleaners = (
        clean_order_id_and_product_id,
        clean_order_qty_and_delivery_qty,
        filter_invalid_quantities,
        clean_agreed_delivery_date,
        clean_actual_delivery_date,
    )
    for step in column_cleaners:
        df = step(df)

    df = filter_unwanted_values(df, unwanted_values)

    finishing_steps = (
        drop_null_values,
        convert_column_names_to_lowercase,
        add_derived_columns,
    )
    for step in finishing_steps:
        df = step(df)

    return df


if __name__ == "__main__":
    # Create Spark Session with legacy time parser policy
    spark = SparkSession.builder \
        .appName("Orderlines DataProcessing") \
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
        .getOrCreate()

    try:
        # Define order lines schema (dates and DELIVERY_QTY stay raw strings
        # until the cleaning functions parse them)
        order_lines_schema = StructType(
            [
                StructField("ORDER_ID", StringType(), True),
                StructField("PRODUCT_ID", StringType(), True),
                StructField("ORDER_QTY", FloatType(), True),
                StructField("AGREED_DELIVERY_DATE", StringType(), True),
                StructField("ACTUAL_DELIVERY_DATE", StringType(), True),
                StructField("DELIVERY_QTY", StringType(), True),
            ]
        )

        # Read the CSV file
        df = spark.read \
            .format("csv") \
            .option("header", True) \
            .schema(order_lines_schema) \
            .load("../data/order_lines.csv")

        # Show sample of original data
        print("Original data sample:")
        df.show(5)

        # Clean the data
        cleaned_df = clean_order_lines_data(df)

        # Show sample of cleaned data
        print("Cleaned data sample:")
        cleaned_df.show(5)

        # Save cleaned data as CSV
        cleaned_df.write \
            .format("csv") \
            .option("header", "true") \
            .mode("overwrite") \
            .save("../data/cleaned_order_lines")
    finally:
        # Stop the session even when reading, cleaning or writing fails, so a
        # failed run does not leave a Spark session behind
        spark.stop()

Original data sample:
+-----------+----------+---------+--------------------+--------------------+------------+
|   ORDER_ID|PRODUCT_ID|ORDER_QTY|AGREED_DELIVERY_DATE|ACTUAL_DELIVERY_DATE|DELIVERY_QTY|
+-----------+----------+---------+--------------------+--------------------+------------+
|FMR34203601|  25891601|    110.0|Friday, March 4, ...|Friday, March 4, ...|         110|
|FMR32320302|  25891203|    347.0|Wednesday, March ...|Wednesday, March ...|         347|
|FMR33320501|  25891203|    187.0|Thursday, March 3...|Thursday, March 3...|         150|
|FMR34220601|  25891203|    235.0|Friday, March 4, ...|Friday, March 4, ...|         235|
|FMR33703603|  25891203|    176.0|Thursday, March 3...|Thursday, March 3...|         176|
+-----------+----------+---------+--------------------+--------------------+------------+
only showing top 5 rows

Cleaned data sample:
+-----------+----------+---------+--------------------+--------------------+------------+-------------------+-------------

                                                                                

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count, date_format, to_date, to_timestamp
from pyspark.sql import (
    DataFrame,
    SparkSession,
)
from pyspark.sql.functions import (
    col,
    datediff,
    lit,
    regexp_replace,
    regexp_extract,
    round,
    to_date,
    trim,
    upper,
    when,
)
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

def clean_order_id_and_product_id(df: DataFrame) -> DataFrame:
    """Normalise ORDER_ID to upper-case alphanumerics and PRODUCT_ID to an integer."""
    # ORDER_ID: trim, drop everything except letters and digits, then upper-case
    order_id_alnum = regexp_replace(trim(col("ORDER_ID")), r"[^a-zA-Z0-9]", "")
    # PRODUCT_ID: keep the digits only
    product_id_digits = regexp_replace(col("PRODUCT_ID"), r"[^0-9]", "")

    normalised = df.withColumn("ORDER_ID", upper(order_id_alnum))
    normalised = normalised.withColumn("PRODUCT_ID", product_id_digits)
    return normalised.withColumn("PRODUCT_ID", col("PRODUCT_ID").cast(IntegerType()))


def clean_order_qty_and_delivery_qty(df: DataFrame) -> DataFrame:
    """Cast ORDER_QTY and DELIVERY_QTY to integers.

    ORDER_QTY is cast directly. DELIVERY_QTY is stripped of every character
    that cannot belong to a signed decimal number and then converted through
    double to integer, so "150.0" becomes 150 (removing the decimal point
    would turn it into 1500) and a negative value keeps its sign instead of
    silently becoming positive.

    Args:
        df: DataFrame containing ORDER_QTY and DELIVERY_QTY columns.

    Returns:
        A new DataFrame with both columns cast to IntegerType.
    """
    # Keep digits, the decimal point and the minus sign; drop units, spaces,
    # thousands separators and any other noise.
    delivery_qty_text = regexp_replace(col("DELIVERY_QTY"), r"[^0-9.\-]", "")
    return (
        df.withColumn("ORDER_QTY", col("ORDER_QTY").cast(IntegerType()))
        # Cast through double first so a fractional part is truncated rather
        # than concatenated onto the integer part.
        .withColumn(
            "DELIVERY_QTY", delivery_qty_text.cast("double").cast(IntegerType())
        )
    )


def filter_invalid_quantities(df: DataFrame) -> DataFrame:
    """Keep only rows whose ORDER_QTY and DELIVERY_QTY are both greater than zero."""
    ordered_is_positive = col("ORDER_QTY") > 0
    delivered_is_positive = col("DELIVERY_QTY") > 0
    both_positive = ordered_is_positive & delivered_is_positive
    return df.filter(both_positive)


def clean_agreed_delivery_date(df: DataFrame) -> DataFrame:
    """Parse long-form AGREED_DELIVERY_DATE text into a date column.

    The "<Weekday>, <Month> <day>, <year>" portion is extracted from the raw
    text and then parsed with the "EEEE, MMMM d, yyyy" pattern.
    """
    target = "AGREED_DELIVERY_DATE"
    # Capture group 1 is the whole "<Weekday>, <Month> <day>, <year>" span
    long_form_text = regexp_extract(
        col(target), r"([A-Za-z]+, [A-Za-z]+ \d+, \d{4})", 1
    )
    extracted = df.withColumn(target, long_form_text)
    # NOTE(review): the weekday token in this pattern is presumably why the
    # session in this cell is built with the LEGACY time parser policy —
    # confirm before running on a differently configured session.
    return extracted.withColumn(target, to_date(col(target), "EEEE, MMMM d, yyyy"))


def clean_actual_delivery_date(df: DataFrame) -> DataFrame:
    """Parse long-form ACTUAL_DELIVERY_DATE text into a date column.

    The "<Weekday>, <Month> <day>, <year>" portion is extracted from the raw
    text and then parsed with the "EEEE, MMMM d, yyyy" pattern.
    """
    target = "ACTUAL_DELIVERY_DATE"
    # Capture group 1 is the whole "<Weekday>, <Month> <day>, <year>" span
    long_form_text = regexp_extract(
        col(target), r"([A-Za-z]+, [A-Za-z]+ \d+, \d{4})", 1
    )
    extracted = df.withColumn(target, long_form_text)
    # NOTE(review): the weekday token in this pattern is presumably why the
    # session in this cell is built with the LEGACY time parser policy —
    # confirm before running on a differently configured session.
    return extracted.withColumn(target, to_date(col(target), "EEEE, MMMM d, yyyy"))


def filter_unwanted_values(df: DataFrame, unwanted_values: list) -> DataFrame:
    """Drop every row in which any column, once trimmed, equals an unwanted value.

    Args:
        df: DataFrame to filter.
        unwanted_values: Placeholder values to reject (matched exactly).

    Returns:
        The DataFrame with one exclusion filter applied per column.
    """
    remaining = df
    for name in df.columns:
        is_placeholder = trim(col(name)).isin(unwanted_values)
        remaining = remaining.filter(~is_placeholder)
    return remaining


def drop_null_values(df: DataFrame) -> DataFrame:
    """Return the DataFrame without any row that contains a null."""
    # `df.na.drop()` is the DataFrameNaFunctions spelling of `df.dropna()`
    return df.na.drop()


def convert_column_names_to_lowercase(df: DataFrame) -> DataFrame:
    """Return the DataFrame with every column re-aliased to its lower-case name."""
    lowered = []
    for name in df.columns:
        lowered.append(col(name).alias(name.lower()))
    return df.select(lowered)


def add_derived_columns(df: DataFrame) -> DataFrame:
    """Append delay, completion-rate and Yes/No flag columns for analysis.

    Adds, in order: delivery_delay_days, delivery_completion_rate,
    is_on_time and is_complete_delivery. Expects lower-case column names.
    """
    # Days between the actual and the agreed delivery date
    delay_days = datediff(col("actual_delivery_date"), col("agreed_delivery_date"))
    # Delivered quantity as a percentage of the ordered quantity, 2 decimals
    completion_pct = round(col("delivery_qty") / col("order_qty") * 100, 2)
    # The flags read the two columns added just before them
    on_time_flag = when(col("delivery_delay_days") <= 0, "Yes").otherwise("No")
    complete_flag = when(col("delivery_completion_rate") >= 100, "Yes").otherwise("No")

    enriched = df.withColumn("delivery_delay_days", delay_days)
    enriched = enriched.withColumn("delivery_completion_rate", completion_pct)
    enriched = enriched.withColumn("is_on_time", on_time_flag)
    enriched = enriched.withColumn("is_complete_delivery", complete_flag)
    return enriched


def clean_order_lines_data(df: DataFrame) -> DataFrame:
    """Run the full order-lines cleaning pipeline and return the result.

    Steps, in order: id clean-up, quantity casts, quantity filter, agreed and
    actual date parsing, placeholder filter, null drop, lower-case column
    names, derived columns.
    """
    # Placeholder strings treated as missing data
    unwanted_values = ["NULL", "null", "NA", "none", "N/A"]

    # NOTE(review): the placeholder filter runs after ORDER_ID has been
    # upper-cased and stripped, so it sees the cleaned values rather than the
    # raw text (a raw "none" would arrive as "NONE") — confirm this is intended.
    column_cleaners = (
        clean_order_id_and_product_id,
        clean_order_qty_and_delivery_qty,
        filter_invalid_quantities,
        clean_agreed_delivery_date,
        clean_actual_delivery_date,
    )
    for step in column_cleaners:
        df = step(df)

    df = filter_unwanted_values(df, unwanted_values)

    finishing_steps = (
        drop_null_values,
        convert_column_names_to_lowercase,
        add_derived_columns,
    )
    for step in finishing_steps:
        df = step(df)

    return df


if __name__ == "__main__":
    # Create Spark Session with legacy time parser policy
    spark = SparkSession.builder \
        .appName("Orderlines DataProcessing") \
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
        .getOrCreate()

    try:
        # Define order lines schema (dates and DELIVERY_QTY stay raw strings
        # until the cleaning functions parse them)
        order_lines_schema = StructType(
            [
                StructField("ORDER_ID", StringType(), True),
                StructField("PRODUCT_ID", StringType(), True),
                StructField("ORDER_QTY", FloatType(), True),
                StructField("AGREED_DELIVERY_DATE", StringType(), True),
                StructField("ACTUAL_DELIVERY_DATE", StringType(), True),
                StructField("DELIVERY_QTY", StringType(), True),
            ]
        )

        # Read the CSV file
        df = spark.read \
            .format("csv") \
            .option("header", True) \
            .schema(order_lines_schema) \
            .load("../data/order_lines.csv")

        # Show sample of original data
        print("Original data sample:")
        df.show(5)

        # Clean the data
        cleaned_df = clean_order_lines_data(df)

        # Show sample of cleaned data
        print("Cleaned data sample:")
        cleaned_df.show(5)

        # Save cleaned data as CSV
        cleaned_df.write \
            .format("csv") \
            .option("header", "true") \
            .mode("overwrite") \
            .save("../data/cleaned_order_lines")
    finally:
        # Stop the session even when reading, cleaning or writing fails, so a
        # failed run does not leave a Spark session behind
        spark.stop()

25/03/18 20:23:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Original data sample:
+-----------+----------+---------+--------------------+--------------------+------------+
|   ORDER_ID|PRODUCT_ID|ORDER_QTY|AGREED_DELIVERY_DATE|ACTUAL_DELIVERY_DATE|DELIVERY_QTY|
+-----------+----------+---------+--------------------+--------------------+------------+
|FMR34203601|  25891601|    110.0|Friday, March 4, ...|Friday, March 4, ...|         110|
|FMR32320302|  25891203|    347.0|Wednesday, March ...|Wednesday, March ...|         347|
|FMR33320501|  25891203|    187.0|Thursday, March 3...|Thursday, March 3...|         150|
|FMR34220601|  25891203|    235.0|Friday, March 4, ...|Friday, March 4, ...|         235|
|FMR33703603|  25891203|    176.0|Thursday, March 3...|Thursday, March 3...|         176|
+-----------+----------+---------+--------------------+--------------------+------------+
only showing top 5 rows

Cleaned data sample:
+-----------+----------

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count, date_format, to_date, to_timestamp
from pyspark.sql import (
    DataFrame,
    SparkSession,
)
from pyspark.sql.functions import (
    col,
    concat,
    datediff,
    lit,
    lpad,
    regexp_replace,
    regexp_extract,
    round,
    to_date,
    trim,
    upper,
    when,
    year,
    month,
    dayofmonth,
)
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

def clean_order_id_and_product_id(df: DataFrame) -> DataFrame:
    """Normalise ORDER_ID to upper-case alphanumerics and PRODUCT_ID to an integer."""
    # ORDER_ID: trim, drop everything except letters and digits, then upper-case
    order_id_alnum = regexp_replace(trim(col("ORDER_ID")), r"[^a-zA-Z0-9]", "")
    # PRODUCT_ID: keep the digits only
    product_id_digits = regexp_replace(col("PRODUCT_ID"), r"[^0-9]", "")

    normalised = df.withColumn("ORDER_ID", upper(order_id_alnum))
    normalised = normalised.withColumn("PRODUCT_ID", product_id_digits)
    return normalised.withColumn("PRODUCT_ID", col("PRODUCT_ID").cast(IntegerType()))


def clean_order_qty_and_delivery_qty(df: DataFrame) -> DataFrame:
    """Cast ORDER_QTY and DELIVERY_QTY to integers.

    ORDER_QTY is cast directly. DELIVERY_QTY is stripped of every character
    that cannot belong to a signed decimal number and then converted through
    double to integer, so "150.0" becomes 150 (removing the decimal point
    would turn it into 1500) and a negative value keeps its sign instead of
    silently becoming positive.

    Args:
        df: DataFrame containing ORDER_QTY and DELIVERY_QTY columns.

    Returns:
        A new DataFrame with both columns cast to IntegerType.
    """
    # Keep digits, the decimal point and the minus sign; drop units, spaces,
    # thousands separators and any other noise.
    delivery_qty_text = regexp_replace(col("DELIVERY_QTY"), r"[^0-9.\-]", "")
    return (
        df.withColumn("ORDER_QTY", col("ORDER_QTY").cast(IntegerType()))
        # Cast through double first so a fractional part is truncated rather
        # than concatenated onto the integer part.
        .withColumn(
            "DELIVERY_QTY", delivery_qty_text.cast("double").cast(IntegerType())
        )
    )


def filter_invalid_quantities(df: DataFrame) -> DataFrame:
    """Keep only rows whose ORDER_QTY and DELIVERY_QTY are both greater than zero."""
    ordered_is_positive = col("ORDER_QTY") > 0
    delivered_is_positive = col("DELIVERY_QTY") > 0
    both_positive = ordered_is_positive & delivered_is_positive
    return df.filter(both_positive)


def clean_agreed_delivery_date(df: DataFrame) -> DataFrame:
    """Parse long-form AGREED_DELIVERY_DATE text and force the year to 2024.

    The value is extracted as "<Weekday>, <Month> <day>, <year>", parsed to a
    date, then rebuilt as "2024-MM-dd" from its month and day.

    NOTE(review): the year is overwritten independently for each date column,
    so a pair of dates that straddles a year boundary may no longer difference
    correctly downstream — confirm this is acceptable.
    """
    target = "AGREED_DELIVERY_DATE"

    def two_digits(part):
        # Zero-pad a numeric date part to width 2, e.g. 3 -> "03"
        return lpad(part.cast("string"), 2, "0")

    # Step 1: keep only the long-form date span of the raw text
    extracted = df.withColumn(
        target,
        regexp_extract(col(target), r"([A-Za-z]+, [A-Za-z]+ \d+, \d{4})", 1),
    )
    # Step 2: parse it into a real date
    parsed = extracted.withColumn(target, to_date(col(target), "EEEE, MMMM d, yyyy"))
    # Step 3: reassemble month and day under the fixed year 2024
    rebuilt_text = concat(
        lit("2024-"),
        two_digits(month(col(target))),
        lit("-"),
        two_digits(dayofmonth(col(target))),
    )
    return parsed.withColumn(target, to_date(rebuilt_text, "yyyy-MM-dd"))


def clean_actual_delivery_date(df: DataFrame) -> DataFrame:
    """Parse long-form ACTUAL_DELIVERY_DATE text and force the year to 2024.

    The value is extracted as "<Weekday>, <Month> <day>, <year>", parsed to a
    date, then rebuilt as "2024-MM-dd" from its month and day.

    NOTE(review): the year is overwritten independently for each date column,
    so a pair of dates that straddles a year boundary may no longer difference
    correctly downstream — confirm this is acceptable.
    """
    target = "ACTUAL_DELIVERY_DATE"

    def two_digits(part):
        # Zero-pad a numeric date part to width 2, e.g. 3 -> "03"
        return lpad(part.cast("string"), 2, "0")

    # Step 1: keep only the long-form date span of the raw text
    extracted = df.withColumn(
        target,
        regexp_extract(col(target), r"([A-Za-z]+, [A-Za-z]+ \d+, \d{4})", 1),
    )
    # Step 2: parse it into a real date
    parsed = extracted.withColumn(target, to_date(col(target), "EEEE, MMMM d, yyyy"))
    # Step 3: reassemble month and day under the fixed year 2024
    rebuilt_text = concat(
        lit("2024-"),
        two_digits(month(col(target))),
        lit("-"),
        two_digits(dayofmonth(col(target))),
    )
    return parsed.withColumn(target, to_date(rebuilt_text, "yyyy-MM-dd"))


def filter_unwanted_values(df: DataFrame, unwanted_values: list) -> DataFrame:
    """Drop every row in which any column, once trimmed, equals an unwanted value.

    Args:
        df: DataFrame to filter.
        unwanted_values: Placeholder values to reject (matched exactly).

    Returns:
        The DataFrame with one exclusion filter applied per column.
    """
    remaining = df
    for name in df.columns:
        is_placeholder = trim(col(name)).isin(unwanted_values)
        remaining = remaining.filter(~is_placeholder)
    return remaining


def drop_null_values(df: DataFrame) -> DataFrame:
    """Return the DataFrame without any row that contains a null."""
    # `df.na.drop()` is the DataFrameNaFunctions spelling of `df.dropna()`
    return df.na.drop()


def convert_column_names_to_lowercase(df: DataFrame) -> DataFrame:
    """Return the DataFrame with every column re-aliased to its lower-case name."""
    lowered = []
    for name in df.columns:
        lowered.append(col(name).alias(name.lower()))
    return df.select(lowered)


def add_derived_columns(df: DataFrame) -> DataFrame:
    """Append delay, completion-rate and Yes/No flag columns for analysis.

    Adds, in order: delivery_delay_days, delivery_completion_rate,
    is_on_time and is_complete_delivery. Expects lower-case column names.
    """
    # Days between the actual and the agreed delivery date
    delay_days = datediff(col("actual_delivery_date"), col("agreed_delivery_date"))
    # Delivered quantity as a percentage of the ordered quantity, 2 decimals
    completion_pct = round(col("delivery_qty") / col("order_qty") * 100, 2)
    # The flags read the two columns added just before them
    on_time_flag = when(col("delivery_delay_days") <= 0, "Yes").otherwise("No")
    complete_flag = when(col("delivery_completion_rate") >= 100, "Yes").otherwise("No")

    enriched = df.withColumn("delivery_delay_days", delay_days)
    enriched = enriched.withColumn("delivery_completion_rate", completion_pct)
    enriched = enriched.withColumn("is_on_time", on_time_flag)
    enriched = enriched.withColumn("is_complete_delivery", complete_flag)
    return enriched


def clean_order_lines_data(df: DataFrame) -> DataFrame:
    """Run the full order-lines cleaning pipeline and return the result.

    Steps, in order: id clean-up, quantity casts, quantity filter, agreed and
    actual date parsing, placeholder filter, null drop, lower-case column
    names, derived columns.
    """
    # Placeholder strings treated as missing data
    unwanted_values = ["NULL", "null", "NA", "none", "N/A"]

    # NOTE(review): the placeholder filter runs after ORDER_ID has been
    # upper-cased and stripped, so it sees the cleaned values rather than the
    # raw text (a raw "none" would arrive as "NONE") — confirm this is intended.
    column_cleaners = (
        clean_order_id_and_product_id,
        clean_order_qty_and_delivery_qty,
        filter_invalid_quantities,
        clean_agreed_delivery_date,
        clean_actual_delivery_date,
    )
    for step in column_cleaners:
        df = step(df)

    df = filter_unwanted_values(df, unwanted_values)

    finishing_steps = (
        drop_null_values,
        convert_column_names_to_lowercase,
        add_derived_columns,
    )
    for step in finishing_steps:
        df = step(df)

    return df


if __name__ == "__main__":
    # Create Spark Session with legacy time parser policy
    spark = SparkSession.builder \
        .appName("Orderlines DataProcessing") \
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
        .getOrCreate()

    try:
        # Define order lines schema (dates and DELIVERY_QTY stay raw strings
        # until the cleaning functions parse them)
        order_lines_schema = StructType(
            [
                StructField("ORDER_ID", StringType(), True),
                StructField("PRODUCT_ID", StringType(), True),
                StructField("ORDER_QTY", FloatType(), True),
                StructField("AGREED_DELIVERY_DATE", StringType(), True),
                StructField("ACTUAL_DELIVERY_DATE", StringType(), True),
                StructField("DELIVERY_QTY", StringType(), True),
            ]
        )

        # Read the CSV file
        df = spark.read \
            .format("csv") \
            .option("header", True) \
            .schema(order_lines_schema) \
            .load("../data/order_lines.csv")

        # Show sample of original data
        print("Original data sample:")
        df.show(5)

        # Clean the data
        cleaned_df = clean_order_lines_data(df)

        # Show sample of cleaned data
        print("Cleaned data sample:")
        cleaned_df.show(5)

        # Save cleaned data as CSV
        cleaned_df.write \
            .format("csv") \
            .option("header", "true") \
            .mode("overwrite") \
            .save("../data/cleaned_order_lines")
    finally:
        # Stop the session even when reading, cleaning or writing fails, so a
        # failed run does not leave a Spark session behind
        spark.stop()

25/03/18 20:27:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Original data sample:
+-----------+----------+---------+--------------------+--------------------+------------+
|   ORDER_ID|PRODUCT_ID|ORDER_QTY|AGREED_DELIVERY_DATE|ACTUAL_DELIVERY_DATE|DELIVERY_QTY|
+-----------+----------+---------+--------------------+--------------------+------------+
|FMR34203601|  25891601|    110.0|Friday, March 4, ...|Friday, March 4, ...|         110|
|FMR32320302|  25891203|    347.0|Wednesday, March ...|Wednesday, March ...|         347|
|FMR33320501|  25891203|    187.0|Thursday, March 3...|Thursday, March 3...|         150|
|FMR34220601|  25891203|    235.0|Friday, March 4, ...|Friday, March 4, ...|         235|
|FMR33703603|  25891203|    176.0|Thursday, March 3...|Thursday, March 3...|         176|
+-----------+----------+---------+--------------------+--------------------+------------+
only showing top 5 rows

Cleaned data sample:
+-----------+----------

                                                                                