In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count ,date_format,to_date,to_timestamp

In [2]:
# Start (or reuse) the Spark session that the rest of the notebook reads data through
spark = (
    SparkSession.builder
    .appName("DataProcessing")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/14 18:09:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Bare expression: the notebook shows the SparkSession object as this cell's output
spark 

In [4]:
# Schema for the products CSV: three nullable string fields.
# NOTE(review): the CSVHeaderChecker warning printed further down this notebook
# reports the file header as (product_name, product_id, category), so the first
# two field names below end up attached to each other's data; a later cell in
# this notebook re-aliases them. Confirm before reordering anything here.
products_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("product_id", "product_name", "category")
])

In [5]:
# Load the products CSV with the explicit schema; the file's first line is a header row
products_df = spark.read.csv(
    "../data/products.csv",
    schema=products_schema,
    header=True,
)

In [6]:
# Check schemas: print the column names, types and nullability of the loaded products frame
print("Products DataFrame Schema:")
products_df.printSchema()

Products DataFrame Schema:
root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)



In [7]:
# Count the rows of the loaded products frame and report the total
num_rows = products_df.count()
print(f"Number of rows: {num_rows}")


Number of rows: 100


### Check for missing values

In [8]:
# Build one aggregate per column that counts the rows where that column is null,
# then evaluate them all in a single select
null_counters = []
for column_name in products_df.columns:
    is_missing = col(column_name).isNull()
    null_counters.append(count(when(is_missing, column_name)).alias(column_name))

missing_values = products_df.select(null_counters)
missing_values.show()

25/01/14 18:09:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, product_id, category
 Schema: product_id, product_name, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+----------+------------+--------+
|product_id|product_name|category|
+----------+------------+--------+
|         0|           0|      16|
+----------+------------+--------+



In [9]:
from pyspark.sql import functions as F

def clean_product_names(products_df, name_fields=("product_name", "category"),
                        fill_value="Unknown", cache=True):
    """
    Replace missing values in the products' text columns with a placeholder.

    A column is selected when its lower-cased name contains any entry of
    ``name_fields`` as a substring. All selected columns have their nulls
    replaced by ``fill_value`` in a single ``na.fill()`` call.

    Args:
        products_df: PySpark DataFrame containing product data.
        name_fields: Lower-case name fragments identifying the columns to
            fill. A single string is treated as one fragment.
        fill_value: Value written into null cells of the matched columns.
        cache: When True (the default, matching the earlier behaviour),
            ``cache()`` is called on the result before it is returned.

    Returns:
        PySpark DataFrame with nulls in the matched columns replaced by
        ``fill_value``.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(name_fields, str):
        name_fields = (name_fields,)

    # Named `column_name` (not `col`) so it cannot be confused with pyspark's col()
    fill_dict = {
        column_name: fill_value
        for column_name in products_df.columns
        if any(fragment in column_name.lower() for fragment in name_fields)
    }

    cleaned_df = products_df.na.fill(fill_dict)

    # Caching is optional because the caller may only use the result once
    return cleaned_df.cache() if cache else cleaned_df


# Example usage: fill missing text values on the products frame as it was loaded.
# NOTE(review): this runs before the later cell that re-aliases the
# product_id / product_name columns — confirm that ordering is intended.
cleaned_products_df = clean_product_names(products_df)


In [10]:
# Repeat the per-column null count on the frame returned by clean_product_names
remaining_null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in cleaned_products_df.columns
]
missing_values = cleaned_products_df.select(remaining_null_counts)
missing_values.show()

25/01/14 18:09:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, product_id, category
 Schema: product_id, product_name, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+----------+------------+--------+
|product_id|product_name|category|
+----------+------------+--------+
|         0|           0|       0|
+----------+------------+--------+



In [11]:
# Preview the first five rows of the null-filled products frame
cleaned_products_df.show(5)

+--------------------+------------+---------------+
|          product_id|product_name|       category|
+--------------------+------------+---------------+
|    COCA COLA 500ML_|     1001001|      Beverages|
|           Flour 2kg|     1005001|      Groceries|
|     Coca Cola 500ml|     1001001|InvalidCategory|
|         FLOUR 2KG70|     1005001|        Unknown|
|INSTANT COFFEE 200G_|     1008002|      Beverages|
+--------------------+------------+---------------+
only showing top 5 rows



In [12]:
from pyspark.sql.functions import col



# The frame carries product_id / product_name attached to each other's data,
# so exchange the two labels; category passes through untouched.
# NOTE(review): products_df is rebound from itself here, so running this cell a
# second time swaps the labels back again.
products_df = products_df.selectExpr(
    "product_name AS product_id",
    "product_id AS product_name",
    "category",
)

# Display every row without truncating long values
print("Corrected column data:")
products_df.show(truncate=False)

Corrected column data:
25/01/14 18:09:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, product_id, category
 Schema: product_id, product_name, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+----------+--------------------+---------------+
|product_id|product_name        |category       |
+----------+--------------------+---------------+
|1001001   |COCA COLA 500ML_    |Beverages      |
|1005001   |Flour 2kg           |Groceries      |
|1001001   |Coca Cola 500ml     |InvalidCategory|
|1005001   |FLOUR 2KG70         |null           |
|1008002   |INSTANT COFFEE 200G_|Beverages      |
|-99999    |instant coffee 100g_|Beverages      |
|1007001   |YOGURT PLAIN 500ML93|Dairy          |
|1001003   |Sprite 500ml        |InvalidCategory|
|1005002   |Flour 1kg           |Groceries      |
|1005001   |FLOUR 2KG           |Groceries      |
|-99999    |rice 5kg            |null           |
|ID        |te

In [13]:
# Print the leading rows of products_df using show()'s default row limit and truncation
products_df.show()

25/01/14 18:09:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, product_id, category
 Schema: product_id, product_name, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+----------+--------------------+---------------+
|product_id|        product_name|       category|
+----------+--------------------+---------------+
|   1001001|    COCA COLA 500ML_|      Beverages|
|   1005001|           Flour 2kg|      Groceries|
|   1001001|     Coca Cola 500ml|InvalidCategory|
|   1005001|         FLOUR 2KG70|           null|
|   1008002|INSTANT COFFEE 200G_|      Beverages|
|    -99999|instant coffee 100g_|      Beverages|
|   1007001|YOGURT PLAIN 500ML93|          Dairy|
|   1001003|        Sprite 500ml|InvalidCategory|
|   1005002|           Flour 1kg|      Groceries|
|   1005001|           FLOUR 2KG|      Groceries|
|    -99999|            rice 5kg|           null|
|        ID|     tea leaves 250g|    

In [14]:
from pyspark.sql.functions import col, initcap

# Metadata
# Date: 2025-01-14 15:41:07 UTC
# User: alexio545

# Title-case product_name in place; product_id and category keep their position and values
title_cased_name = initcap(col("product_name"))
products_df = products_df.withColumn("product_name", title_cased_name)

# Display the result
print("Product names in Title Case:")
products_df.show(truncate=False)

Product names in Title Case:
25/01/14 18:09:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, product_id, category
 Schema: product_id, product_name, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+----------+--------------------+---------------+
|product_id|product_name        |category       |
+----------+--------------------+---------------+
|1001001   |Coca Cola 500ml_    |Beverages      |
|1005001   |Flour 2kg           |Groceries      |
|1001001   |Coca Cola 500ml     |InvalidCategory|
|1005001   |Flour 2kg70         |null           |
|1008002   |Instant Coffee 200g_|Beverages      |
|-99999    |Instant Coffee 100g_|Beverages      |
|1007001   |Yogurt Plain 500ml93|Dairy          |
|1001003   |Sprite 500ml        |InvalidCategory|
|1005002   |Flour 1kg           |Groceries      |
|1005001   |Flour 2kg           |Groceries      |
|-99999    |Rice 5kg            |null           |
|ID     

In [15]:
from pyspark.sql.functions import col, initcap, regexp_replace, when, lit

# Metadata
# Date: 2025-01-14 15:51:05 UTC
# User: alexio545

# Clean and standardize the dataset.
#
# product_name is cleaned in separate, ordered passes so that one pass cannot
# hide work from the next:
#   1. drop underscores and hyphens anywhere in the name
#   2. drop a trailing run of digits (also when whitespace follows it)
#   3. collapse whitespace runs to a single space, then trim both ends
#   4. convert to Title Case
# The earlier single pattern r'[_\-]|\d+$' only removed digits that were
# already the very last characters of the raw value, so a name ending in
# "93_" or "93 " kept its "93", and leading whitespace was never trimmed.
# `trim` comes from the notebook's first import cell.
name_without_specials = regexp_replace(col("product_name"), r'[_\-]', '')
name_without_suffix = regexp_replace(name_without_specials, r'\d+\s*$', '')
name_single_spaced = trim(regexp_replace(name_without_suffix, r'\s+', ' '))

# category: null and the 'InvalidCategory' marker are both mapped to 'Unknown'
category_is_unusable = col("category").isNull() | (col("category") == "InvalidCategory")

cleaned_products_df = products_df.select(
    # Keep product_id as is
    col("product_id"),
    initcap(name_single_spaced).alias("product_name"),
    when(category_is_unusable, lit("Unknown"))
    .otherwise(col("category"))
    .alias("category"),
)

# Remove rows that are identical in all three columns after cleaning
final_products_df = cleaned_products_df.dropDuplicates(["product_id", "product_name", "category"])

# Show results
print("Cleaned and standardized products data:")
final_products_df.show(truncate=False)

# Optional: Count the changes
print("\nSummary of changes:")
print("Original row count:", products_df.count())
print("Final row count:", final_products_df.count())

Cleaned and standardized products data:
25/01/14 18:09:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, product_id, category
 Schema: product_id, product_name, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+----------+-------------------+---------+
|product_id|product_name       |category |
+----------+-------------------+---------+
|-99999    |Instant Coffee 100g|Beverages|
|123.45    |Flour 2kg          |Groceries|
|1001001   |Coca Cola 500ml    |Unknown  |
|PROD2351  |Flour 1kg          |Groceries|
|1008001   |Instant Coffee 100g|Beverages|
|1007001   |Yogurt Plain 500ml |Dairy    |
|1007002   |Yogurt Plain 250ml |Unknown  |
|1006002   |Cheddar Cheese 500g|Unknown  |
|1007001   |Yogurt Plain 500ml |Unknown  |
|1006002   |Cheddar Cheese 500g|Dairy    |
|1001003   |Sprite 500ml       |Beverages|
|1005001   |Flour 2kg          |Groceries|
|ID        |Yogurt Plain 250ml |Dairy    |
|1004002 

In [16]:
## Hashing


In [17]:
from pyspark.sql.functions import (
    col, initcap, regexp_replace, when, lit, 
    md5, concat, coalesce
)

# Metadata
# Date: 2025-01-14 15:55:34 UTC
# User: alexio545

# Clean the products and derive product_id as a hash of the cleaned fields.
#
# product_name is cleaned in separate, ordered passes so that one pass cannot
# hide work from the next:
#   1. drop underscores and hyphens anywhere in the name
#   2. drop a trailing run of digits (also when whitespace follows it)
#   3. collapse whitespace runs to a single space, then trim both ends
#   4. convert to Title Case
# The earlier single pattern r'[_\-]|\d+$' only removed digits that were
# already the very last characters of the raw value, and leading whitespace
# was never trimmed. `trim` comes from the notebook's first import cell.
name_without_specials = regexp_replace(col("product_name"), r'[_\-]', '')
name_without_suffix = regexp_replace(name_without_specials, r'\d+\s*$', '')
name_single_spaced = trim(regexp_replace(name_without_suffix, r'\s+', ' '))

# category: null and the 'InvalidCategory' marker are both mapped to 'Unknown'
category_is_unusable = col("category").isNull() | (col("category") == "InvalidCategory")

cleaned_products_df = products_df.select(
    initcap(name_single_spaced).alias("product_name"),
    when(category_is_unusable, lit("Unknown"))
    .otherwise(col("category"))
    .alias("category"),
)

# Deduplicate before hashing: the id depends only on (product_name, category),
# so this yields the same rows while hashing each distinct pair once
unique_products_df = cleaned_products_df.dropDuplicates(["product_name", "category"])

# product_id = md5("<product_name>_<category>"); a null name contributes an empty string
hashed_id = md5(concat(
    coalesce(col("product_name"), lit("")),
    lit("_"),  # Delimiter between the two hashed fields
    coalesce(col("category"), lit(""))
))

# Cache the frame that is actually reused below (show, count, printSchema);
# previously cache() was called on the pre-deduplication frame, whose handle
# was then discarded by the reassignment.
final_products_df = (
    unique_products_df
    .withColumn("product_id", hashed_id)
    .select("product_id", "product_name", "category")
    .cache()
)

# Show results
print("Products with hashed IDs:")
final_products_df.show(truncate=False)

# Print summary statistics
print("\nSummary:")
print("Original row count:", products_df.count())
print("Final row count:", final_products_df.count())
print("\nSchema of final dataframe:")
final_products_df.printSchema()

Products with hashed IDs:
25/01/14 18:09:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: product_name, category
 Schema: product_id, category
Expected: product_id but found: product_name
CSV file: file:///workspace/data/products.csv
+--------------------------------+-------------------+---------+
|product_id                      |product_name       |category |
+--------------------------------+-------------------+---------+
|59e9f4bf700031a537a7495085e108c4|Cheddar Cheese 250g|Dairy    |
|bd984327e9f06bf46e749d2e1300bb08|Cheddar Cheese 250g|Unknown  |
|fde194048278a967cd1e31df11a23052|Cheddar Cheese 500g|Dairy    |
|42687c68689652bb63145761b0120672|Cheddar Cheese 500g|Unknown  |
|374765b4eb2766da24879be557999ad3|Coca Cola 500ml    |Beverages|
|f90fd885f02cb1c92002c744b91dade5|Coca Cola 500ml    |Unknown  |
|3c0594ca73dba30ca02ffb2bfadd45c1|Cooking Oil 1l     |Groceries|
|15dd4eed8967ffc36ec15da1bb5ec094|Cooking Oil 1l     |Unknown  |
|c4ce0dac2a59479a54fd2