In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
from pyspark.sql.functions import col, regexp_replace, trim, when, regexp_extract
from pyspark.sql.types import *
from pyspark.sql.functions import col, isnan, when, count ,date_format,to_date,to_timestamp

In [2]:
# Create Spark Session
spark = SparkSession.builder \
    .appName("DataProcessing") \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/14 18:10:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/14 18:10:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark 

In [11]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

date_schema = StructType([
    StructField("date", DateType(), True),       # For actual date values (e.g., "2023-01-01")
    StructField("mmm_yy", StringType(), True),  # For month and year as a string (e.g., "Jan-23")
    StructField("week_no", StringType(), True) # For week number as an integer (e.g., 1, 2, 3, ...)
])


In [12]:
# Reading Customer CSV
date_df = spark.read \
    .format("csv") \
    .option("header", True) \
    .schema(date_schema) \
    .load("../data/dates.csv")

In [13]:
# Check schemas
print("Dates DataFrame Schema:")
date_df.printSchema()

Dates DataFrame Schema:
root
 |-- date: date (nullable = true)
 |-- mmm_yy: string (nullable = true)
 |-- week_no: string (nullable = true)



In [14]:
num_rows = date_df.count()
print(f"Number of rows: {num_rows}")


Number of rows: 366


### Check for missing values

In [15]:
# Counting missing values for each column
missing_values = date_df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in date_df.columns]
)
missing_values.show()

+----+------+-------+
|date|mmm_yy|week_no|
+----+------+-------+
|  42|    41|      0|
+----+------+-------+



In [16]:
date_df.show()

+----------+------------+-------+
|      date|      mmm_yy|week_no|
+----------+------------+-------+
|2024-01-01|      Jan-24|     W1|
|2024-01-02|      Jan-24|     w1|
|2024-01-03|      Jan-24|     W1|
|2024-01-04|      Jan-24|     W1|
|2024-01-05|      Jan-24|     W1|
|2024-01-06|      Jan-24|     W1|
|2024-01-07|        null|     W1|
|2024-01-08|      JAN-24|     W2|
|2024-01-09|      Jan-24|     w2|
|2024-01-10|      Jan-24|     W2|
|2024-01-11|      Jan-24|     W2|
|2024-01-12|      JAN-24|     W2|
|2024-01-13|      Jan-24|     W2|
|2024-01-14|      Jan-24|     W2|
|2024-01-15|      JAN-24|     W3|
|2024-01-16|      Jan-24|     w3|
|2024-01-17|      Jan-24|     w3|
|2024-01-18|invalid_date|     W3|
|2024-01-19|      Jan-24|     W3|
|2024-01-20|      Jan-24|     W3|
+----------+------------+-------+
only showing top 20 rows



In [19]:
# Show initial count
print("Original row count:", date_df.count())

# Drop all rows with any null/missing values
date_df = date_df.na.drop(how='any')

# Show results and final count
print("\nCleaned data (rows with no missing values):")
date_df.show(truncate=False)

print("\nSummary of changes:")
print("Original row count:", date_df.count())
print("Final row count after dropping missing values:", cleaned_date_df.count())

# Optional: Show the schema to verify columns
print("\nSchema of cleaned dataframe:")
date_df.printSchema()

Original row count: 366

Cleaned data (rows with no missing values):
+----------+------------+-------+
|date      |mmm_yy      |week_no|
+----------+------------+-------+
|2024-01-01|Jan-24      |W1     |
|2024-01-02|Jan-24      |w1     |
|2024-01-03|Jan-24      |W1     |
|2024-01-04|Jan-24      |W1     |
|2024-01-05|Jan-24      |W1     |
|2024-01-06|Jan-24      |W1     |
|2024-01-08|JAN-24      |W2     |
|2024-01-09|Jan-24      |w2     |
|2024-01-10|Jan-24      |W2     |
|2024-01-11|Jan-24      |W2     |
|2024-01-12|JAN-24      |W2     |
|2024-01-13|Jan-24      |W2     |
|2024-01-14|Jan-24      |W2     |
|2024-01-15|JAN-24      |W3     |
|2024-01-16|Jan-24      |w3     |
|2024-01-17|Jan-24      |w3     |
|2024-01-18|invalid_date|W3     |
|2024-01-19|Jan-24      |W3     |
|2024-01-20|Jan-24      |W3     |
|2024-01-21|JAN-24      |w3     |
+----------+------------+-------+
only showing top 20 rows


Summary of changes:
Original row count: 286
Final row count after dropping missing value

In [21]:
# Counting missing values for each column
missing_values = date_df.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in date_df.columns]
)
missing_values.show()

+----+------+-------+
|date|mmm_yy|week_no|
+----+------+-------+
|   0|     0|      0|
+----+------+-------+



In [22]:
from pyspark.sql.functions import col, regexp_extract



# Show initial count
print("Original row count:", date_df.count())

# 1. Drop null values and invalid_date entries
date_df = date_df.na.drop(how='any') \
    .filter(~col("mmm_yy").contains("invalid_date"))

# 2. Extract number from week_no (remove 'w' prefix)
date_df = cleaned_date_df.withColumn(
    "week_no",
    regexp_extract(col("week_no"), r"(\d+)", 1)  # Extract only the number
)

# Show results
print("\nCleaned data:")
date_df.show(truncate=False)

# Print summary statistics
print("\nSummary of changes:")
print("Original row count:", date_df.count())
print("Final row count after cleaning:", date_df.count())

# Show schema of cleaned dataframe
print("\nSchema of cleaned dataframe:")
date_df.printSchema()

Original row count: 286

Cleaned data:
+----------+------------+-------+
|date      |mmm_yy      |week_no|
+----------+------------+-------+
|2024-01-01|Jan-24      |1      |
|2024-01-02|Jan-24      |1      |
|2024-01-03|Jan-24      |1      |
|2024-01-04|Jan-24      |1      |
|2024-01-05|Jan-24      |1      |
|2024-01-06|Jan-24      |1      |
|2024-01-08|JAN-24      |2      |
|2024-01-09|Jan-24      |2      |
|2024-01-10|Jan-24      |2      |
|2024-01-11|Jan-24      |2      |
|2024-01-12|JAN-24      |2      |
|2024-01-13|Jan-24      |2      |
|2024-01-14|Jan-24      |2      |
|2024-01-15|JAN-24      |3      |
|2024-01-16|Jan-24      |3      |
|2024-01-17|Jan-24      |3      |
|2024-01-18|invalid_date|3      |
|2024-01-19|Jan-24      |3      |
|2024-01-20|Jan-24      |3      |
|2024-01-21|JAN-24      |3      |
+----------+------------+-------+
only showing top 20 rows


Summary of changes:
Original row count: 286
Final row count after cleaning: 286

Schema of cleaned dataframe:
root
 |-

In [25]:
from pyspark.sql.functions import col, regexp_extract, lower

# Metadata
# Date: 2025-01-14 18:32:45 UTC
# User: alexio545

# Show initial count
print("Original row count:", date_df.count())

# 1. Drop null values and invalid_date entries (case-insensitive)
cleaned_date_df = date_df.na.drop(how='any') \
    .filter(
        ~lower(col("mmm_yy")).contains("invalid_date") &  # Case-insensitive check
        col("mmm_yy").isNotNull()  # Additional null check
    )

# 2. Extract number from week_no (remove 'w' prefix)
date_df = cleaned_date_df.withColumn(
    "week_no",
    regexp_extract(col("week_no"), r"(\d+)", 1)  # Extract only the number
)

# Show results with more rows to verify
print("\nCleaned data:")
cleaned_date_df.show(20, truncate=False)  # Showing more rows to verify

# Print summary statistics
print("\nSummary of changes:")
print("Original row count:", date_df.count())
print("Final row count after cleaning:", date_df.count())

# Verify no invalid_date remains
print("\nChecking for any remaining 'invalid_date' entries:")
date_df.filter(lower(col("mmm_yy")).contains("invalid_date")).show()

# Show schema of cleaned dataframe
print("\nSchema of cleaned dataframe:")
date_df.printSchema()

Original row count: 254

Cleaned data:
+----------+------+-------+
|date      |mmm_yy|week_no|
+----------+------+-------+
|2024-01-01|Jan-24|1      |
|2024-01-02|Jan-24|1      |
|2024-01-03|Jan-24|1      |
|2024-01-04|Jan-24|1      |
|2024-01-05|Jan-24|1      |
|2024-01-06|Jan-24|1      |
|2024-01-08|JAN-24|2      |
|2024-01-09|Jan-24|2      |
|2024-01-10|Jan-24|2      |
|2024-01-11|Jan-24|2      |
|2024-01-12|JAN-24|2      |
|2024-01-13|Jan-24|2      |
|2024-01-14|Jan-24|2      |
|2024-01-15|JAN-24|3      |
|2024-01-16|Jan-24|3      |
|2024-01-17|Jan-24|3      |
|2024-01-19|Jan-24|3      |
|2024-01-20|Jan-24|3      |
|2024-01-21|JAN-24|3      |
|2024-01-23|JAN-24|4      |
+----------+------+-------+
only showing top 20 rows


Summary of changes:
Original row count: 254
Final row count after cleaning: 254

Checking for any remaining 'invalid_date' entries:
+----+------+-------+
|date|mmm_yy|week_no|
+----+------+-------+
+----+------+-------+


Schema of cleaned dataframe:
root
 |-- d