# Data Cleaning Techniques in PySpark

In [0]:
from pyspark.sql.functions import col, when, isnan, count, lit, regexp_replace, to_timestamp
from pyspark.sql.types import IntegerType, DoubleType, TimestampType, StringType

## Identifying Data Issues

In [0]:
# Load our messy dataset to demonstrate cleaning techniques
file_path = "/pyspark/video-streaming-data/module3-transform/cleaning/messy_events.csv"
messy_df = spark.read.option("header", "true").option("inferSchema", "true").csv(file_path)

# Look at the schema
print("Schema of our messy data:")
messy_df.printSchema()

Schema of our messy data:
root
 |-- event_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- duration_seconds: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- quality: string (nullable = true)
 |-- buffering_count: integer (nullable = true)
 |-- error_type: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- country: string (nullable = true)
 |-- session_id: string (nullable = true)



In [0]:
messy_df.display()

event_id,user_id,content_id,timestamp,duration_seconds,device_type,quality,buffering_count,error_type,ip_address,country,session_id
EVT10000,USR41813,CON10763,2023-09-03T09:18:59Z,565,Web,HD,4,,72.119.240.124,ES,SES10000
EVT10001,USR46484,CON12784,2023-09-09T11:44:27Z,2018,Web,HD,1,,156.3.251.123,FR,SES10001
EVT10002,,CON16367,2023-09-09T16:51:53Z,2900,TV,4K,3,,182.53.26.241,AU,SES10002
EVT10003,USR46584,CON18916,2023-09-13T08:03:13Z,3242,Tablet,4K,3,,9.203.70.180,FR,SES10003
EVT10004,,CON18924,2023-09-04T13:07:20Z,4248,TV,4K,1,,152.202.251.124,NL,SES10004
EVT10005,USR26888,CON13567,2023-09-13T22:44:28Z,885,TV,HD,2,,12.248.203.140,DE,SES10005
EVT10006,USR58686,CON13571,2023-09-13T21:54:05Z,594,Web,HD,1,,118.180.35.136,FR,SES10006
EVT10007,USR59173,CON19019,2023-09-14T12:36:36Z,21955,TV,4K,1,,53.80.37.14,CA,SES10007
EVT10008,USR33266,CON15649,2023-09-04T03:11:27Z,9039,TV,HD,2,,123.155.41.53,UK,SES10008
EVT10009,USR56453,CON19816,2023-09-03T02:02:33Z,2262,Mobile,SD,2,,67.182.148.182,IT,SES10009


In [0]:
# How many records do we have?
print("Total number of records:", messy_df.count())

Total number of records: 52500


In [0]:
# Check for nulls
null_counts = messy_df.select([count(when(col(c).isNull(), c)).alias(c) for c in messy_df.columns])
display(null_counts)

event_id,user_id,content_id,timestamp,duration_seconds,device_type,quality,buffering_count,error_type,ip_address,country,session_id
2608,2637,2629,2607,0,0,0,0,47200,0,0,0


In [0]:
# Check for duplicates
duplicate_count = messy_df.count() - messy_df.dropDuplicates().count()
print(f"Number of duplicate records: {duplicate_count}")


Number of duplicate records: 2500


In [0]:
# Check for type issues
print("Sample of duration_seconds values:")
messy_df.select("duration_seconds").distinct().limit(5).display()

Sample of duration_seconds values:


duration_seconds
11332
2904
1090
691
467


## Cleaning Process

In [0]:
# 1. Remove duplicate rows
clean_df = messy_df.dropDuplicates()
print(f"Records after removing duplicates: {clean_df.count()}")

Records after removing duplicates: 50000


In [0]:
# First, the error_type column - null here means "no error"
clean_df = clean_df.na.fill({"error_type": "none"})

# For key identifier columns - events without IDs aren't useful, so drop these rows
critical_columns = ["user_id", "content_id", "event_id","timestamp"]
clean_df = clean_df.dropna(subset=critical_columns)
print(f"Records after dropping rows with missing IDs: {clean_df.count()}")

Records after dropping rows with missing IDs: 40748


In [0]:
# 3. Fix data type issues for duration_seconds
clean_df = clean_df.withColumn(
    "duration_seconds",
    when(col("duration_seconds").cast("integer").isNull(),
         regexp_replace(col("duration_seconds"), "[^0-9]", "").cast("integer")
    ).otherwise(col("duration_seconds").cast("integer"))
)
clean_df.printSchema()

root
 |-- event_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- duration_seconds: integer (nullable = true)
 |-- device_type: string (nullable = true)
 |-- quality: string (nullable = true)
 |-- buffering_count: integer (nullable = true)
 |-- error_type: string (nullable = false)
 |-- ip_address: string (nullable = true)
 |-- country: string (nullable = true)
 |-- session_id: string (nullable = true)



In [0]:
# 5. Verify our cleaned data
print("Null count in critical columns:")
clean_df.select([count(when(col(c).isNull(), c)).alias(c) 
                for c in ["user_id", "content_id", "timestamp"]]).display()

Null count in critical columns:


user_id,content_id,timestamp
0,0,0


In [0]:
clean_df.display()

event_id,user_id,content_id,timestamp,duration_seconds,device_type,quality,buffering_count,error_type,ip_address,country,session_id
EVT10055,USR58667,CON13530,2023-09-11T09:15:08Z,3337,Tablet,HD,2,none,132.115.188.195,IT,SES10055
EVT10208,USR53851,CON17461,2023-09-06T23:16:39Z,189,Web,HD,2,content_unavailable,32.41.65.15,ES,SES10208
EVT10366,USR56472,CON11159,2023-09-01T00:24:29Z,1687,Tablet,HD,0,none,58.91.186.209,DE,SES10366
EVT10700,USR21119,CON12457,2023-09-08T12:33:15Z,279,TV,4K,4,content_unavailable,199.99.169.130,IT,SES10700
EVT10903,USR53432,CON17698,2023-09-01T06:47:13Z,1202,Web,HD,4,none,29.146.120.117,IT,SES10903
EVT12012,USR26775,CON12162,2023-09-14T06:23:04Z,1205,TV,HD,3,none,201.66.91.167,NL,SES12012
EVT12369,USR11329,CON17702,2023-09-02T17:49:54Z,2072,Web,HD,4,none,212.215.103.104,UK,SES12369
EVT12449,USR46104,CON14114,2023-09-09T16:50:35Z,584,Tablet,SD,4,none,109.215.149.205,AU,SES12449
EVT12890,USR10962,CON13892,2023-09-10T13:15:17Z,4533,Web,HD,1,none,12.44.172.92,CA,SES12890
EVT12926,USR44825,CON11499,2023-09-08T08:06:50Z,2086,Mobile,SD,0,none,35.68.25.19,FR,SES12926


In [0]:
# 6. Save the cleaned data
output_path = "pyspark/video-streaming-data/module3-transform/cleaning/our_cleaned_output"
clean_df.write.mode("overwrite").parquet(output_path)

print(f"Cleaned data saved to {output_path}")

Cleaned data saved to pyspark/video-streaming-data/module3-transform/cleaning/our_cleaned_output


In [0]:
# Key Takeaways
print("Data Cleaning Techniques Demonstrated:")
print("1. Removing duplicate rows with dropDuplicates()")
print("2. Handling missing values with fillna() and dropna()")
print("3. Type casting with cast() and when() functions")
print("4. String manipulation with regexp_replace()")
print("5. Filtering invalid data with filter()")

In [0]:
dbutils.fs.rm("dbfs:/pyspark/video-streaming-data/module3-transform/cleaning/our_cleaned_output", recurse=True)