# 02 – Data Cleaning (PySpark)

This notebook performs data cleaning with Spark. We remove duplicate records, fill missing values with placeholder values, and convert Unix-epoch timestamps to human-readable timestamp values. The cleaned DataFrames are saved in Parquet format to the `data/processed` directory.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime
import os

# Start (or reuse) the Spark session for this notebook and quieten its logging
spark = SparkSession.builder.appName('CTR_Data_Cleaning').getOrCreate()
spark.sparkContext.setLogLevel('WARN')

# Input and output locations, relative to the current working directory
raw_dir = os.path.join('..', 'data', 'raw')
processed_dir = os.path.join('..', 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

# For each raw CSV: read it (header row, inferred schema), keep at most
# 1M rows, then drop exact duplicate rows
user_df, ad_df, click_df, behaviour_df = [
    spark.read.csv(os.path.join(raw_dir, csv_name), header=True, inferSchema=True)
    .limit(1_000_000)
    .dropDuplicates()
    for csv_name in (
        'user_profile.csv',
        'ad_feature.csv',
        'raw_sample.csv',
        'behavior_log.csv',
    )
]

# Fill missing values with placeholder sentinels (simple heuristic):
# numeric nulls become -1 and categorical (string) nulls become 'unknown'.
# Median/mode imputation would be a more faithful alternative; it is not implemented here.
from pyspark.sql.types import NumericType

def fill_missing_values(df):
    """Replace nulls with placeholder sentinels chosen by column type.

    Numeric columns get ``-1`` and string columns get ``'unknown'``.
    Columns of any other type (for example boolean or timestamp) are left
    unchanged, because a string placeholder is not a valid value for them.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A DataFrame with the same columns and nulls replaced as described
        above (the input itself when no column qualifies).
    """
    # Imported locally so the function does not depend on which names the
    # surrounding notebook cell has already imported.
    from pyspark.sql.types import NumericType, StringType

    replacements = {}
    for field in df.schema.fields:
        if isinstance(field.dataType, NumericType):
            replacements[field.name] = -1
        elif isinstance(field.dataType, StringType):
            replacements[field.name] = 'unknown'

    if not replacements:
        return df

    # A single fill with a per-column mapping replaces the previous chain of
    # one fill (and one projection) per column.
    return df.na.fill(replacements)

# Replace nulls with sentinel values (-1 for numeric columns, 'unknown' for string columns)
user_df = fill_missing_values(user_df)
ad_df = fill_missing_values(ad_df)
click_df = fill_missing_values(click_df)
behaviour_df = fill_missing_values(behaviour_df)

# Convert timestamp columns (assumed to be Unix epoch in seconds) to timestamps
from pyspark.sql.functions import to_timestamp, when


def _epoch_to_timestamp(df, column='time_stamp'):
    """Convert an epoch-seconds column of ``df`` into a timestamp column.

    The -1 placeholder written by ``fill_missing_values`` marks a missing
    value, so it is mapped to null instead of being converted into the
    instant one second before the epoch.

    Assumes ``column`` holds numeric epoch seconds, as stated above.
    """
    epoch = col(column)
    # when() without otherwise() yields null for rows failing the condition
    return df.withColumn(column, when(epoch != -1, to_timestamp(epoch)))


click_df = _epoch_to_timestamp(click_df)
behaviour_df = _epoch_to_timestamp(behaviour_df)

# Save cleaned DataFrames to processed directory in parquet format
for cleaned_df, parquet_name in (
    (user_df, 'user_profile_clean.parquet'),
    (ad_df, 'ad_feature_clean.parquet'),
    (click_df, 'raw_sample_clean.parquet'),
    (behaviour_df, 'behavior_log_clean.parquet'),
):
    cleaned_df.write.mode('overwrite').parquet(os.path.join(processed_dir, parquet_name))

print('Cleaning complete – cleaned files written to data/processed.')
