# 02 – Data Cleaning (PySpark)

This notebook performs data cleaning and basic preprocessing on the CTR dataset using PySpark.  
It is designed to be **robust to missing files** and to work with up to **1M rows per source file**.

In [1]:
from pyspark.sql import SparkSession

# Build the session step by step; getOrCreate() hands back an already
# running session when there is one instead of starting a second.
session_builder = SparkSession.builder.appName("CTR_Data_Cleaning")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "200")
spark = session_builder.getOrCreate()

# Keep Spark's own logging down to warnings and errors.
spark.sparkContext.setLogLevel("WARN")
print("Spark version:", spark.version)

Spark version: 4.0.1


In [5]:
import os

# The notebook is expected to run from `notebooks/`, so the project root is
# the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_root = os.path.join(project_root, "data")
raw_dir = os.path.join(data_root, "raw")
processed_dir = os.path.join(data_root, "processed")
# Output folder is created up front; harmless when it already exists.
os.makedirs(processed_dir, exist_ok=True)

for caption, location in (
    ("Project root:", project_root),
    ("Raw data dir:", raw_dir),
    ("Processed data dir:", processed_dir),
):
    print(caption, location)

# Dataset key -> expected file name inside `data/raw`
files = {
    "user_profile": "user_profile.csv",
    "ad_feature": "ad_feature.csv",
    "raw_sample": "raw_sample.csv",
    "behavior_log": "behavior_log.csv",  # change here if your file name is different
}

# Subset of `files` that is actually present on disk
available_files = {}
for dataset_key, file_name in files.items():
    candidate = os.path.join(raw_dir, file_name)
    found = os.path.exists(candidate)
    print(f"{dataset_key:15s} -> {candidate}  | exists: {found}")
    if found:
        available_files[dataset_key] = file_name

if len(available_files) == 0:
    print("\nWARNING: No data files found in `data/raw`. Please add at least one of:")
    for expected_name in files.values():
        print(f"  - {expected_name}")


Project root: d:\projects\Ai\project_fusion_ecu
Raw data dir: d:\projects\Ai\project_fusion_ecu\data\raw
Processed data dir: d:\projects\Ai\project_fusion_ecu\data\processed
user_profile    -> d:\projects\Ai\project_fusion_ecu\data\raw\user_profile.csv  | exists: False
ad_feature      -> d:\projects\Ai\project_fusion_ecu\data\raw\ad_feature.csv  | exists: False
raw_sample      -> d:\projects\Ai\project_fusion_ecu\data\raw\raw_sample.csv  | exists: False
behavior_log    -> d:\projects\Ai\project_fusion_ecu\data\raw\behavior_log.csv  | exists: False

  - user_profile.csv
  - ad_feature.csv
  - raw_sample.csv
  - behavior_log.csv


In [6]:
from pyspark.sql import functions as F

# Load up to MAX_ROWS_PER_FILE rows from each dataset (only if available)
MAX_ROWS_PER_FILE = 1_000_000  # per-file row cap; lower it for quick experiments


def load_capped_csv(file_key, max_rows=MAX_ROWS_PER_FILE):
    """Read one raw CSV into a cached DataFrame holding at most ``max_rows`` rows.

    Parameters
    ----------
    file_key : str
        Key into ``available_files`` (e.g. ``"user_profile"``).
    max_rows : int
        Upper bound on the number of rows kept from the file.

    Returns
    -------
    pyspark.sql.DataFrame or None
        The capped, cached frame, or ``None`` when ``file_key`` is not in
        ``available_files`` (the file was not found in ``data/raw``).
    """
    if file_key not in available_files:
        return None
    csv_path = os.path.join(raw_dir, available_files[file_key])
    return (
        spark.read.csv(csv_path, header=True, inferSchema=True)
        .limit(max_rows)
        .cache()
    )


# (key in `available_files`, key in `dataframes`, label printed with the row count)
load_plan = [
    ("user_profile", "user", "User profile rows:"),
    ("ad_feature", "ad", "Ad feature rows:"),
    ("raw_sample", "click", "Click log rows:"),
    ("behavior_log", "behaviour", "Behavior log rows:"),
]

# Only datasets that were found end up here; later cells must check membership.
dataframes = {}
for file_key, frame_key, row_label in load_plan:
    loaded_df = load_capped_csv(file_key)
    if loaded_df is None:
        continue
    dataframes[frame_key] = loaded_df
    # count() is the first action on the frame, so it also populates the cache.
    print(row_label, loaded_df.count())

if not dataframes:
    print("No data files were loaded. Please add data files to `data/raw`.")


No data files were loaded. Please add data files to `data/raw`.


In [7]:
# Replace every loaded frame with a version that has exact duplicate rows removed.
# The comprehension is fully built before `update` runs, so the dict is never
# modified while it is being iterated.
dataframes.update(
    {frame_key: frame.dropDuplicates() for frame_key, frame in dataframes.items()}
)

print("After dropping duplicates:")
for frame_key in dataframes:
    row_total = dataframes[frame_key].count()
    print(f"{frame_key:15s} rows: {row_total}")


After dropping duplicates:


In [8]:
# Add a `time_stamp_ts` timestamp column to the log frames that carry `time_stamp`.
# NOTE(review): `time_stamp` is presumably Unix epoch seconds — confirm against the data.
for log_key in ("click", "behaviour"):
    if log_key not in dataframes:
        continue
    log_frame = dataframes[log_key]
    if "time_stamp" not in log_frame.columns:
        continue
    numeric_stamp = F.col("time_stamp").cast("double")
    dataframes[log_key] = log_frame.withColumn(
        "time_stamp_ts", F.to_timestamp(numeric_stamp)
    )

# Simple missing value handling: fill numeric nulls with median and string nulls with mode
def fill_nulls(df, id_cols=None):
    """Fill nulls per column: approximate median for numerics, mode for strings/booleans.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to clean; it is not modified.
    id_cols : list of str, optional
        Column names to leave untouched. Names absent from ``df`` are ignored.

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with nulls replaced in every supported, non-id column.

    Notes
    -----
    * A numeric column without any usable value is filled with 0.
    * The mode is taken over non-null values only; a column that is entirely
      null is left as is.
    * Columns of any other type (timestamp, date, ...) are returned unchanged,
      because ``fillna`` only accepts int, float, string and bool replacements.
    * Every processed column costs one Spark job.
    """
    skipped = set(id_cols or ())
    # Type names as reported by `DataFrame.dtypes`. Decimals appear as
    # "decimal(p,s)", so they are matched by prefix below.
    plain_numeric = {"tinyint", "smallint", "int", "bigint", "float", "double"}
    mode_fillable = {"string", "boolean"}

    # Each replacement depends only on its own column, so all of them can be
    # collected first and applied in a single fillna call.
    fill_values = {}
    for c, dtype in df.dtypes:
        if c in skipped:
            continue
        if dtype in plain_numeric or dtype.startswith("decimal"):
            # approxQuantile drops nulls/NaNs and returns an empty list when
            # nothing is left, so no separate count() pass is needed.
            quantiles = df.approxQuantile(c, [0.5], 0.01)
            fill_values[c] = quantiles[0] if quantiles else 0
        elif dtype in mode_fillable:
            mode_row = (
                df.filter(F.col(c).isNotNull())  # null must never win as "mode"
                .groupBy(c)
                .count()
                # Secondary sort key makes ties resolve the same way on every run.
                .orderBy(F.desc("count"), F.col(c))
                .first()
            )
            if mode_row is not None:
                fill_values[c] = mode_row[0]
    return df.fillna(fill_values) if fill_values else df

# Identifier columns per frame that must not be imputed. Names that a frame
# does not contain are simply never matched by fill_nulls.
id_columns_by_frame = {
    "user": ["userid", "user", "nick"],
    "ad": ["adgroup_id"],
    "click": ["user", "adgroup_id"],
    "behaviour": ["user", "nick"],
}

# Run null handling on whichever frames were loaded.
cleaned_dataframes = {}
for frame_key, protected_cols in id_columns_by_frame.items():
    if frame_key in dataframes:
        cleaned_dataframes[frame_key] = fill_nulls(
            dataframes[frame_key], id_cols=protected_cols
        )

print("Null handling complete.")


Null handling complete.


In [9]:
# Write every cleaned frame twice: the full frame as Parquet, plus a
# single-file CSV sample of at most 50,000 rows for manual inspection.

# frame key -> (Parquet directory name, CSV sample directory name)
output_names = {
    "user": ("user_profile_clean.parquet", "user_profile_clean_sample_csv"),
    "ad": ("ad_feature_clean.parquet", "ad_feature_clean_sample_csv"),
    "click": ("raw_sample_clean.parquet", "raw_sample_clean_sample_csv"),
    "behaviour": ("behavior_log_clean.parquet", "behavior_log_clean_sample_csv"),
}

for frame_key, (parquet_name, csv_name) in output_names.items():
    if frame_key not in cleaned_dataframes:
        continue
    cleaned = cleaned_dataframes[frame_key]
    parquet_target = os.path.join(processed_dir, parquet_name)
    csv_target = os.path.join(processed_dir, csv_name)

    cleaned.write.mode("overwrite").parquet(parquet_target)

    # coalesce(1) makes Spark emit one CSV part file instead of one per partition.
    sample_writer = cleaned.limit(50_000).coalesce(1).write.mode("overwrite")
    sample_writer.option("header", True).csv(csv_target)

print("Cleaned data saved to:", processed_dir)


Cleaned data saved to: d:\projects\Ai\project_fusion_ecu\data\processed
