# 02 – Data Cleaning (PySpark)

In this notebook we clean and preprocess the data. We load the raw CSV files using PySpark (limiting to 1 million rows), drop duplicates, handle missing values, and perform outlier detection. Cleaned data are saved to the `data/processed` directory for later use.

Steps:
1. Load data from `data/raw` using PySpark.
2. Drop duplicate rows.
3. Fill missing values using appropriate strategies (median for numeric, mode for categorical).
4. Add a human-readable string version (`time_stamp_str`) of the Unix `time_stamp` column where it is present.
5. Save cleaned data as CSV files (written via pandas) in `data/processed` for downstream processing.


In [3]:
# ============================
# 0. Imports and Spark session
# ============================

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime
from pyspark.sql.types import NumericType
import os

# Build the local Spark session one setting at a time so each knob is easy to tweak.
session_builder = SparkSession.builder.appName("CTR_Project_Fusion_Ensemble")
# Run in local mode with 4 worker threads.
session_builder = session_builder.master("local[4]")
# Number of partitions Spark SQL uses for shuffles.
session_builder = session_builder.config("spark.sql.shuffle.partitions", "100")
# Driver memory setting; chosen to leave some RAM for the OS and the editor.
session_builder = session_builder.config("spark.driver.memory", "6g")
# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

# Only show warnings and errors from Spark to keep the cell output readable.
spark.sparkContext.setLogLevel("WARN")
print("Spark version:", spark.version)

# ============================
# 1. Define project and data paths
# ============================

# The project root is the parent of the current working directory, i.e. the
# notebook is expected to be run from <project_root>/notebooks.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
raw_dir, processed_dir = (
    os.path.join(project_root, "data", stage) for stage in ("raw", "processed")
)
# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(processed_dir, exist_ok=True)

for label, location in (
    ("Project root:", project_root),
    ("Raw data dir:", raw_dir),
    ("Processed data dir:", processed_dir),
):
    print(label, location)

# Logical dataset name -> CSV file name expected inside data/raw.
# Edit the right-hand side of an entry if your copy of a file is named differently
# (for example 'raw_behavior_log.csv' instead of 'behavior_log.csv').
file_names = dict(
    raw_sample="raw_sample.csv",
    ad_feature="ad_feature.csv",
    user_profile="user_profile.csv",
    behavior_log="behavior_log.csv",
)

# ============================
# 2. Verify that required files exist
# ============================

# Resolve each expected file to its full path and record whether it is present.
file_checks = [
    (key, path, os.path.exists(path))
    for key, path in (
        (key, os.path.join(raw_dir, fname)) for key, fname in file_names.items()
    )
]

# Report the status of every file, present or not.
for key, path, exists in file_checks:
    print(f"{key:15s} -> {path} | exists: {exists}")

# Paths that were not found, in the same order as file_names.
missing = [path for _, path, exists in file_checks if not exists]

# Fail early with the full list rather than on the first failed read later on.
if missing:
    message = (
        "The following required files are missing. "
        "Please copy them into data/raw with the exact names:\n"
    )
    raise FileNotFoundError(message + "\n".join(missing))

# ============================
# 3. Load up to 1M rows per file
# ============================

# Single place to change the per-file row cap used for every dataset.
MAX_ROWS_PER_FILE = 1_000_000


def load_raw_csv(dataset_key, max_rows=MAX_ROWS_PER_FILE):
    """
    Read one raw CSV file into a cached Spark DataFrame capped at `max_rows` rows.

    dataset_key: key of `file_names` identifying the file inside `raw_dir`.
    max_rows: maximum number of rows to keep (defaults to MAX_ROWS_PER_FILE).

    Returns the (lazy) cached DataFrame; the cache is only populated by the
    first action executed on it.
    """
    return (
        spark.read.csv(
            os.path.join(raw_dir, file_names[dataset_key]),
            header=True,       # first line holds the column names
            inferSchema=True,  # let Spark infer column types from the data
        )
        .limit(max_rows)
        .cache()
    )


print(f"\nLoading up to {MAX_ROWS_PER_FILE:,} rows from each file...")

user_df = load_raw_csv("user_profile")
ad_df = load_raw_csv("ad_feature")
click_df = load_raw_csv("raw_sample")
behavior_df = load_raw_csv("behavior_log")

# count() is an action, so these calls also populate the caches created above.
print("User rows:     ", user_df.count())
print("Ad rows:       ", ad_df.count())
print("Click rows:    ", click_df.count())
print("Behavior rows: ", behavior_df.count())

# ============================
# 4. Drop duplicate rows
# ============================

print("\nDropping exact duplicate rows...")


def _dedupe_cached(df):
    """
    Drop exact duplicate rows and cache the result.

    Spark evaluates lazily, so without caching every later action on the
    deduplicated frame (counts, the per-column imputation jobs, toPandas)
    would re-run the dropDuplicates shuffle.

    Returns a tuple (deduplicated cached DataFrame, its row count).
    """
    deduped = df.dropDuplicates().cache()
    # count() is an action: it materializes the new cache before the old one is released.
    row_count = deduped.count()
    # The input frame is superseded by `deduped`; release its cached blocks.
    df.unpersist()
    return deduped, row_count


user_df, user_rows = _dedupe_cached(user_df)
ad_df, ad_rows = _dedupe_cached(ad_df)
click_df, click_rows = _dedupe_cached(click_df)
behavior_df, behavior_rows = _dedupe_cached(behavior_df)

print("User rows after dropDuplicates:", user_rows)
print("Ad rows after dropDuplicates:  ", ad_rows)
print("Click rows after dropDuplicates:", click_rows)
print("Behavior rows after dropDuplicates:", behavior_rows)

# ============================
# 5. Helper function: fill missing values
#    - Numeric columns: median
#    - Categorical columns: mode (most frequent value)
# ============================

from pyspark.sql import functions as F

def fill_missing(df, id_cols=None):
    """
    Fill missing values in a Spark DataFrame.

    - Numeric columns: filled with the approximate median (relative error 0.01).
      A numeric column with no non-null values is filled with 0.
    - Non-numeric columns: filled with the mode (most frequent non-null value).
      Ties are broken by taking the smallest value so the result is
      reproducible between runs. A column with no non-null values is left as is.

    df: input Spark DataFrame (not modified; a new DataFrame is returned).
    id_cols: columns that should not be imputed (e.g., IDs).

    Returns a DataFrame with the same columns as `df`.
    """
    excluded = set(id_cols or [])

    # Split the imputable columns by type, preserving schema order.
    numeric_cols, other_cols = [], []
    for field in df.schema.fields:
        if field.name in excluded:
            continue
        if isinstance(field.dataType, NumericType):
            numeric_cols.append(field.name)
        else:
            other_cols.append(field.name)

    # Replacement value per column; applied in a single fillna call at the end.
    fill_values = {}

    if numeric_cols:
        # One call computes the medians of all numeric columns. approxQuantile
        # ignores nulls and returns an empty list for a column with no values.
        medians = df.approxQuantile(numeric_cols, [0.5], 0.01)
        for col_name, quantiles in zip(numeric_cols, medians):
            # No non-null values at all -> fill with 0 as a safe default.
            fill_values[col_name] = quantiles[0] if quantiles else 0

    for col_name in other_cols:
        mode_row = (
            df.filter(F.col(col_name).isNotNull())
            .groupBy(col_name)
            .count()
            # Secondary sort key makes the choice deterministic when counts tie.
            .orderBy(F.desc("count"), F.asc(col_name))
            .first()
        )
        if mode_row is None:
            # Column is entirely null -> nothing to impute from.
            continue

        mode_val = mode_row[0]
        if isinstance(mode_val, (str, bool)):
            fill_values[col_name] = mode_val
        else:
            # fillna only accepts int, float, string or bool replacement values,
            # so other column types (e.g. dates, timestamps) cannot be imputed here.
            print(
                f"fill_missing: skipping column '{col_name}' "
                f"(mode of type {type(mode_val).__name__} is not supported by fillna)"
            )

    if not fill_values:
        return df
    return df.fillna(fill_values)

# ============================
# 6. Apply missing value imputation
# ============================

print("\nFilling missing values (median for numeric, mode for categorical)...")

# Identifier columns per dataset that must be left untouched by the imputation.
protected_columns = {
    "user_profile": ["userid", "user"],
    "ad_feature": ["adgroup_id"],
    "raw_sample": ["user", "adgroup_id"],
    "behavior_log": ["user", "nick"],
}

user_df = fill_missing(user_df, id_cols=protected_columns["user_profile"])
ad_df = fill_missing(ad_df, id_cols=protected_columns["ad_feature"])
click_df = fill_missing(click_df, id_cols=protected_columns["raw_sample"])
behavior_df = fill_missing(behavior_df, id_cols=protected_columns["behavior_log"])

print("Missing value imputation completed.")

# ============================
# 7. Safe timestamp handling
#    - On Windows, toPandas() with TimestampType can fail if values are out of range.
#    - The Unix time is therefore rendered into a STRING column rather than a
#      Spark TimestampType column.
# ============================

print("\nAdding safe timestamp string columns...")

# Column expression shared by both DataFrames: it is only resolved against a
# concrete frame when passed to withColumn.
unix_time_as_text = from_unixtime(col("time_stamp").cast("bigint")).cast("string")

# Only add the derived column to frames that actually carry a time_stamp column.
if "time_stamp" in click_df.columns:
    click_df = click_df.withColumn("time_stamp_str", unix_time_as_text)

if "time_stamp" in behavior_df.columns:
    behavior_df = behavior_df.withColumn("time_stamp_str", unix_time_as_text)

# ============================
# 8. Convert to Pandas and save as CSV (no Hadoop / winutils issues)
# ============================

print("\nConverting Spark DataFrames to Pandas for saving... "
      "This may take some time for 1M rows each.")

# toPandas() collects every row of the frame into the driver process.
user_pdf = user_df.toPandas()
ad_pdf = ad_df.toPandas()
click_pdf = click_df.toPandas()
behavior_pdf = behavior_df.toPandas()

print("Saving cleaned data as CSV (sufficient for the project and easy to use later).")

# Output file name -> pandas frame, written in this order without the index column.
cleaned_outputs = (
    ("user_profile_clean.csv", user_pdf),
    ("ad_feature_clean.csv", ad_pdf),
    ("raw_sample_clean.csv", click_pdf),
    ("behavior_log_clean.csv", behavior_pdf),
)
for csv_name, cleaned_pdf in cleaned_outputs:
    cleaned_pdf.to_csv(os.path.join(processed_dir, csv_name), index=False)

print("\nSaved cleaned CSVs into:", processed_dir)

# ============================
# 9. Stop Spark session
# ============================

# The pandas frames above stay available after Spark is shut down.
spark.stop()
print("\nSpark session stopped.")


Spark version: 4.0.1
Project root: d:\projects\Ai\project_fusion_ecu
Raw data dir: d:\projects\Ai\project_fusion_ecu\data\raw
Processed data dir: d:\projects\Ai\project_fusion_ecu\data\processed
raw_sample      -> d:\projects\Ai\project_fusion_ecu\data\raw\raw_sample.csv | exists: True
ad_feature      -> d:\projects\Ai\project_fusion_ecu\data\raw\ad_feature.csv | exists: True
user_profile    -> d:\projects\Ai\project_fusion_ecu\data\raw\user_profile.csv | exists: True
behavior_log    -> d:\projects\Ai\project_fusion_ecu\data\raw\behavior_log.csv | exists: True

Loading up to 1,000,000 rows from each file...
User rows:      1000000
Ad rows:        846811
Click rows:     1000000
Behavior rows:  1000000

Dropping exact duplicate rows...
User rows after dropDuplicates: 1000000
Ad rows after dropDuplicates:   846811
Click rows after dropDuplicates: 1000000
Behavior rows after dropDuplicates: 930695

Filling missing values (median for numeric, mode for categorical)...
Missing value imputatio