# 04 – Data Preprocessing (pandas / scikit-learn)

This notebook prepares the fused data for machine learning.  We remove columns that leak the target, split the data into training and testing sets at the user level (so that no user appears in both sets), impute missing values, scale numerical features, and one‑hot encode categorical features.  We then handle class imbalance by oversampling the minority class of the training set only, using SMOTE from the `imbalanced-learn` library on the data loaded with Pandas.  Our goal is to create a balanced and well‑structured dataset for model training.


In [None]:
# ============================================
# 0. Imports and configuration
# ============================================

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE
from scipy import sparse
from joblib import dump
from pathlib import Path

# ============================================
# 1. Load fused data
# ============================================

# Paths are resolved from the current working directory, which is expected to
# be the project's notebooks folder
# (e.g. D:/projects/Ai/project_fusion_ecu/notebooks).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
processed_dir = os.path.join(project_root, "data", "processed")

print("Project root:", project_root)
print("Processed dir:", processed_dir)

fused_path = os.path.join(processed_dir, "fused_data.csv")

# Load the fused table, or stop with an actionable message when it is missing.
if os.path.exists(fused_path):
    df = pd.read_csv(fused_path)
else:
    raise FileNotFoundError(
        f"Fused data file not found at: {fused_path}\n"
        "Run 03_eda_fusion.ipynb first to generate fused_data.csv."
    )

print("Full fused data shape:", df.shape)

# ============================================
# 2. Optional sampling for faster processing
# ============================================

# Upper bound on the number of rows processed; adjust if needed.
MAX_ROWS = 200_000  # adjust if needed

if len(df) <= MAX_ROWS:
    print("Using all rows (no sampling).")
else:
    # Fixed seed so the same subset is drawn on every run.
    df = df.sample(n=MAX_ROWS, random_state=42)
    print(f"Sampled down to {MAX_ROWS} rows.")

print("Working dataframe shape:", df.shape)

# ============================================
# 3. Choose target and user ID columns
# ============================================

# Target: the first of 'label', 'clk' that exists in the dataframe.
target_col = next(
    (name for name in ("label", "clk") if name in df.columns), None
)
if target_col is None:
    raise ValueError('No target column found. Expected "label" or "clk".')

print("Target column:", target_col)

# User ID column for the user-level split: first of 'user', 'userid', 'nick'.
user_col = next(
    (name for name in ("user", "userid", "nick") if name in df.columns), None
)
if user_col is None:
    raise ValueError('No user ID column found. Expected "user", "userid" or "nick".')

print("User ID column:", user_col)

# Rows without a user or a target cannot be split or learned from.
required_cols = [user_col, target_col]
df = df.dropna(subset=required_cols)
print("Shape after dropping rows with null user/target:", df.shape)

# ============================================
# 4. Remove leakage columns (VERY IMPORTANT)
# ============================================

# Columns that directly leak the target or timestamp:
# - 'clk'     : original binary click label
# - 'nonclk'  : complement of clk, leaks label perfectly
# - 'noclk'   : alternative spelling (just in case)
# - 'impressions' : if created, clk + nonclk
# - time_stamp columns: can be too granular and duplicate user behavior
leakage_cols = [
    "clk",
    "nonclk",
    "noclk",
    "impressions",
    "time_stamp",
    "time_stamp_str",
]


def _leak_base_name(column):
    """Return the column name without padding or a pandas ``.N`` suffix.

    The fused CSV contains headers with surrounding whitespace and
    de-duplicated names such as ``user.1``; an exact-name comparison would
    let ``' clk'`` or ``'clk.1'`` slip through as features.
    """
    name = str(column).strip()
    stem, dot, suffix = name.rpartition(".")
    if dot and suffix.isdigit():
        name = stem.strip()
    return name


for leak in leakage_cols:
    # The target column itself is never removed, even when it is 'clk'.
    matches = [
        c for c in df.columns
        if c != target_col and _leak_base_name(c) == leak
    ]
    for c in matches:
        print("REMOVED LEAKAGE COLUMN:", c)
    if matches:
        df = df.drop(columns=matches)

print("Columns after leakage removal:")
print(df.columns.tolist())

# ============================================
# 5. User-level train/test split
# ============================================

# Split the set of users (not the rows) 80/20, then assign each row to the
# side its user landed on.
unique_users = df[user_col].unique()
print("Number of unique users:", len(unique_users))

train_users, test_users = train_test_split(
    unique_users,
    test_size=0.2,
    random_state=42,
)

in_train = df[user_col].isin(train_users)
in_test = df[user_col].isin(test_users)

train_df = df[in_train].copy()
test_df = df[in_test].copy()

print("Train_df shape:", train_df.shape)
print("Test_df shape:", test_df.shape)

# ============================================
# 6. Build X and y (drop IDs)
# ============================================

def build_xy(sub_df, target_col, user_col):
    """Split a dataframe into a feature frame ``X`` and an integer target ``y``.

    ID-like columns are removed from the features. A column counts as ID-like
    when its name, after stripping surrounding whitespace and a pandas
    duplicate suffix (``.1``, ``.2``, ...), is exactly one of the known
    identifier names or ``user_col``. Exact matching is deliberate: substring
    matching would also discard genuine features such as
    ``new_user_class_level`` merely because the name contains "user".

    Args:
        sub_df: Input dataframe containing ``target_col``.
        target_col: Name of the target column; cast to ``int`` for ``y``.
        user_col: Name of the user identifier column; always dropped from ``X``.

    Returns:
        Tuple ``(X, y)``: the feature dataframe and the target series.
    """

    def _canonical(column):
        # ' campaign_id' -> 'campaign_id', 'adgroup_id.1' -> 'adgroup_id'
        name = str(column).strip()
        stem, dot, suffix = name.rpartition(".")
        if dot and suffix.isdigit():
            name = stem.strip()
        return name

    id_names = {
        "user",
        "userid",
        "nick",
        "adgroup_id",
        "campaign_id",
        "customer",
        "pid",
        _canonical(user_col),
    }

    y = sub_df[target_col].astype(int)
    X = sub_df.drop(columns=[target_col])

    # Drop ID-like columns (including duplicated names like user.1, adgroup_id.1)
    id_like_cols = [c for c in X.columns if _canonical(c) in id_names]

    if id_like_cols:
        print("Dropping ID-like columns:", id_like_cols)
        X = X.drop(columns=id_like_cols)

    return X, y

# Build features/targets for both sides of the split with the same helper.
X_train, y_train = build_xy(train_df, target_col, user_col)
X_test, y_test = build_xy(test_df, target_col, user_col)

# Report the raw (pre-transformation) shapes.
for shape_caption, shaped_obj in (
    ("X_train shape (raw):", X_train),
    ("X_test shape  (raw):", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(shape_caption, shaped_obj.shape)

# Report the class balance of each side as relative frequencies.
for dist_caption, target_series in (
    ("\ny_train distribution:", y_train),
    ("\ny_test distribution:", y_test),
):
    print(dist_caption)
    print(target_series.value_counts(normalize=True))

# ============================================
# 7. Identify numeric and categorical features
# ============================================

# Object/category dtypes are treated as categorical; every other column is
# treated as numeric.
categorical_frame = X_train.select_dtypes(include=["object", "category"])
categorical_cols = categorical_frame.columns.tolist()

categorical_lookup = set(categorical_cols)
numeric_cols = [
    column for column in X_train.columns if column not in categorical_lookup
]

print("\nNumeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

# ============================================
# 8. Preprocessing pipeline (imputer + encoder + scaler)
# ============================================

# Numeric features: fill gaps with the median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# ============================================
# 9. Fit preprocessing and transform
# ============================================

print("\nFitting preprocessing pipeline on X_train...")

# The preprocessor is fitted on the training features only; the test features
# are transformed with the already-fitted preprocessor.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(f"X_train_transformed: {X_train_transformed.shape}")
print(f"X_test_transformed: {X_test_transformed.shape}")

def has_nan(matrix):
    """Return whether *matrix* contains at least one NaN.

    For a SciPy sparse matrix only the explicitly stored values
    (``matrix.data``) are inspected; any other input is checked element-wise.
    """
    values = matrix.data if sparse.issparse(matrix) else matrix
    return np.isnan(values).any()

# Report NaN presence for both matrices; only NaNs in the training matrix
# abort the run.
train_contains_nan = has_nan(X_train_transformed)
print("NaN in X_train_transformed:", train_contains_nan)

test_contains_nan = has_nan(X_test_transformed)
print("NaN in X_test_transformed:", test_contains_nan)

if train_contains_nan:
    raise ValueError("NaNs detected in X_train_transformed. Check imputers.")

# ============================================
# 10. SMOTE on preprocessed training data
# ============================================

print("\nApplying SMOTE to handle class imbalance...")

# Only the training matrix is resampled; the test matrix is left as is.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_transformed, y_train)
X_train_resampled, y_train_resampled = resampled_pair

print("X_train_resampled shape:", X_train_resampled.shape)

# Class balance after oversampling, as relative frequencies.
resampled_balance = pd.Series(y_train_resampled).value_counts(normalize=True)
print("y_train_resampled distribution:")
print(resampled_balance)

# ============================================
# 11. Save NPZ matrices, labels, and preprocessor
# ============================================

Path(processed_dir).mkdir(parents=True, exist_ok=True)

X_train_resampled_path = os.path.join(processed_dir, "X_train_resampled.npz")
X_test_path = os.path.join(processed_dir, "X_test_transformed.npz")
y_train_resampled_path = os.path.join(processed_dir, "y_train_resampled.csv")
y_test_path = os.path.join(processed_dir, "y_test.csv")
preprocessor_path = os.path.join(processed_dir, "preprocessor.joblib")

# sparse.save_npz only accepts SciPy sparse matrices, but ColumnTransformer
# returns a dense ndarray when no transformer emits sparse output (e.g. no
# categorical columns) or when the result is denser than its sparse_threshold.
# Coercing to CSR makes saving work for both cases and keeps the on-disk
# format identical for downstream sparse.load_npz calls.
sparse.save_npz(X_train_resampled_path, sparse.csr_matrix(X_train_resampled))
sparse.save_npz(X_test_path, sparse.csr_matrix(X_test_transformed))

# Labels are written as single-column CSV files named after the target column.
pd.Series(y_train_resampled, name=target_col).to_csv(y_train_resampled_path, index=False)
y_test.to_csv(y_test_path, index=False)

# Persist the fitted preprocessor alongside the transformed matrices.
dump(preprocessor, preprocessor_path)

print("\nSaved:")
print(" -", X_train_resampled_path)
print(" -", X_test_path)
print(" -", y_train_resampled_path)
print(" -", y_test_path)
print(" -", preprocessor_path)

print("\n[Preprocessing completed successfully – data is ready for model training.]")


Project root: d:\projects\Ai\project_fusion_ecu
Processed dir: d:\projects\Ai\project_fusion_ecu\data\processed
Full fused data shape: (1000000, 28)
Sampled down to 200000 rows.
Working dataframe shape: (200000, 28)
Target column: label
User ID column: user
Shape after dropping rows with null user/target: (200000, 28)
REMOVED LEAKAGE COLUMN: clk
REMOVED LEAKAGE COLUMN: nonclk
REMOVED LEAKAGE COLUMN: time_stamp
REMOVED LEAKAGE COLUMN: time_stamp_str
Columns after leakage removal:
['user', 'adgroup_id', 'pid', 'userid', 'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level', 'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level ', 'adgroup_id.1', ' cate_id', ' campaign_id', ' customer', ' brand ', ' price', 'user.1', 'buy', 'cart', 'fav', 'pv', 'label']
Number of unique users: 130800
Train_df shape: (159964, 24)
Test_df shape: (40036, 24)
Dropping ID-like columns: ['user', 'adgroup_id', 'pid', 'userid', 'new_user_class_level ', 'adgroup_id.1', ' campaign_id', ' cus

: 