# 04 – Data Preprocessing (pandas / scikit-learn)

This notebook prepares the fused data for machine learning.  We encode categorical features (label encoding for low‑cardinality columns, out‑of‑fold target encoding for high‑cardinality columns), impute and scale numerical features, and split the data into training and testing sets.  Class imbalance is handled by computing `scale_pos_weight` on the training set rather than by oversampling the minority class.  Our goal is to create a well‑structured, leakage‑checked dataset for model training.


In [4]:
import os
import numpy as np
import pandas as pd

from pathlib import Path
from scipy import sparse
from joblib import dump

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

# ==========================================================
# 1) Paths + Load fused data
# ==========================================================
# The project root is taken to be the parent of the current working
# directory; data/processed is created up front so later writes cannot fail
# on a missing folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
processed_dir = os.path.join(project_root, "data", "processed")
os.makedirs(processed_dir, exist_ok=True)

# The fused CSV is the only input of this cell; stop early with a hint when
# it has not been produced yet.
fused_path = os.path.join(processed_dir, "fused_data.csv")
if not os.path.exists(fused_path):
    raise FileNotFoundError(f"Missing: {fused_path}\nRun fusion notebook first.")

df = pd.read_csv(fused_path)
print("Loaded fused:", df.shape)

# ==========================================================
# 2) Target + user + time
# ==========================================================
# Each role (target, user id, timestamp) is resolved to the first candidate
# column name that exists in the frame. Target and user are mandatory; the
# time column is optional and stays None when no candidate is present.
target_col = next((name for name in ("label", "clk") if name in df.columns), None)
if target_col is None:
    raise ValueError('No target column found ("label" or "clk").')

user_col = next((name for name in ("user", "userid", "nick") if name in df.columns), None)
if user_col is None:
    raise ValueError('No user ID column found ("user" or "userid" or "nick").')

time_col = next(
    (
        name
        for name in ("time_stamp", "timestamp", "date_time", "datetime", "time")
        if name in df.columns
    ),
    None,
)

print("target:", target_col, "| user:", user_col, "| time:", time_col)

# Rows with a null in any key column are discarded.
drop_cols = [user_col, target_col]
if time_col:
    drop_cols.append(time_col)
before = len(df)
df = df.dropna(subset=drop_cols)
print("Dropped null rows:", before - len(df))

# The target is stored as an integer column from here on.
df[target_col] = df[target_col].astype(int)

# ==========================================================
# 3) Cap rows (optional)
# ==========================================================
MAX_ROWS = 200_000

# With a time column the frame is always put in chronological order first,
# whether or not the cap is exceeded.
if time_col is not None:
    df = df.sort_values(time_col).reset_index(drop=True)

if len(df) > MAX_ROWS:
    if time_col is not None:
        # Keep the most recent MAX_ROWS rows.
        df = df.iloc[-MAX_ROWS:].copy()
        print(f"Temporal tail kept: {MAX_ROWS}")
    else:
        # No ordering available: take a reproducible random sample.
        df = df.sample(n=MAX_ROWS, random_state=42).copy()
        print(f"Random sample kept: {MAX_ROWS}")

print("Working df:", df.shape, "pos_rate:", df[target_col].mean())

# ==========================================================
# 4) Remove obvious leakage columns (names)
# ==========================================================
# Any column whose lower-cased name contains one of these substrings is
# treated as a target / after-click aggregate and removed.
name_blacklist_substrings = [
    "label", "clk", "click", "nonclk", "noclk", "impression", "ctr", "conversion",
    "time_stamp_str", "is_click", "clicked"
]

# The target, the user id (dropped later) and the time column (needed for the
# split) are never removed here, whatever their names look like.
protected = {target_col, user_col}
if time_col is not None:
    protected.add(time_col)

removed = [
    c
    for c in df.columns
    if c not in protected
    and any(s in c.lower() for s in name_blacklist_substrings)
]

if removed:
    df.drop(columns=removed, inplace=True)
    print("Removed by name blacklist:", removed)

# ==========================================================
# 5) Temporal split 80/20 (fallback stratified)
# ==========================================================
if time_col is None:
    # No timestamp available: fall back to a stratified random 80/20 split.
    train_df, test_df = train_test_split(
        df, test_size=0.2, stratify=df[target_col], random_state=42
    )
    print("Stratified split (no time col).")
else:
    # Chronological split: the earliest 80% of rows train, the rest test.
    df = df.sort_values(time_col).reset_index(drop=True)
    split_idx = int(len(df) * 0.8)
    train_df, test_df = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
    print("Temporal split cutoff:", df.iloc[split_idx][time_col])

print("train:", train_df.shape, "pos:", train_df[target_col].mean())
print("test :", test_df.shape, "pos:", test_df[target_col].mean())

# ==========================================================
# 6) Build X/y
# ==========================================================
def build_xy(sub_df: pd.DataFrame):
    """Split a frame into a feature matrix and an integer target vector.

    The target column is cast to int and returned as ``y``. The feature
    matrix ``X`` is ``sub_df`` without the target column, without the time
    column (when one is configured; it is used for splitting only) and
    without the user-id column. Columns that are absent are skipped silently.

    Returns:
        A ``(X, y)`` tuple; ``sub_df`` itself is not modified.
    """
    y = sub_df[target_col].astype(int)

    # Collect every non-feature column and remove them in a single pass.
    non_feature_cols = [target_col, user_col]
    if time_col is not None:
        non_feature_cols.append(time_col)

    X = sub_df.drop(columns=non_feature_cols, errors="ignore")
    return X, y

# Feature matrices and integer targets for both partitions.
(X_train, y_train), (X_test, y_test) = build_xy(train_df), build_xy(test_df)

print("X_train raw:", X_train.shape, "X_test raw:", X_test.shape)

# ==========================================================
# 7) Feature typing
# ==========================================================
# Columns that are stored as numbers but are really identifiers / codes and
# must therefore be encoded as categories rather than scaled as quantities.
force_categorical = {
    "adgroup_id", "pid", "campaign_id", "customer", "cate_id", "brand",
    "cms_segid", "cms_group_id", "final_gender_code", "age_level",
    "shopping_level", "occupation", "pvalue_level", "new_user_class_level",
}
force_categorical = {c for c in force_categorical if c in X_train.columns}

# Start from the columns pandas already types as object/category ...
categorical_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)
# ... then append the forced ones in X_train column order. Iterating the set
# directly would make the order (and every list derived and saved from it)
# depend on string hash randomisation, i.e. differ between interpreter runs.
categorical_cols += [
    c
    for c in X_train.columns
    if c in force_categorical and c not in categorical_cols
]

# Everything that is not categorical is treated as numeric.
numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

print("categorical:", len(categorical_cols), "numeric:", len(numeric_cols))

# ==========================================================
# 8) Missing indicators + log1p
# ==========================================================
# Binary "was this value missing" flags for a fixed set of key columns; the
# flags are registered as numeric features.
key_missing_cols = [
    c
    for c in ["price", "age_level", "final_gender_code", "shopping_level", "pvalue_level", "cms_segid"]
    if c in X_train.columns
]
for c in key_missing_cols:
    flag_col = f"is_missing_{c}"
    for frame in (X_train, X_test):
        frame[flag_col] = frame[c].isna().astype(int)
    if flag_col not in numeric_cols:
        numeric_cols.append(flag_col)

# Behaviour counters are coerced to numbers (unparseable values become NaN)
# and compressed with log1p.
for c in ["pv", "cart", "fav", "buy"]:
    if c not in X_train.columns:
        continue
    for frame in (X_train, X_test):
        frame[c] = np.log1p(pd.to_numeric(frame[c], errors="coerce"))

# ==========================================================
# 9) scale_pos_weight
# ==========================================================
# Ratio of negatives to positives in the training targets; falls back to 1.0
# when there are no positives so the division is never by zero.
pos = int(y_train.sum())
neg = int(len(y_train) - pos)
if pos > 0:
    scale_pos_weight = neg / pos
else:
    scale_pos_weight = 1.0
print("scale_pos_weight:", scale_pos_weight)

# ==========================================================
# 10) Encoding (NO OHE)
# ==========================================================
HIGH_CARD_THRESHOLD = 30
SMOOTHING = 50
N_SPLITS_TE = 5
RANDOM_STATE = 42
global_mean = float(y_train.mean())


def _smoothed_target_means(keys, targets):
    """Return ``{category: smoothed target mean}`` for one categorical column.

    Each category mean is shrunk toward ``global_mean`` with weight
    ``SMOOTHING``:
    ``(count * mean + SMOOTHING * global_mean) / (count + SMOOTHING)``.
    Rows whose key is missing are ignored (groupby drops them).

    Args:
        keys: Series of category values (already cast to string dtype).
        targets: target values positionally aligned with ``keys``.
    """
    stats = (
        pd.DataFrame({"key": keys, "y": np.asarray(targets)})
        .groupby("key")["y"]
        .agg(["count", "mean"])
    )
    smoothed = (stats["count"] * stats["mean"] + SMOOTHING * global_mean) / (
        stats["count"] + SMOOTHING
    )
    return {str(cat): float(val) for cat, val in smoothed.items()}


X_train_enc = X_train.copy()
X_test_enc  = X_test.copy()

# Categories are compared as strings so numeric codes and text labels are
# handled uniformly; missing values stay missing.
for c in categorical_cols:
    X_train_enc[c] = X_train_enc[c].astype("string")
    X_test_enc[c]  = X_test_enc[c].astype("string")

# Cardinality is measured on the training data only.
high_card_cols, low_card_cols = [], []
for c in categorical_cols:
    nunq = X_train_enc[c].nunique(dropna=True)
    (high_card_cols if nunq > HIGH_CARD_THRESHOLD else low_card_cols).append(c)

# low-card label encoding: sorted training categories -> 0..k-1,
# missing or unseen categories -> -1
low_card_mappings = {}
for c in low_card_cols:
    cats = X_train_enc[c].dropna().unique().tolist()
    cats = sorted([str(x) for x in cats])
    mapping = {cat: i for i, cat in enumerate(cats)}
    low_card_mappings[c] = mapping
    X_train_enc[c] = X_train_enc[c].map(mapping).fillna(-1).astype(int)
    X_test_enc[c]  = X_test_enc[c].map(mapping).fillna(-1).astype(int)

# high-card target encoding (OOF): every training row is encoded with
# statistics computed on the other folds, so its own label never leaks in.
high_card_mappings = {}
if high_card_cols:
    skf = StratifiedKFold(n_splits=N_SPLITS_TE, shuffle=True, random_state=RANDOM_STATE)
    # The folds depend only on y_train and the fixed seed, so they are
    # computed once and reused for every column.
    folds = list(skf.split(X_train_enc, y_train))
    for c in high_card_cols:
        oof = pd.Series(index=X_train_enc.index, dtype=float)

        for tr_idx, val_idx in folds:
            enc_map = _smoothed_target_means(
                X_train_enc[c].iloc[tr_idx], y_train.iloc[tr_idx]
            )
            # Positional assignment of the encoded validation rows.
            oof.iloc[val_idx] = (
                X_train_enc[c].iloc[val_idx].map(enc_map).astype(float).to_numpy()
            )

        # Categories unseen in the fitting folds fall back to the global mean.
        X_train_enc[c] = oof.fillna(global_mean).astype(float)

        # The test set (and the saved preprocessor) use statistics from the
        # whole training set.
        final_map = _smoothed_target_means(X_train[c].astype("string"), y_train)
        high_card_mappings[c] = {"mapping": final_map, "global_mean": global_mean, "smoothing": SMOOTHING}
        X_test_enc[c] = X_test[c].astype("string").map(final_map).fillna(global_mean).astype(float)

# now everything numeric
for c in categorical_cols:
    if c not in numeric_cols:
        numeric_cols.append(c)

numeric_cols = [c for c in numeric_cols if c in X_train_enc.columns]

# force float
X_train_enc[numeric_cols] = X_train_enc[numeric_cols].apply(pd.to_numeric, errors="coerce").astype(np.float64)
X_test_enc[numeric_cols]  = X_test_enc[numeric_cols].apply(pd.to_numeric, errors="coerce").astype(np.float64)

# Infinite values (e.g. log1p(-1) == -inf) are rejected by SimpleImputer, so
# they are treated as missing and imputed like any other gap.
X_train_enc[numeric_cols] = X_train_enc[numeric_cols].replace([np.inf, -np.inf], np.nan)
X_test_enc[numeric_cols]  = X_test_enc[numeric_cols].replace([np.inf, -np.inf], np.nan)

# SimpleImputer discards columns that are entirely missing at fit time, which
# would make its output narrower than numeric_cols and break the assignment
# below. Such columns carry no information, so drop them explicitly.
all_missing_cols = [c for c in numeric_cols if X_train_enc[c].isna().all()]
if all_missing_cols:
    print("Dropped all-missing columns:", all_missing_cols)
    X_train_enc = X_train_enc.drop(columns=all_missing_cols)
    X_test_enc  = X_test_enc.drop(columns=all_missing_cols, errors="ignore")
    numeric_cols = [c for c in numeric_cols if c not in all_missing_cols]

# impute + scale (both fitted on train only, then applied to test)
imputer = SimpleImputer(strategy="median")
scaler  = StandardScaler()

Xtr_num = scaler.fit_transform(imputer.fit_transform(X_train_enc[numeric_cols]))
Xte_num = scaler.transform(imputer.transform(X_test_enc[numeric_cols]))

X_train_enc.loc[:, numeric_cols] = Xtr_num.astype(np.float64)
X_test_enc.loc[:, numeric_cols]  = Xte_num.astype(np.float64)

# align columns
X_test_enc = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0.0)

print("Encoded shapes:", X_train_enc.shape, X_test_enc.shape)

# ==========================================================
# 11) LEAKAGE GUARD (critical)
# ==========================================================
# 11.1 exact equality checks (y or 1-y)
# The columns have already been standardised at this point, so a leaked 0/1
# column no longer holds the values 0 and 1 and plain rounding would miss it.
# Besides the rounding check, every column with exactly two distinct values
# is therefore re-binarised (lower level -> 0, upper level -> 1) and compared
# with the target and with its complement.
leak_cols_exact = []
y_arr = y_train.values.astype(int)

for c in X_train_enc.columns:
    col_num = np.asarray(X_train_enc[c].values, dtype=float)
    if col_num.shape[0] != y_arr.shape[0] or not np.all(np.isfinite(col_num)):
        continue

    binary_views = [np.round(col_num).astype(int)]
    levels = np.unique(col_num)
    if levels.size == 2:
        binary_views.append((col_num == levels[1]).astype(int))

    if any(
        np.array_equal(view, y_arr) or np.array_equal(1 - view, y_arr)
        for view in binary_views
    ):
        leak_cols_exact.append(c)

# 11.2 single-feature AUC scan on a holdout split inside TRAIN
# if a single feature yields AUC ~ 1, it's almost surely leakage
Xtr_sub, Xva_sub, ytr_sub, yva_sub = train_test_split(
    X_train_enc, y_train, test_size=0.2, stratify=y_train, random_state=42
)

leak_auc_rows = []
leak_cols_auc = []
for c in X_train_enc.columns:
    x = Xva_sub[c].values.astype(float)
    if np.nanstd(x) < 1e-12:
        # constant on the holdout: AUC carries no information
        continue
    try:
        auc = roc_auc_score(yva_sub, x)
    except ValueError as exc:
        # Raised e.g. for non-finite scores; report the skip instead of
        # hiding it so the scan stays best-effort but visible.
        print(f"[LEAKAGE GUARD] AUC skipped for {c}: {exc}")
        continue
    auc = max(auc, 1.0 - auc)  # make it symmetric
    leak_auc_rows.append((c, float(auc)))
    if auc > 0.999:  # near-perfect from single feature -> leakage
        leak_cols_auc.append(c)

leakage_report = pd.DataFrame(leak_auc_rows, columns=["feature", "single_feature_auc"]).sort_values("single_feature_auc", ascending=False)
leak_report_path = os.path.join(processed_dir, "leakage_report.csv")
leakage_report.to_csv(leak_report_path, index=False)
print("Saved leakage report:", leak_report_path)

leak_all = sorted(set(leak_cols_exact + leak_cols_auc))
if leak_all:
    print("\n[LEAKAGE GUARD] Dropping leaked features:", leak_all)
    X_train_enc = X_train_enc.drop(columns=leak_all, errors="ignore")
    X_test_enc  = X_test_enc.drop(columns=leak_all, errors="ignore")
else:
    print("\n[LEAKAGE GUARD] No obvious leaked features found by exact/AUC scans.")

# final align
X_test_enc = X_test_enc.reindex(columns=X_train_enc.columns, fill_value=0.0)
print("After leakage guard:", X_train_enc.shape, X_test_enc.shape)

# ==========================================================
# 12) Feature importance (optional, keep all)
# ==========================================================
# Gradient-boosted model fitted on the encoded training data purely to rank
# features; the class weight computed earlier is passed through.
xgb_importance_params = {
    "n_estimators": 600,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
    "scale_pos_weight": scale_pos_weight,
    "tree_method": "hist",
}
importance_model = XGBClassifier(**xgb_importance_params)
importance_model.fit(X_train_enc, y_train)

imp_df = pd.DataFrame(
    {
        "feature": X_train_enc.columns,
        "importance": importance_model.feature_importances_,
    }
).sort_values("importance", ascending=False)
imp_path = os.path.join(processed_dir, "feature_importances.csv")
imp_df.to_csv(imp_path, index=False)
print("Saved importances:", imp_path)

# No feature is discarded here: the list holds every column, merely ordered
# from most to least important, and both matrices are reordered to match.
top_features = imp_df["feature"].tolist()
X_train_final = X_train_enc[top_features]
X_test_final  = X_test_enc[top_features]

# ==========================================================
# 13) Save outputs
# ==========================================================
# Destination files, all inside the processed-data directory.
X_train_path = os.path.join(processed_dir, "X_train_processed.npz")
X_test_path = os.path.join(processed_dir, "X_test_processed.npz")
y_train_path = os.path.join(processed_dir, "y_train.csv")
y_test_path = os.path.join(processed_dir, "y_test.csv")
top_features_path = os.path.join(processed_dir, "top_features.csv")
scale_pos_weight_path = os.path.join(processed_dir, "scale_pos_weight.txt")
preprocessor_path = os.path.join(processed_dir, "preprocessor.joblib")

# Feature matrices are written in SciPy CSR .npz format.
for matrix_path, matrix_frame in ((X_train_path, X_train_final), (X_test_path, X_test_final)):
    sparse.save_npz(matrix_path, sparse.csr_matrix(matrix_frame.values))

# Targets are written as single-column CSVs named after the target column.
for labels_path, labels in ((y_train_path, y_train), (y_test_path, y_test)):
    pd.Series(labels, name=target_col).to_csv(labels_path, index=False)

with open(scale_pos_weight_path, "w") as f:
    f.write(str(scale_pos_weight))

pd.DataFrame({"feature": top_features}).to_csv(top_features_path, index=False)

# Everything a later notebook needs to reproduce the transformations:
# column roles, encoders, imputer/scaler parameters and leakage findings.
preprocessor = {
    "target_col": target_col,
    "user_col_removed": user_col,
    "time_col_used_for_split": time_col,
    "categorical_cols_original": categorical_cols,
    "numeric_cols": numeric_cols,
    "high_card_cols": high_card_cols,
    "low_card_cols": low_card_cols,
    "high_card_mappings": high_card_mappings,
    "low_card_mappings": low_card_mappings,
    "global_mean": global_mean,
    "scale_pos_weight": scale_pos_weight,
    "top_features": top_features,
    "leakage_dropped_features": leak_all,
    "leakage_report_path": leak_report_path,
    "imputer_statistics": imputer.statistics_.tolist(),
    "scaler_mean": scaler.mean_.tolist(),
    "scaler_scale": scaler.scale_.tolist(),
}
dump(preprocessor, preprocessor_path)

print("\nSaved:")
for saved_path in (
    X_train_path,
    X_test_path,
    y_train_path,
    y_test_path,
    top_features_path,
    scale_pos_weight_path,
    preprocessor_path,
):
    print(" -", saved_path)
print("\n[Preprocessing done with leakage guard.]")


Loaded fused: (50000, 29)
target: label | user: user | time: time_stamp
Dropped null rows: 0
Working df: (50000, 29) pos_rate: 0.05134
Removed by name blacklist: ['nonclk', 'clk', 'time_stamp_str']
Temporal split cutoff: 1494682302
train: (40000, 26) pos: 0.051525
test : (10000, 26) pos: 0.0506
X_train raw: (40000, 23) X_test raw: (10000, 23)
categorical: 14 numeric: 9
scale_pos_weight: 18.40805434255216
Encoded shapes: (40000, 29) (10000, 29)
Saved leakage report: d:\projects\Ai\project_fusion_ecu\data\processed\leakage_report.csv

[LEAKAGE GUARD] No obvious leaked features found by exact/AUC scans.
After leakage guard: (40000, 29) (10000, 29)
Saved importances: d:\projects\Ai\project_fusion_ecu\data\processed\feature_importances.csv

Saved:
 - d:\projects\Ai\project_fusion_ecu\data\processed\X_train_processed.npz
 - d:\projects\Ai\project_fusion_ecu\data\processed\X_test_processed.npz
 - d:\projects\Ai\project_fusion_ecu\data\processed\y_train.csv
 - d:\projects\Ai\project_fusion_ecu

### Modifications Summary
This notebook has been updated to remove SMOTE-based oversampling and One-Hot encoding.
Instead, class imbalance is handled by computing `scale_pos_weight` (negative/positive ratio) on the training data.
Categorical features are encoded using **Target Encoding** for high-cardinality columns and **Label Encoding** for low-cardinality columns.
Numeric features are imputed with the median and standardised.
Feature importances are computed with an XGBoost model using the calculated `scale_pos_weight`; all features are kept and saved in descending order of importance (no top‑N cut‑off is applied).
The processed datasets, label files, selected features, feature importances, scale position weight, and a preprocessor mapping are saved to the `data/processed` directory for subsequent notebooks.
