# 04 – Data Preprocessing (PySpark ML)

This notebook prepares the fused data for machine learning.  We encode categorical features, scale numerical features, and split the data into training and testing sets.  We also handle class imbalance by oversampling the minority class using the `imbalanced-learn` library after converting to Pandas.  Our goal is to create a balanced and well‑structured dataset for model training.


In [4]:
# ============================================
# 03_preprocessing.ipynb / preprocessing.py
# Full Preprocessing Code (Temporal split + Safe encoding + No OHE)
# ============================================

# ============================================
# 0. Imports and configuration
# ============================================

import os
import re
import json
import numpy as np
import pandas as pd

from pathlib import Path
from scipy import sparse
from joblib import dump

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier

# ============================================
# 1. Paths + Load fused data
# ============================================

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "data", "processed")
Path(processed_dir).mkdir(parents=True, exist_ok=True)

print("Project root:", project_root)
print("Processed dir:", processed_dir)

fused_path = os.path.join(processed_dir, "fused_data.csv")
if not os.path.exists(fused_path):
    raise FileNotFoundError(
        f"Fused data file not found at: {fused_path}\n"
        "Run 03_eda_fusion.ipynb first to generate fused_data.csv."
    )

df = pd.read_csv(fused_path)
print("Full fused data shape:", df.shape)

# ============================================
# 2. Target + user + time columns
# ============================================

# Target
if "label" in df.columns:
    target_col = "label"
elif "clk" in df.columns:
    target_col = "clk"
else:
    raise ValueError('No target column found. Expected "label" or "clk".')

# User id
if "user" in df.columns:
    user_col = "user"
elif "userid" in df.columns:
    user_col = "userid"
elif "nick" in df.columns:
    user_col = "nick"
else:
    raise ValueError('No user ID column found. Expected "user", "userid" or "nick".')

# Time column (for temporal split)
time_col = None
for cand in ["time_stamp", "timestamp", "date_time", "datetime", "time"]:
    if cand in df.columns:
        time_col = cand
        break

print("Target:", target_col)
print("User ID:", user_col)
print("Time col:", time_col)

# Drop rows with missing target/user/time (time required for temporal split)
drop_cols = [user_col, target_col] + ([time_col] if time_col else [])
before = len(df)
df = df.dropna(subset=drop_cols)
print(f"Dropped rows with null user/target/time: {before - len(df)}")
print("After dropna shape:", df.shape)

# Ensure target is int 0/1
df[target_col] = df[target_col].astype(int)

# ============================================
# 3. Optional: keep only most recent N rows (temporal tail)
# ============================================

MAX_ROWS = 200_000  # you used this in output (last 200k rows)
if time_col is not None:
    # Sort by time ascending then take last MAX_ROWS
    df = df.sort_values(time_col).reset_index(drop=True)
    if len(df) > MAX_ROWS:
        df = df.iloc[-MAX_ROWS:].copy()
        print(f"Temporal sampled to last {MAX_ROWS} rows.")
else:
    # If no time col, random sample
    if len(df) > MAX_ROWS:
        df = df.sample(n=MAX_ROWS, random_state=42).copy()
        print(f"Random sampled to {MAX_ROWS} rows (no time col).")

print("Working dataframe shape:", df.shape)

# ============================================
# 4. Remove leakage columns (STRICT)
# ============================================

leakage_cols = [
    "clk",
    "nonclk",
    "noclk",
    "impressions",
    "time_stamp_str",
    # NOTE: time_col is used only for split then removed from X
]
removed_cols = []
for c in leakage_cols:
    if c in df.columns and c != target_col:
        df.drop(columns=[c], inplace=True)
        removed_cols.append(c)

# If you created "impressions" in EDA (clk+nonclk), remove it (already in list)
if removed_cols:
    print("Removed leakage columns:", removed_cols)

print("Columns after leakage removal (before split):", len(df.columns))

# ============================================
# 5. Temporal split 80/20 (or stratified if no time_col)
# ============================================

if time_col is not None:
    df = df.sort_values(time_col).reset_index(drop=True)
    split_idx = int(len(df) * 0.8)
    train_df = df.iloc[:split_idx].copy()
    test_df  = df.iloc[split_idx:].copy()

    cutoff = df.iloc[split_idx][time_col]
    print("\nTemporal split 80/20:")
    print(f"cutoff {time_col} = {cutoff}")
else:
    # fallback
    from sklearn.model_selection import train_test_split
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df[target_col],
        random_state=42,
    )
    print("\nStratified split 80/20 (no time column).")

print("train_df shape:", train_df.shape)
print("test_df shape :", test_df.shape)

print("\nPositive rate:")
print("train pos_rate:", train_df[target_col].mean())
print("test  pos_rate:", test_df[target_col].mean())

# ============================================
# 6. Build X/y (drop target + keep time only for later optional feature engineering)
# ============================================

def drop_id_like_cols(X: pd.DataFrame) -> pd.DataFrame:
    """
    Drop ID-like and key columns that can cause leakage or meaningless ordering.
    This is stricter and safer than manual lists.
    """
    patterns = [
        r"_id$",          # any *_id
        r"^id$",
        r"^user$",
        r"^userid$",
        r"^nick$",
        r"^adgroup_id$",
        r"^campaign_id$",
        r"^customer$",
        r"^pid$",
    ]
    drop_cols = []
    for c in X.columns:
        c_low = c.lower().strip()
        if any(re.search(p, c_low) for p in patterns):
            drop_cols.append(c)
    return X.drop(columns=drop_cols, errors="ignore")

def build_xy(sub_df: pd.DataFrame, target: str, time_col: str | None) -> tuple[pd.DataFrame, pd.Series]:
    y = sub_df[target].astype(int)
    X = sub_df.drop(columns=[target], errors="ignore")

    # Drop time column from features (used only for split)
    if time_col is not None and time_col in X.columns:
        X = X.drop(columns=[time_col], errors="ignore")

    # Drop ID-like cols
    X = drop_id_like_cols(X)

    return X, y

X_train, y_train = build_xy(train_df, target_col, time_col)
X_test,  y_test  = build_xy(test_df,  target_col, time_col)

print("\nX_train shape (raw):", X_train.shape)
print("X_test shape  (raw):", X_test.shape)
print("y_train:", y_train.shape, "pos_rate:", y_train.mean())
print("y_test :", y_test.shape,  "pos_rate:", y_test.mean())

# ============================================
# 7. Feature typing (IMPORTANT): treat "numeric-looking categories"
# ============================================

# Heuristic: treat these known code columns as categorical even if numeric dtype
force_categorical = set([
    "cms_segid", "cms_group_id", "final_gender_code", "age_level",
    "shopping_level", "occupation", "cate_id", "brand", "pid",
    "pvalue_level", "new_user_class_level"
])

# Make sure forced categorical exist
force_categorical = {c for c in force_categorical if c in X_train.columns}

# Identify categorical: objects + forced categorical
categorical_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)
for c in force_categorical:
    if c not in categorical_cols:
        categorical_cols.append(c)

# Numeric = everything else
numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

print("\nNumeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

# ============================================
# 8. Add missing indicators + log1p for behavior counts
# ============================================

# Missing indicators for key features (based on your fusion enrichment rates)
key_missing_cols = [c for c in ["price", "age_level", "final_gender_code", "shopping_level", "pvalue_level", "cms_segid"] if c in X_train.columns]
for c in key_missing_cols:
    X_train[f"is_missing_{c}"] = X_train[c].isna().astype(int)
    X_test[f"is_missing_{c}"]  = X_test[c].isna().astype(int)

# Update numeric/categorical lists (indicators are numeric)
for c in [f"is_missing_{x}" for x in key_missing_cols]:
    if c in X_train.columns and c not in numeric_cols:
        numeric_cols.append(c)

# Log1p for behavior counts (these should be numeric)
for c in ["pv", "cart", "fav", "buy"]:
    if c in X_train.columns:
        X_train[c] = pd.to_numeric(X_train[c], errors="coerce")
        X_test[c]  = pd.to_numeric(X_test[c], errors="coerce")
        X_train[c] = np.log1p(X_train[c])
        X_test[c]  = np.log1p(X_test[c])

# ============================================
# 9. Compute scale_pos_weight (neg/pos) on TRAIN ONLY
# ============================================

pos_count = int(y_train.sum())
neg_count = int(len(y_train) - pos_count)
scale_pos_weight = (neg_count / pos_count) if pos_count > 0 else 1.0
print(f"\nComputed scale_pos_weight: {scale_pos_weight:.4f}")

# ============================================
# 10. Categorical encoding (NO OHE)
#     - High-cardinality: KFold target encoding (train only, leak-safe)
#     - Low-cardinality: label encoding
# ============================================

HIGH_CARD_THRESHOLD = 10
SMOOTHING = 20
N_SPLITS_TE = 5
RANDOM_STATE = 42

global_mean = float(y_train.mean())

X_train_enc = X_train.copy()
X_test_enc  = X_test.copy()

# Ensure categorical columns are treated as strings for mapping stability
for c in categorical_cols:
    X_train_enc[c] = X_train_enc[c].astype("string")
    X_test_enc[c]  = X_test_enc[c].astype("string")

high_card_cols = []
low_card_cols  = []
for c in categorical_cols:
    nunq = X_train_enc[c].nunique(dropna=True)
    if nunq > HIGH_CARD_THRESHOLD:
        high_card_cols.append(c)
    else:
        low_card_cols.append(c)

# --- Low-card: label encoding ---
low_card_mappings = {}
for c in low_card_cols:
    cats = X_train_enc[c].dropna().unique().tolist()
    # stable ordering
    cats = sorted([str(x) for x in cats])
    mapping = {cat: i for i, cat in enumerate(cats)}
    low_card_mappings[c] = mapping

    X_train_enc[c] = X_train_enc[c].map(mapping).fillna(-1).astype(int)
    X_test_enc[c]  = X_test_enc[c].map(mapping).fillna(-1).astype(int)

# --- High-card: KFold target encoding (leak-safe) ---
high_card_mappings = {}

if len(high_card_cols) > 0:
    skf = StratifiedKFold(n_splits=N_SPLITS_TE, shuffle=True, random_state=RANDOM_STATE)

    for c in high_card_cols:
        # out-of-fold encoding for training
        oof = pd.Series(index=X_train_enc.index, dtype=float)

        for tr_idx, val_idx in skf.split(X_train_enc, y_train):
            tr_x = X_train_enc.iloc[tr_idx]
            tr_y = y_train.iloc[tr_idx]

            stats = (
                pd.DataFrame({c: tr_x[c], "target": tr_y.values})
                .groupby(c)["target"]
                .agg(["count", "mean"])
            )

            # smoothing
            enc_map = {}
            for cat, row in stats.iterrows():
                cnt = row["count"]
                mu  = row["mean"]
                enc_val = (cnt * mu + SMOOTHING * global_mean) / (cnt + SMOOTHING)
                enc_map[str(cat)] = float(enc_val)

            val_x = X_train_enc.iloc[val_idx]
            oof.iloc[val_idx] = val_x[c].map(enc_map).astype(float)

        # Fill unseen/null with global mean
        X_train_enc[c] = oof.fillna(global_mean).astype(float)

        # Fit final mapping on full training for test transform
        stats_full = (
            pd.DataFrame({c: X_train_enc.index.map(lambda i: X_train.loc[i, c] if c in X_train.columns else None),
                          "target": y_train.values})
        )
        # Rebuild using original string categories from X_train (not oof)
        stats_full = pd.DataFrame({c: X_train[c].astype("string"), "target": y_train.values}).groupby(c)["target"].agg(["count", "mean"])

        final_map = {}
        for cat, row in stats_full.iterrows():
            cnt = row["count"]
            mu  = row["mean"]
            enc_val = (cnt * mu + SMOOTHING * global_mean) / (cnt + SMOOTHING)
            final_map[str(cat)] = float(enc_val)

        high_card_mappings[c] = {
            "mapping": final_map,
            "global_mean": global_mean,
            "smoothing": SMOOTHING,
            "n_splits": N_SPLITS_TE,
        }

        X_test_enc[c] = X_test[c].astype("string").map(final_map).fillna(global_mean).astype(float)

# After encoding, all categorical cols become numeric, so add them to numeric_cols if needed
for c in categorical_cols:
    if c in X_train_enc.columns and c not in numeric_cols:
        numeric_cols.append(c)

# ============================================
# 11. Numeric imputation + scaling (fix dtype warnings)
# ============================================

# Ensure numeric columns exist in encoded DF (some might be missing due to drops)
numeric_cols = [c for c in numeric_cols if c in X_train_enc.columns]

# Coerce to numeric safely
for c in numeric_cols:
    X_train_enc[c] = pd.to_numeric(X_train_enc[c], errors="coerce")
    X_test_enc[c]  = pd.to_numeric(X_test_enc[c], errors="coerce")

numeric_imputer = SimpleImputer(strategy="median")
numeric_scaler  = StandardScaler()

X_train_num = numeric_imputer.fit_transform(X_train_enc[numeric_cols].astype(np.float64))
X_test_num  = numeric_imputer.transform(X_test_enc[numeric_cols].astype(np.float64))

X_train_num = numeric_scaler.fit_transform(X_train_num)
X_test_num  = numeric_scaler.transform(X_test_num)

# Assign back as float (avoid FutureWarning)
X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
X_test_enc.loc[:, numeric_cols]  = X_test_num.astype(np.float64)

# Final processed matrices (keep column order stable)
X_train_processed = X_train_enc.copy()
X_test_processed  = X_test_enc.copy()

# Safety: ensure same columns
missing_in_test = [c for c in X_train_processed.columns if c not in X_test_processed.columns]
missing_in_train = [c for c in X_test_processed.columns if c not in X_train_processed.columns]
if missing_in_test or missing_in_train:
    raise ValueError(
        f"Train/Test columns mismatch.\n"
        f"Missing in test: {missing_in_test}\n"
        f"Missing in train: {missing_in_train}\n"
    )

print("\nProcessed X_train shape:", X_train_processed.shape)
print("Processed X_test shape :", X_test_processed.shape)

# ============================================
# 12. Feature importance (XGBoost) + optional top-K selection
# ============================================

print("\nFitting XGBoost for feature importance...")
importance_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)
importance_model.fit(X_train_processed, y_train)

importances   = importance_model.feature_importances_
feature_names = X_train_processed.columns.tolist()

importance_df = pd.DataFrame({"feature": feature_names, "importance": importances}).sort_values("importance", ascending=False)
importance_csv_path = os.path.join(processed_dir, "feature_importances.csv")
importance_df.to_csv(importance_csv_path, index=False)
print(f"Saved feature importances to: {importance_csv_path}")

# Select top-K only if many features
TOP_K = 50
if len(feature_names) <= TOP_K:
    top_features = feature_names[:]   # keep all
else:
    top_features = importance_df["feature"].iloc[:TOP_K].tolist()

print(f"Selected Top-K features: {len(top_features)}")

X_train_final = X_train_processed[top_features]
X_test_final  = X_test_processed[top_features]

# ============================================
# 13. Save processed outputs
# ============================================

X_train_path = os.path.join(processed_dir, "X_train_processed.npz")
X_test_path  = os.path.join(processed_dir, "X_test_processed.npz")
y_train_path = os.path.join(processed_dir, "y_train.csv")
y_test_path  = os.path.join(processed_dir, "y_test.csv")
top_features_path = os.path.join(processed_dir, "top_features.csv")
scale_pos_weight_path = os.path.join(processed_dir, "scale_pos_weight.txt")
preprocessor_path = os.path.join(processed_dir, "preprocessor.joblib")

# Save sparse matrices
sparse.save_npz(X_train_path, sparse.csr_matrix(X_train_final.values))
sparse.save_npz(X_test_path,  sparse.csr_matrix(X_test_final.values))

# Save labels
pd.Series(y_train, name=target_col).to_csv(y_train_path, index=False)
pd.Series(y_test, name=target_col).to_csv(y_test_path, index=False)

# Save scale_pos_weight
with open(scale_pos_weight_path, "w") as f:
    f.write(str(scale_pos_weight))

# Save top features list
pd.DataFrame({"feature": top_features}).to_csv(top_features_path, index=False)

# Save preprocessor (for inference reproducibility)
preprocessor = {
    "target_col": target_col,
    "user_col": user_col,
    "time_col_used_for_split": time_col,

    "numeric_cols": numeric_cols,
    "categorical_cols_original": categorical_cols,

    "missing_indicator_base_cols": key_missing_cols,
    "log1p_cols": [c for c in ["pv", "cart", "fav", "buy"] if c in X_train.columns],

    "numeric_imputer_strategy": "median",
    "numeric_imputer_statistics": numeric_imputer.statistics_.tolist(),

    "numeric_scaler_mean": numeric_scaler.mean_.tolist(),
    "numeric_scaler_scale": numeric_scaler.scale_.tolist(),

    "high_card_cols": high_card_cols,
    "high_card_mappings": high_card_mappings,  # contains mapping + smoothing + global mean

    "low_card_cols": low_card_cols,
    "low_card_mappings": low_card_mappings,

    "global_mean": global_mean,
    "scale_pos_weight": scale_pos_weight,
    "top_features": top_features,
}

dump(preprocessor, preprocessor_path)

print("\nSaved processed files:")
print(" -", X_train_path)
print(" -", X_test_path)
print(" -", y_train_path)
print(" -", y_test_path)
print(" -", importance_csv_path)
print(" -", top_features_path)
print(" -", scale_pos_weight_path)
print(" -", preprocessor_path)

print("\n[Preprocessing completed successfully — جاهز لملف التدريب.]")


Project root: d:\projects\Ai\project_fusion_ecu
Processed dir: d:\projects\Ai\project_fusion_ecu\data\processed
Full fused data shape: (500000, 25)
Target: label
User ID: user
Time col: time_stamp
Dropped rows with null user/target/time: 0
After dropna shape: (500000, 25)
Temporal sampled to last 200000 rows.
Working dataframe shape: (200000, 25)
Removed leakage columns: ['clk', 'nonclk', 'time_stamp_str']
Columns after leakage removal (before split): 22

Temporal split 80/20:
cutoff time_stamp = 1494648979
train_df shape: (160000, 22)
test_df shape : (40000, 22)

Positive rate:
train pos_rate: 0.04765625
test  pos_rate: 0.0457

X_train shape (raw): (160000, 13)
X_test shape  (raw): (40000, 13)
y_train: (160000,) pos_rate: 0.04765625
y_test : (40000,) pos_rate: 0.0457

Numeric columns: 5
Categorical columns: 8

Computed scale_pos_weight: 19.9836


  1.42088877]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  1.05498129]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
 -0.15102511]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train_enc.loc[:, numeric_cols] = X_train_num.astype(np.float64)
  0.35084604]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train_enc.loc[:, numeric_cols] = X_train_num


Processed X_train shape: (160000, 19)
Processed X_test shape : (40000, 19)

Fitting XGBoost for feature importance...
Saved feature importances to: d:\projects\Ai\project_fusion_ecu\data\processed\feature_importances.csv
Selected Top-K features: 19

Saved processed files:
 - d:\projects\Ai\project_fusion_ecu\data\processed\X_train_processed.npz
 - d:\projects\Ai\project_fusion_ecu\data\processed\X_test_processed.npz
 - d:\projects\Ai\project_fusion_ecu\data\processed\y_train.csv
 - d:\projects\Ai\project_fusion_ecu\data\processed\y_test.csv
 - d:\projects\Ai\project_fusion_ecu\data\processed\feature_importances.csv
 - d:\projects\Ai\project_fusion_ecu\data\processed\top_features.csv
 - d:\projects\Ai\project_fusion_ecu\data\processed\scale_pos_weight.txt
 - d:\projects\Ai\project_fusion_ecu\data\processed\preprocessor.joblib

[Preprocessing completed successfully — جاهز لملف التدريب.]


### Modifications Summary
This notebook has been updated to remove SMOTE-based oversampling and One-Hot encoding.
Instead, class imbalance is handled by computing `scale_pos_weight` (negative/positive ratio) on the training data.
Categorical features are encoded using **Target Encoding** for high-cardinality columns and **Label Encoding** for low-cardinality columns.
Numeric features are imputed with the median and standardised.
Feature importances are computed with an XGBoost model using the calculated `scale_pos_weight` and the top 50 features are selected.
The processed datasets, label files, selected features, feature importances, scale position weight, and a preprocessor mapping are saved to the `data/processed` directory for subsequent notebooks.
