In [1]:
from functions import data_pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the train/test splits through the project's data pipeline, requesting
# the 'XGB' variant.
# NOTE(review): presumably returns two DataFrames that include the "label"
# target column — confirm in functions.data_pipeline.
# NOTE(review): these XGB-prepared splits are later reused for the KNN run as
# well — confirm that is intended.
train, test = data_pipeline(model_name='XGB')

## Model Pipeline

### Functions

In [None]:
def get_n_components(X_train, method="avg", variance_threshold=0.95):
    """Choose how many PCA components to keep for ``X_train``.

    A full PCA is fitted on ``X_train`` and the explained-variance ratio of
    each component drives one of three heuristics.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features, passed unchanged to ``PCA.fit``.
    method : str, default "avg"
        * ``"avg"``        - count the components whose ratio is above the
          mean ratio ``1 / n_components`` (never fewer than one).
        * ``"elbow"``      - keep the components up to the largest drop
          between two consecutive ratios.
        * ``"cumulative"`` - smallest number of components whose cumulative
          ratio reaches ``variance_threshold``.
        * anything else    - no PCA; used as the baseline for comparison.
    variance_threshold : float, default 0.95
        Cumulative explained-variance target for ``method="cumulative"``.

    Returns
    -------
    int or None
        Number of components to keep, or ``None`` for the no-PCA baseline.
    """
    # Baseline: comparison with a model without PCA, so no fit is needed.
    if method not in ("avg", "elbow", "cumulative"):
        return None

    # Imported here so the helper is self-contained: this cell sits above the
    # notebook's import cell and PCA is not imported anywhere else.
    import numpy as np
    from sklearn.decomposition import PCA

    pca = PCA()
    pca.fit(X_train)
    explained_var_ratio = np.asarray(pca.explained_variance_ratio_)
    n_total = len(explained_var_ratio)

    if method == "avg":
        avg_var = 1 / n_total
        # All-equal ratios would give 0, which is not a usable component count.
        return max(1, int(np.sum(explained_var_ratio > avg_var)))

    if method == "elbow":
        # With a single component there is no consecutive pair to compare.
        if n_total < 2:
            return n_total
        drops = -np.diff(explained_var_ratio)
        return int(np.argmax(drops)) + 1

    # method == "cumulative"
    reached = np.cumsum(explained_var_ratio) >= variance_threshold
    # argmax of an all-False mask is 0, so handle "never reached" explicitly.
    if not reached.any():
        return n_total
    return int(np.argmax(reached)) + 1
    
# Component-selection strategies to compare. "default" matches none of the
# named branches in get_n_components, which then returns None (no PCA baseline).
methods = ["avg", "elbow", "cumulative", "default"]

### Transformations & Scaling
- QQ-plot analysis showed that some numerical features need a transformation, and that different features need different scalers:
> Highly skewed & outlier-heavy: `risk_score`, `months_employed` → `YeoJohnsonTransformer()` or `np.log1p()` → `RobustScaler`

> Already normal / log-transformed: `amt_income_total_log`, `age` → `StandardScaler`

> Discrete / ordinal numeric: `cnt_children`, `cnt_fam_members` → keep as is, or encode as ordinal integers

### Encoding
* Categorical features: a label encoder assigns integer codes, which implies an ordering between categories. Tree-based models such as XGBoost tolerate this, but SVC and KNN treat the codes as magnitudes, so each model needs an encoding method that suits it:

| Feature type                  | XGBoost                     | SVC          | KNN                       |
| ----------------------------- | --------------------------- | ------------ | ------------------------- |
| Binary                        | 0/1 mapping                 | 0/1 mapping  | 0/1 mapping               |
| Low-cardinality (<5)          | One-hot or label encoding   | One-hot only | One-hot                   |
| Medium/high-cardinality (~17) | Frequency or label encoding | One-hot only | One-hot / binary encoding |
| Numeric                       | Raw                         | Standardized | Standardized              |

## Dropping of correlated features
| Feature type     | XGBoost / Tree                      | SVC / KNN / Linear                |
| ---------------- | ----------------------------------- | --------------------------------- |
| Discrete numeric | keep numeric                        | Better as categorical / one-hot   |
| Binned/ordinal   | Optional (tree can handle either)   | Use one-hot encoding              |

## Feature Selection



In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
    PowerTransformer, RobustScaler, StandardScaler,
    OneHotEncoder, OrdinalEncoder, FunctionTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE, SelectFromModel

# ------------------------------------------------------------
# Column Dictionary
# ------------------------------------------------------------

# Numeric feature columns that build_transformer() routes through StandardScaler.
numeric_cols = ["age", "cnt_children", "amt_income_total_log", "risk_score", "months_employed"]


# ------------------------------------------------------------
# 1. Build Transformer
# ------------------------------------------------------------
def build_transformer():
    """Create the (unfitted) column preprocessor.

    Columns named in ``numeric_cols`` are standardised with
    ``StandardScaler``; every other column is passed through untouched.

    Returns
    -------
    ColumnTransformer
    """
    numeric_step = ("num", StandardScaler(), numeric_cols)
    return ColumnTransformer([numeric_step], remainder='passthrough')
# ------------------------------------------------------------
# 2. Drop Correlated Features
# ------------------------------------------------------------
# Columns removed as correlated/redundant before modelling.
# The commented-out lines around the list are a disabled draft of a per-model
# variant in which SVC/KNN would additionally drop the count features.
# def drop_correlated_features(model_name, col_dic=column_dic):
drop_cols = ["days_birth", "amt_income_total", "years_employed", "flag_mobil", "code_gender", "flag_own_realty", "flag_own_car", "cnt_fam_members"]
#     if model_name in ["SVC", "KNN"]:
#         drop_cols.extend(["cnt_children", "cnt_fam_members"])
#         drop_cols.extend
#     return drop_cols

# ------------------------------------------------------------
# 3. Feature Selection Model
# ------------------------------------------------------------
def build_feature_selector(model_name):
    """Return an unfitted feature-selection step matched to ``model_name``.

    Parameters
    ----------
    model_name : str
        ``"SVC"`` - recursive feature elimination around a linear SVC.
        ``"XGB"`` - keep features whose XGBoost importance is at or above
        the median importance.
        ``"KNN"`` - keep the 10 features with the highest mutual information
        with the target.

    Returns
    -------
    A scikit-learn selector (``RFE``, ``SelectFromModel`` or ``SelectKBest``).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names (same contract
        as ``build_model``), instead of silently returning ``None``.
    """
    if model_name == "SVC":
        # A linear SVC exposes coef_, not feature_importances_; 'auto' lets
        # RFE pick whichever attribute the estimator actually provides.
        return RFE(SVC(kernel='linear'), n_features_to_select=None, step=0.2, importance_getter='auto')
    if model_name == "XGB":
        return SelectFromModel(XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42), threshold='median')
    if model_name == "KNN":
        from functools import partial

        # mutual_info_classif is randomised; pin its seed so the same
        # features are selected on every run.
        seeded_mutual_info = partial(mutual_info_classif, random_state=42)
        return SelectKBest(score_func=seeded_mutual_info, k=10)
    raise ValueError("Unsupported model name")
# ------------------------------------------------------------
# 4. Build Model
# ------------------------------------------------------------
def build_model(model_name):
    """Create the classifier for ``model_name`` together with a display label.

    Returns
    -------
    tuple of (str, estimator)
        Human-readable model label and a fresh, unfitted estimator.

    Raises
    ------
    ValueError
        When ``model_name`` is not "SVC", "XGB" or "KNN".
    """
    if model_name == "SVC":
        label = "SVM (Linear)"
        estimator = SVC(kernel='linear', random_state=42)
        return label, estimator

    if model_name == "XGB":
        label = "XGB Classifier"
        estimator = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
        return label, estimator

    if model_name == "KNN":
        label = "KNN"
        estimator = KNeighborsClassifier()
        return label, estimator

    raise ValueError("Unsupported model name")
    


# ------------------------------------------------------------
# 5. Model Training Pipeline
# ------------------------------------------------------------
def model_pipeline(model_name, train_df, test_df, target_col="label", random_state=42):
    """Cross-validate one model on the training data and report averaged metrics.

    The columns listed in the module-level ``drop_cols`` are removed from both
    frames, the target is split off, and a feature-selector + classifier
    pipeline is evaluated with 5-fold stratified cross-validation on the
    training data. The test split is only prepared and returned; it is not
    scored here.

    Parameters
    ----------
    model_name : str
        Key understood by ``build_model`` / ``build_feature_selector``.
    train_df, test_df : pandas.DataFrame
        Frames that contain ``target_col`` plus the feature columns.
    target_col : str, default "label"
        Name of the target column.
    random_state : int, default 42
        Seed for the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    tuple
        ``(results, X_train_full, y_train_full, X_test, y_test)`` where
        ``results`` is a dict with the keys ``model``, ``accuracy``,
        ``f1_score`` and ``roc_auc`` (means over the folds).

    Raises
    ------
    ValueError
        Propagated from ``build_model`` for an unsupported ``model_name``.
    """
    # Drop correlated columns; errors='ignore' tolerates frames that lack some.
    train_df = train_df.drop(columns=drop_cols, errors='ignore')
    test_df = test_df.drop(columns=drop_cols, errors='ignore')

    # Split features and target
    X_train_full = train_df.drop(columns=[target_col])
    y_train_full = train_df[target_col]
    X_test = test_df.drop(columns=[target_col])
    y_test = test_df[target_col]

    # Resolve the display name up front; this also rejects unknown model
    # names before any training starts.
    name, _ = build_model(model_name)
    print(f"\nTraining model: {name} using StratifiedKFold...")

    # Stratified K-Fold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    acc_scores, f1_scores, roc_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_full, y_train_full), 1):
        X_train, X_val = X_train_full.iloc[train_idx], X_train_full.iloc[val_idx]
        y_train, y_val = y_train_full.iloc[train_idx], y_train_full.iloc[val_idx]

        # Fresh estimator per fold so no fitted state is shared between folds.
        _, fold_model = build_model(model_name)
        # No preprocessing step is wired in; build_transformer() is not used here.
        pipeline = Pipeline([
            ("feature_selector", build_feature_selector(model_name)),
            ("classifier", fold_model)
        ])
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_val)
        # An SVC built without probability=True has no predict_proba; its
        # decision_function scores are equally valid input for ROC-AUC.
        if hasattr(pipeline, "predict_proba"):
            y_score = pipeline.predict_proba(X_val)[:, 1]
        else:
            y_score = pipeline.decision_function(X_val)

        acc_scores.append(accuracy_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))
        roc_scores.append(roc_auc_score(y_val, y_score))

        print(f"Fold {fold}: Accuracy={acc_scores[-1]:.3f}, F1={f1_scores[-1]:.3f}, ROC-AUC={roc_scores[-1]:.3f}")

    results = {
        "model": name,
        "accuracy": np.mean(acc_scores),
        "f1_score": np.mean(f1_scores),
        "roc_auc": np.nanmean(roc_scores)
    }

    print(f"\nFinished training {name} across all folds.")
    print(f"Average Accuracy: {results['accuracy']:.3f}, F1: {results['f1_score']:.3f}, ROC-AUC: {results['roc_auc']:.3f}")

    return results, X_train_full, y_train_full, X_test, y_test


In [None]:
# Cross-validate every candidate model on the same train/test splits.
models = ["XGB", "KNN"]
results_by_model = {}
for model in models:
    results_df, X_train, y_train, X_test, y_test = model_pipeline(
        model_name=model, train_df=train, test_df=test, target_col="label", random_state=42
    )
    # results_df is rebound on every pass, so keep each model's metrics;
    # otherwise only the last model's results would survive the loop.
    results_by_model[model] = results_df

# One row of averaged CV metrics per model, for side-by-side comparison.
results_summary = pd.DataFrame(list(results_by_model.values()))
results_summary

NameError: name 'model_pipeline' is not defined