In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline  
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import ClusterCentroids
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, learning_curve
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_curve, roc_auc_score, precision_recall_curve,
    average_precision_score, f1_score, auc
)
from sklearn.model_selection import StratifiedKFold
# from functions_ver2 import data_pipeline as data_pipeline_v2
from functions import data_pipeline


## Baseline model using label encoding and no hyperparameter tuning

## Using one-hot encoding 

In [2]:
# Build the train/test features and labels with the project-local data_pipeline,
# requesting one-hot encoding ('onehot') for the categorical columns.
# NOTE(review): the `_std` suffix suggests the features come back already
# standardized — confirm against functions.data_pipeline.
X_train_std, y_train, X_test_std, y_test = data_pipeline('onehot')

Loading data
Splitting data
Total unique accounts: 45985. Starting to find cutoff point
Cutoff month where CDF reaches 80%: -10

=== Split based on CDF 80% cutoff ===
Cutoff month: -10 (10 months ago)
Old accounts (≤ month -10): 37,210 (80.9%)
New accounts (> month -10): 8,775 (19.1%)
Ratio (old/new): 4.2405
Splitting raw credit records
Cleaning old accounts credit records - [Length: 996586]


  final_df = df.groupby(['id', 'origination_month']).apply(lambda x: pd.Series({


Cleaning new accounts credit records - [Length: 51989]


  final_df = df.groupby(['id', 'origination_month']).apply(lambda x: pd.Series({
  df_dropped=df_sorted.groupby('id', group_keys=False).apply(keep_row)
  df_dropped=df_sorted.groupby('id', group_keys=False).apply(keep_row)


Cleaning credit data completed
Splitting application dataset
Cleaning old accounts application records - [Length: (29264, 18)]
Cleaning new accounts appplication records, - [Length: (7193, 18)]
Encoding
Encoders: {'name_income_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'name_education_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'name_family_status': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'name_housing_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'occupation_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False)}
Encoding type: onehot
Merging data
Engineering target variable to label data
Completed old accounts labelling
Completed new accounts labelling
Old accounts: (37210, 3)
New accounts: (8775, 3)
Old threshold: 0.20232732732732733
New threshold: 0.5786182336182336
Merging cleaned application and credit records
Train shape: (29264, 63)
Test shape: (7193, 63)
<class 'pandas.core.

In [10]:
def get_n_components(X_train, method="default"):
    """Pick the number of principal components to keep for ``X_train``.

    Parameters
    ----------
    X_train : array-like
        Training features handed to ``PCA().fit``.
    method : str, default "default"
        Selection rule:

        * ``"avg"`` - number of components whose explained-variance ratio is
          above the average ratio (``1 / number_of_components``).
        * ``"elbow"`` - position just after the largest drop between two
          consecutive explained-variance ratios.
        * ``"cumulative"`` - smallest number of components whose cumulative
          explained-variance ratio reaches 0.95.
        * any other value (including ``"default"``) - no selection is made.

    Returns
    -------
    int or None
        The chosen component count (always at least 1) as a plain ``int``,
        or ``None`` when ``method`` is not one of the rules above.
    """
    # No rule requested: skip the (potentially expensive) PCA fit entirely.
    if method not in ("avg", "elbow", "cumulative"):
        return None

    ratios = PCA().fit(X_train).explained_variance_ratio_

    if method == "avg":
        above_average = int(np.sum(ratios > 1 / len(ratios)))
        # Never ask for zero components (possible when no ratio is strictly
        # above the average, e.g. a single component).
        return max(1, above_average)

    if method == "elbow":
        # With fewer than two components there is no drop to locate.
        if len(ratios) < 2:
            return 1
        drops = -np.diff(ratios)
        return int(np.argmax(drops) + 1)

    # method == "cumulative": argmax returns the first index where the
    # running total reaches the 95% threshold.
    cumulative = np.cumsum(ratios)
    return int(np.argmax(cumulative >= 0.95) + 1)


def build_knn_pipeline(pca_components=None, pca_method='default', sampling_method='smote', random_state=42):
    """Assemble an (optional resampler) -> whitened PCA -> KNN pipeline.

    Parameters
    ----------
    pca_components : int or None
        Passed to ``PCA(n_components=...)`` unless ``pca_method`` is
        ``'default'`` or this value is ``None``; in those cases PCA is built
        without an explicit ``n_components``.
    pca_method : str
        Only the value ``'default'`` is inspected here (see above).
    sampling_method : str or None
        Case-insensitive key: ``'smote'``, ``'smotetomek'`` or ``'cc'``
        (ClusterCentroids). A falsy value means no resampling step.
        An unrecognised key raises ``KeyError``.
    random_state : int
        Seed forwarded to the resampler and to PCA.

    Returns
    -------
    ImbPipeline when a resampler is used, otherwise a plain Pipeline.
    """
    stages = []

    if sampling_method:
        sampler_classes = {
            'smote': SMOTE,
            'smotetomek': SMOTETomek,
            'cc': ClusterCentroids,
        }
        chosen_sampler = sampler_classes[sampling_method.lower()]
        stages.append(('sampling', chosen_sampler(random_state=random_state)))

    # Only pin n_components when a non-default method supplied a value.
    pca_options = dict(whiten=True, random_state=random_state)
    if pca_method != 'default' and pca_components is not None:
        pca_options['n_components'] = pca_components
    stages.append(('pca', PCA(**pca_options)))

    stages.append(('knn', KNeighborsClassifier()))

    # A resampling step needs the imblearn pipeline.
    make_pipeline = ImbPipeline if sampling_method else Pipeline
    return make_pipeline(stages)


def tune_knn_pipeline(pipeline, X_train, y_train, param_grid, cv=3, scoring='f1', n_jobs=-1):
    """Grid-search ``param_grid`` over ``pipeline`` and return the winner.

    Runs ``GridSearchCV`` with the given ``cv``, ``scoring`` and ``n_jobs``
    on ``(X_train, y_train)``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` taken from the fitted search.
    """
    search_options = {
        'param_grid': param_grid,
        'cv': cv,
        'scoring': scoring,
        'n_jobs': n_jobs,
    }
    searcher = GridSearchCV(pipeline, **search_options)
    searcher.fit(X_train, y_train)

    winner = (searcher.best_estimator_, searcher.best_params_)
    return winner


def evaluate_and_plot(best_model, X_test, y_test, method_name='best'):
    """Score a fitted classifier on a held-out set.

    Despite the name, no figure is produced here and ``method_name`` is not
    used; the function only computes metrics.

    Parameters
    ----------
    best_model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test, y_test : held-out features and true labels.
    method_name : str
        Accepted for interface compatibility; unused.

    Returns
    -------
    dict
        Keys ``test_f1``, ``test_roc_auc``, ``test_ap``, ``test_pr_auc``,
        ``classification_report`` (text, 3 digits) and ``confusion_matrix``.
    """
    predicted_labels = best_model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score.
    positive_scores = best_model.predict_proba(X_test)[:, 1]

    # Area under the precision-recall curve, reported next to average precision.
    prec_points, rec_points, _thresholds = precision_recall_curve(y_test, positive_scores)
    curve_area = auc(rec_points, prec_points)

    metrics = {}
    metrics['test_f1'] = f1_score(y_test, predicted_labels)
    metrics['test_roc_auc'] = roc_auc_score(y_test, positive_scores)
    metrics['test_ap'] = average_precision_score(y_test, positive_scores)
    metrics['test_pr_auc'] = curve_area
    metrics['classification_report'] = classification_report(y_test, predicted_labels, digits=3)
    metrics['confusion_matrix'] = confusion_matrix(y_test, predicted_labels)
    return metrics


def model_pipeline_knn_pca(
    X_train, y_train, X_test=None, y_test=None,
    n_splits=5, random_state=42,
    pca_method='default', sampling_method=None
):
    """
    Full model pipeline: KNN + PCA + (optional) sampling.
    Performs CV for hyperparameter tuning and evaluates best model.

    For each of ``n_splits`` stratified folds the function picks the PCA size
    with ``get_n_components`` on the fold's training part, grid-searches the
    KNN hyper-parameters (inner 3-fold CV) and scores the tuned pipeline on
    the fold's validation part. The pipeline from the fold with the highest
    validation F1 is returned; it is the estimator fitted on that fold's
    training part, not a refit on all of ``X_train``.

    Parameters
    ----------
    X_train, y_train : pandas objects (rows are selected with ``.iloc``).
    X_test, y_test : optional held-out data; when both are given the selected
        model is also scored with ``evaluate_and_plot``.
    n_splits : int
        Number of outer stratified folds.
    random_state : int
        Seed for fold shuffling and for the pipeline components.
    pca_method : str
        Rule forwarded to ``get_n_components`` / ``build_knn_pipeline``.
    sampling_method : str or None
        Resampler key forwarded to ``build_knn_pipeline``.

    Returns
    -------
    (results, best_model)
        ``results`` holds the mean validation ``f1`` / ``roc_auc`` /
        ``pr_auc`` (average precision), the ``n_components`` and
        ``best_fold_idx`` of the selected fold, plus the ``test_*`` entries
        when test data was supplied.
    """

    # 'minkowski' is intentionally not listed: with KNeighborsClassifier's
    # default p=2 it is the Euclidean distance, so it only duplicated every
    # 'euclidean' candidate (and could never be selected ahead of it).
    knn_param_grid = {
        'knn__n_neighbors': [3, 5, 7, 9, 11],
        'knn__weights': ['uniform', 'distance'],
        'knn__metric': ['euclidean', 'manhattan']
    }

    print(f"\nStarting PCA+KNN pipeline ({'Base' if sampling_method is None else sampling_method.upper()})")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_models, fold_components = [], []
    f1_scores, roc_scores, pr_auc_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Component count is derived from the fold's training part only.
        components = get_n_components(X_tr, method=pca_method)
        pipeline = build_knn_pipeline(
            pca_components=components,
            pca_method=pca_method,
            sampling_method=sampling_method,
            random_state=random_state
        )

        best_pipeline, best_params = tune_knn_pipeline(pipeline, X_tr, y_tr, knn_param_grid, cv=3)

        y_pred = best_pipeline.predict(X_val)
        y_proba = best_pipeline.predict_proba(X_val)[:, 1]

        f1 = f1_score(y_val, y_pred)
        roc = roc_auc_score(y_val, y_proba)
        pr_auc = average_precision_score(y_val, y_proba)

        f1_scores.append(f1)
        roc_scores.append(roc)
        pr_auc_scores.append(pr_auc)
        fold_models.append(best_pipeline)
        fold_components.append(components)

        print(f"Fold {fold}: n_components={components}, Best Params={best_params}, "
              f"F1={f1:.3f}, ROC-AUC={roc:.3f}, PR-AUC={pr_auc:.3f}")

    # Select best fold by F1
    best_fold_idx = int(np.argmax(f1_scores))
    best_model = fold_models[best_fold_idx]

    print(f"\nBest fold selected: Fold {best_fold_idx+1} "
          f"(F1={f1_scores[best_fold_idx]:.3f}, ROC-AUC={roc_scores[best_fold_idx]:.3f}, "
          f"PR-AUC={pr_auc_scores[best_fold_idx]:.3f})")

    results = {
        'f1': np.mean(f1_scores),
        'roc_auc': np.mean(roc_scores),
        'pr_auc': np.mean(pr_auc_scores),
        # Report the component count of the fold whose model is returned,
        # not whatever the last loop iteration happened to compute.
        'n_components': fold_components[best_fold_idx],
        'best_fold_idx': best_fold_idx
    }

    # Test evaluation
    if X_test is not None and y_test is not None:
        test_results = evaluate_and_plot(best_model, X_test, y_test, method_name=pca_method)
        results.update(test_results)

        print(f"\nTest set → F1={results['test_f1']:.3f}, "
              f"ROC-AUC={results['test_roc_auc']:.3f}, "
              f"PR-AUC={results['test_ap']:.3f}")

    return results, best_model



In [11]:
def run_knn_model(X_train, y_train, X_test, y_test, random_state=42):
    """Run the PCA+KNN pipeline once per resampling strategy and compare them.

    The strategies are: no resampling ("Base"), 'smote', 'smotetomek' and
    'cc'. Each one is trained and evaluated with ``model_pipeline_knn_pca``
    using ``pca_method='default'``. Two summaries are printed and returned.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : forwarded to ``model_pipeline_knn_pca``.
    random_state : int
        Seed forwarded to ``model_pipeline_knn_pca``.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        ``df_train`` holds the mean cross-validation F1 / ROC-AUC / PR-AUC per
        method, sorted by F1 descending; ``df_test`` holds the same metrics on
        the test set, sorted by test F1 descending.
    """
    sampling_methods = [None, 'smote', 'smotetomek', 'cc']
    train_summary, test_summary = [], []

    for method in sampling_methods:
        method_name = "Base" if method is None else method.upper()
        print(f"\n{'='*70}\nRunning {method_name} Sampling Method Model\n{'='*70}")

        # The fitted model is not needed for the summary tables.
        res, _ = model_pipeline_knn_pca(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            pca_method='default',
            sampling_method=method,
            random_state=random_state
        )

        train_summary.append({
            "Method": method_name,
            "F1 (Train)": res.get('f1', None),
            "ROC-AUC (Train)": res.get('roc_auc', None),
            # The cross-validation average precision is stored under 'pr_auc'
            # (there is no 'ap' key in the results dict).
            "PR-AUC (Train)": res.get('pr_auc', None)
        })

        test_summary.append({
            "Method": method_name,
            "F1 (Test)": res.get('test_f1', None),
            "ROC-AUC (Test)": res.get('test_roc_auc', None),
            "PR-AUC (Test)": res.get('test_ap', None)
        })

    df_train = pd.DataFrame(train_summary).sort_values(by="F1 (Train)", ascending=False).reset_index(drop=True)
    df_test = pd.DataFrame(test_summary).sort_values(by="F1 (Test)", ascending=False).reset_index(drop=True)

    print("\n\n================ TRAIN PERFORMANCE SUMMARY ================\n")
    print(df_train[["Method", "F1 (Train)", "ROC-AUC (Train)", "PR-AUC (Train)"]].to_string(index=False))

    print("\n\n================ TEST PERFORMANCE SUMMARY ================\n")
    print(df_test[["Method", "F1 (Test)", "ROC-AUC (Test)", "PR-AUC (Test)"]].to_string(index=False))

    # df_test is sorted by test F1, so its first row is the best method.
    best_row = df_test.iloc[0]
    best_method = best_row["Method"]
    best_test_f1 = best_row["F1 (Test)"]
    best_test_roc = best_row["ROC-AUC (Test)"]
    best_test_pr = best_row["PR-AUC (Test)"]

    print(f"\nBest Model (by Test F1): {best_method} → "
          f"F1={best_test_f1:.3f}, ROC-AUC={best_test_roc:.3f}, PR-AUC={best_test_pr:.3f}")

    return df_train, df_test


In [7]:
def visualize_pca_feature_importance(best_model, X_train, y_train, top_n_features=10, top_n_pcs=5):
    """Inspect the PCA stage of a fitted pipeline.

    Builds a feature-by-component loading table (ranked by the sum of absolute
    loadings per feature), prints its head, and draws two figures: a heatmap
    of the top features' loadings (each component column scaled by its largest
    absolute value) and a bar chart of the components most correlated, in
    absolute value, with the label.

    Parameters
    ----------
    best_model : fitted pipeline with ``named_steps``; the first step whose
        name contains "pca" is used.
    X_train : DataFrame whose columns name the features.
    y_train : label container exposing ``.values``.
    top_n_features : int
        Number of features printed and shown in the heatmap.
    top_n_pcs : int
        Number of components shown in the bar chart.

    Returns
    -------
    (loadings, corr_with_label)
        The full ranked loading table (with an ``importance`` column) and the
        absolute label correlation per component, sorted descending.
    """
    # Locate the PCA stage by name.
    pca_names = [step for step in best_model.named_steps if 'pca' in step.lower()]
    pca = best_model.named_steps[pca_names[0]]
    pc_labels = [f'PC{k+1}' for k in range(pca.n_components_)]

    # Loading table: rows are features, columns are components.
    loadings = pd.DataFrame(pca.components_.T, index=X_train.columns, columns=pc_labels)
    loadings['importance'] = loadings.abs().sum(axis=1)
    loadings = loadings.sort_values('importance', ascending=False)

    print("Top features by PCA importance:\n", loadings.head(top_n_features))

    # Project the training data and correlate every component with the label.
    pc_scores = pd.DataFrame(pca.transform(X_train), columns=pc_labels)
    labels = pd.Series(y_train.values, index=pc_scores.index)
    corr_with_label = pc_scores.corrwith(labels).abs().sort_values(ascending=False)

    # Heatmap input: top features x all components (the trailing
    # 'importance' column is excluded), scaled per component.
    def _scale_to_unit_peak(column):
        return column / np.max(np.abs(column))

    top_block = loadings.loc[loadings.head(top_n_features).index, loadings.columns[:-1]]
    loadings_scaled = top_block.apply(_scale_to_unit_peak, axis=0)

    heatmap_width = min(20, pca.n_components_*1.5)
    plt.figure(figsize=(heatmap_width, 6))
    sns.heatmap(loadings_scaled, annot=True, cmap='coolwarm', center=0)
    plt.title(f"PCA Loadings (Top {top_n_features} Features)")
    plt.xlabel("Principal Components")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.show()

    # Bar chart of the components with the strongest label correlation.
    strongest_pcs = corr_with_label.head(top_n_pcs).index
    plt.figure(figsize=(8, 4))
    sns.barplot(x=strongest_pcs, y=corr_with_label[strongest_pcs])
    plt.ylabel("Absolute correlation with label")
    plt.title(f"Top {top_n_pcs} PCs Correlated with Label")
    plt.show()

    return loadings, corr_with_label

### Resample In Model Pipeline

In [None]:

# Train and compare the PCA+KNN pipeline without resampling and with SMOTE,
# SMOTETomek and ClusterCentroids; keeps the cross-validation and test summary tables.
result_train, result_test = run_knn_model(X_train_std, y_train, X_test_std, y_test)



Running Base Sampling Method Model

Starting PCA+KNN pipeline (Base)


In [None]:
# Visualize top features and PCs
# NOTE(review): `best_model` is not assigned by any cell visible above —
# run_knn_model returns only the two summary tables — so this cell presumably
# relies on a variable left over from an earlier or deleted cell. Confirm it is
# defined before a Restart & Run All.
loadings, corr_with_label = visualize_pca_feature_importance(
    best_model, X_train_std, y_train,
    top_n_features=10, top_n_pcs=5
)

## Final performance on Test Set