<a href="https://colab.research.google.com/github/alyssa-tsh/CS3244_ML_Project/blob/main/cs3244_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from functions import data_pipeline, data_pipeline_svc

In [4]:
# Build the train/test frames with the project-local `functions.data_pipeline` helper.
# NOTE(review): the output recorded under this cell ends in `KeyError: 'id'`, so in that
# run `train` and `test` were never assigned — confirm the pipeline runs on a fresh kernel.
train, test = data_pipeline()
# train_svc, test_svc = data_pipeline_svc()

Loading data
Splitting credits data
Total unique accounts: 45985. Starting to find cutoff point
Cutoff month where CDF reaches 80%: -10

=== Split based on CDF 80% cutoff ===
Cutoff month: -10 (10 months ago)
Old accounts (≤ month -10): 37,210 (80.9%)
New accounts (> month -10): 8,775 (19.1%)
Ratio (old/new): 4.2405
Splitting raw credit records
Cleaning old accounts credit records - [Length: 996586]


  final_df = df.groupby(['id', 'origination_month']).apply(lambda x: pd.Series({


Cleaning new accounts credit records - [Length: 51989]


  final_df = df.groupby(['id', 'origination_month']).apply(lambda x: pd.Series({


Cleaning credit data completed
Splitting applications data
Splitting application dataset


KeyError: 'id'

## Model Pipeline

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
    RobustScaler,
    StandardScaler,
)
from sklearn.svm import SVC
from xgboost import XGBClassifier

### Functions

In [None]:
def get_n_components(X_train, method="avg"):
    """Pick how many principal components to keep for ``X_train``.

    A full PCA is fitted on ``X_train`` and its explained-variance ratios are
    reduced to a component count according to ``method``:

    - ``"avg"``: number of components whose ratio is strictly greater than the
      mean ratio (``1 / number_of_components``).
    - ``"elbow"``: number of leading components before the steepest drop
      between two consecutive ratios.
    - ``"cumulative"``: smallest number of components whose cumulative ratio
      reaches 0.95.
    - any other value: ``None``, i.e. no reduction, used as the baseline to
      check whether the PCA-based choices actually improve the model.

    Parameters
    ----------
    X_train : array-like
        Training features handed to ``PCA.fit``.
    method : str, default "avg"
        One of the strategies listed above.

    Returns
    -------
    int or None
        A plain Python ``int`` of at least 1, or ``None`` for the baseline.
    """
    pca = PCA()
    pca.fit(X_train)
    explained_var_ratio = pca.explained_variance_ratio_

    if method == "avg":
        avg_var = 1 / len(explained_var_ratio)
        optimal_components = int(np.sum(explained_var_ratio > avg_var))
    elif method == "elbow":
        if len(explained_var_ratio) < 2:
            # With a single component there are no consecutive differences;
            # np.diff would be empty and argmin/argmax would raise.
            optimal_components = 1
        else:
            diffs = np.diff(explained_var_ratio)
            # The most negative difference is the steepest drop.
            optimal_components = int(np.argmin(diffs)) + 1
    elif method == "cumulative":
        cum_var = np.cumsum(explained_var_ratio)
        optimal_components = int(np.argmax(cum_var >= 0.95)) + 1
    else:
        # comparison with a base model to see if PCA methods are actually improving the model
        return None

    # "avg" yields 0 when no ratio exceeds the mean (e.g. all ratios equal);
    # keep at least one component so a downstream estimator still gets a feature.
    return max(optimal_components, 1)


# Strategies to compare; "default" falls through to the no-PCA baseline (None).
methods = ["avg", "elbow", "cumulative", "default"]

### Transformations & Scaling
- From the QQ-plot analysis: some numerical features need a transformation first, and different features need different scalers.
> Highly skewed & outlier-heavy: `risk_score`, `months_employed` → `PowerTransformer(method='yeo-johnson')` or `np.log1p()` → `RobustScaler`

> Already normal / log-transformed: `amt_income_total_log`, `age` → `StandardScaler`

> Discrete / ordinal numeric: `cnt_children`, `cnt_fam_members` → keep as is or encode as ordinal integers

### Encoding
* There are many categorical features. Label encoding imposes an arbitrary ordering on the encoded categories: this is acceptable for tree-based models such as XGBoost, but it distorts distance- and margin-based models such as SVC and KNN, which need one-hot encoding instead.

| Feature type                  | XGBoost                     | SVC          | KNN                       |
| ----------------------------- | --------------------------- | ------------ | ------------------------- |
| Binary                        | 0/1 mapping                 | 0/1 mapping  | 0/1 mapping               |
| Low-cardinality (<5)          | One-hot or label encoding   | One-hot only | One-hot                   |
| Medium/high-cardinality (~17) | Frequency or label encoding | One-hot only | One-hot / binary encoding |
| Numeric                       | Raw                         | Standardized | Standardized              |

## Dropping of correlated features
| Feature type     | XGBoost / Tree                      | SVC / KNN / Linear                |
| ---------------- | ----------------------------------- | --------------------------------- |
| Discrete numeric | keep numeric                        | Better as categorical / one-hot   |
| Binned/ordinal   | Optional (tree can handle either)   | Use one-hot encoding              |

## Feature Selection



In [None]:
# Column groups consumed by `build_transformer`; each key selects the preprocessing
# applied to the columns listed under it.
column_dic = {
    # Numeric columns, grouped by the transformation/scaler chosen from the QQ-plot analysis:
    # "skewed" gets Yeo-Johnson + RobustScaler, "normal" gets StandardScaler.
    "skewed" : ['risk_score', 'months_employed'],
    "normal" : ['amt_income_total_log', 'age'],
    # "discrete" : ['cnt_children', 'cnt_fam_members'],
    # categorical variables
    "low_card_cols" : ["name_income_type","name_education_type","name_family_status","name_housing_type"],
    # NOTE(review): "aged_binned" may be a typo for "age_binned" — confirm against the
    # columns actually produced by data_pipeline().
    "high_card_cols" : ["occupation_type", "aged_binned"]
}
def _frequency_encode(X):
    """Replace every column of the DataFrame ``X`` with each value's relative frequency.

    NOTE(review): the frequencies are computed from the frame being transformed,
    so validation/test rows are encoded with their own frequencies rather than
    the training ones. A fitted (stateful) encoder would be needed to avoid that.
    """
    return X.assign(**{col: X[col].map(X[col].value_counts(normalize=True)) for col in X.columns})


def build_transformer(model_name, column_dic):
    """Build the ``ColumnTransformer`` step list for ``model_name``.

    Parameters
    ----------
    model_name : str
        ``"SVC"`` / ``"KNN"`` one-hot encode every categorical column;
        ``"XGB"`` one-hot encodes the low-cardinality columns and
        frequency-encodes the high-cardinality ones; any other name falls back
        to ordinal (integer) encoding of every categorical column.
    column_dic : dict
        Must provide the keys ``"skewed"``, ``"normal"``, ``"low_card_cols"``
        and ``"high_card_cols"``, each mapping to a list of column names.

    Returns
    -------
    list of tuple
        A flat list of ``(name, transformer, columns)`` triples, which is the
        shape ``ColumnTransformer`` expects for its ``transformers`` argument.
    """
    categorical_cols = column_dic["low_card_cols"] + column_dic["high_card_cols"]

    # Numeric handling is shared by every model.
    transformer = [
        ('skewed', Pipeline([
            ('yeo', PowerTransformer(method='yeo-johnson')),
            ('robust', RobustScaler())
        ]), column_dic["skewed"]),
        ('normal', StandardScaler(), column_dic["normal"]),
    ]

    if model_name == "SVC" or model_name == "KNN":
        print("Building transformer for model:", model_name)
        transformer.append(
            ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        )
    elif model_name == "XGB":
        print("Building transformer for model:", model_name)
        transformer.extend(
            [
                ("low_cat", OneHotEncoder(handle_unknown='ignore'), column_dic["low_card_cols"]),
                ("high_cat", FunctionTransformer(_frequency_encode), column_dic["high_card_cols"]),
            ]
        )
    else:
        print("Building default transformer model")
        # LabelEncoder is a target encoder (no constructor arguments, 1-D input only);
        # OrdinalEncoder is the feature-side equivalent and can map unseen categories to -1.
        transformer.append(
            ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols)
        )
    return transformer

def drop_correlated_features(model_name):
    """Return the column names to drop before fitting ``model_name``.

    The base list removes features that are highly correlated with ones that
    are kept (``months_employed`` is kept in place of ``years_employed``).
    For ``"SVC"`` and ``"KNN"`` the discrete count columns are dropped as well.

    Returns
    -------
    list of str
        A flat list, suitable for ``DataFrame.drop(columns=...)``.
    """
    # drop highly correlated features - keeping months employed
    drop_cols = ["days_birth", "amt_income_total", "years_employed", "flag_mobil"]
    if model_name == "SVC" or model_name == "KNN":
        # extend, not append: append would nest a list inside the list of names
        drop_cols.extend(["cnt_children", "cnt_fam_members"])

    return drop_cols

def build_model(model_name):
    """Return an unfitted classifier for ``model_name``.

    Every branch returns a bare estimator so the result can be used directly
    as a ``Pipeline`` step.

    Parameters
    ----------
    model_name : str
        ``"SVC"`` (linear-kernel SVC), ``"XGB"`` (XGBClassifier with log-loss
        evaluation metric) or ``"KNN"`` (KNeighborsClassifier with defaults).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == "SVC":
        return SVC(kernel='linear', random_state=42)
    if model_name == "XGB":
        # `use_label_encoder` is deprecated in XGBoost and no longer has any effect,
        # so it is not passed.
        return XGBClassifier(eval_metric="logloss", random_state=42)
    if model_name == "KNN":
        return KNeighborsClassifier()
    raise ValueError("Unsupported model name")
        

def model_pipeline(model_name, train_df, test_df, target_col="label", random_state=42):
    """Cross-validate one model on the training frame and return the prepared splits.

    Parameters
    ----------
    model_name : str
        Name forwarded to ``drop_correlated_features``, ``build_transformer``
        and ``build_model``.
    train_df, test_df : pandas.DataFrame
        Frames that contain ``target_col`` plus the feature columns. They are
        not modified.
    target_col : str, default "label"
        Name of the target column.
    random_state : int, default 42
        Seed for the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    tuple
        ``(results, X_train_full, y_train_full, X_test, y_test)``. ``results``
        is a one-element list holding a dict with the keys ``"model"``,
        ``"accuracy"``, ``"f1_score"`` and ``"roc_auc"`` (means over 5 folds).
    """
    drop_cols = drop_correlated_features(model_name)
    # DataFrame.drop returns new frames, so the caller's inputs stay untouched.
    train = train_df.drop(columns=drop_cols)
    test = test_df.drop(columns=drop_cols)

    # Separate features from the target.
    X_train_full = train.drop(columns=[target_col])
    y_train_full = train[target_col]
    X_test = test.drop(columns=[target_col])
    y_test = test[target_col]

    # Skewed columns use RobustScaler: per the EDA, Standard and Min-Max scaling
    # were distorted by outliers.
    # NOTE(review): ColumnTransformer defaults to remainder="drop", so every column
    # that is not listed in column_dic is discarded here — confirm this is intended.
    preprocessor = ColumnTransformer(
        build_transformer(model_name, column_dic)
    )

    # build_model may hand back either a bare estimator or a {display_name: estimator} dict.
    built = build_model(model_name)
    if isinstance(built, dict):
        name, model = next(iter(built.items()))
    else:
        name, model = type(built).__name__, built

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    results = []
    print(f"Training model: {name} with StratifiedKFold...")
    acc_scores, f1_scores, roc_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_full, y_train_full), 1):
        X_train, X_val = X_train_full.iloc[train_idx], X_train_full.iloc[val_idx]
        y_train, y_val = y_train_full.iloc[train_idx], y_train_full.iloc[val_idx]

        pipeline = Pipeline([
            ("preprocess", preprocessor),
            ("classifier", model)
        ])
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_val)
        # ROC-AUC needs a continuous score: prefer probabilities, fall back to the
        # decision function (e.g. SVC without probability=True), otherwise skip.
        if hasattr(pipeline, "predict_proba"):
            y_score = pipeline.predict_proba(X_val)[:, 1]
        elif hasattr(pipeline, "decision_function"):
            y_score = pipeline.decision_function(X_val)
        else:
            y_score = None

        acc_scores.append(accuracy_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))
        roc_scores.append(roc_auc_score(y_val, y_score) if y_score is not None else np.nan)

        print(f"Fold {fold}: Accuracy={acc_scores[-1]:.3f}, F1={f1_scores[-1]:.3f}, ROC-AUC={roc_scores[-1]:.3f}")

    results.append({
        "model": name,
        "accuracy": np.mean(acc_scores),
        "f1_score": np.mean(f1_scores),
        "roc_auc": np.nanmean(roc_scores)
    })
    print(f"Finished training {name} across all folds.\n")

    return results, X_train_full, y_train_full, X_test, y_test




In [None]:
# Load the data once and reuse it for every model: data_pipeline() takes no arguments
# here and model_pipeline() works on dropped copies of the frames it receives.
train_df, test_df = data_pipeline()

models = ["SVC", "XGB", "KNN"]
all_results = []  # one metrics dict per model, collected across iterations
for model in models:
    # model_pipeline's parameters are named train_df / test_df (not train / test).
    results_df, X_train, y_train, X_test, y_test = model_pipeline(
        model_name=model, train_df=train_df, test_df=test_df, target_col="label", random_state=42
    )
    all_results.extend(results_df)

# Side-by-side comparison of the cross-validated metrics.
pd.DataFrame(all_results)