In [None]:
from functions_ver2 import data_pipeline as data_pipeline_v1

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_recall_curve
import numpy as np

# -----------------------------
# Neural Network Definition
# -----------------------------
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims=[128, 64], dropout=0.2):
        super(MLPClassifier, self).__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 1)) 
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# -----------------------------
# Threshold tuning function
# -----------------------------
def tune_threshold(y_true, y_proba):
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-6)
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    return best_threshold, f1_scores[best_idx]

# -----------------------------
# Training pipeline with hyperparameters
# -----------------------------
def train_mlp_cv_hp(X, y, hp_dict, n_splits=5, random_state=42):
    """
    X: DataFrame of features
    y: Series of labels
    hp_dict: hyperparameter dictionary with keys:
        - hidden_dims
        - dropout
        - batch_size
        - lr
        - n_epochs
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)
    
    fold_metrics = []
    fold_models = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X.values, y.values), 1):
        X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
        y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]
        
        loader_train = DataLoader(TensorDataset(X_train, y_train), 
                                  batch_size=hp_dict["batch_size"], shuffle=True)
        loader_val = DataLoader(TensorDataset(X_val, y_val), 
                                batch_size=hp_dict["batch_size"])
        
        model = MLPClassifier(input_dim=X.shape[1], 
                              hidden_dims=hp_dict["hidden_dims"], 
                              dropout=hp_dict["dropout"]).to(device)
        
        pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight).to(device))
        optimizer = torch.optim.Adam(model.parameters(), lr=hp_dict["lr"])
        
        # Training loop
        model.train()
        for epoch in range(hp_dict["n_epochs"]):
            for xb, yb in loader_train:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                pred = model(xb)
                loss = criterion(pred, yb)
                loss.backward()
                optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            y_val_logits = model(X_val.to(device))
            y_val_proba = torch.sigmoid(y_val_logits).cpu().numpy()
        
        best_threshold, best_f1 = tune_threshold(y_val.numpy(), y_val_proba)
        y_val_pred = (y_val_proba > best_threshold).astype(int)
        
        fold_metrics.append({
            "accuracy": accuracy_score(y_val.numpy(), y_val_pred),
            "f1": f1_score(y_val.numpy(), y_val_pred),
            "roc_auc": roc_auc_score(y_val.numpy(), y_val_proba),
            "best_threshold": best_threshold,
            "f1_at_threshold": best_f1
        })
        fold_models.append(model)
        
        print(f"Fold {fold}: Acc={fold_metrics[-1]['accuracy']:.3f}, "
              f"F1={fold_metrics[-1]['f1']:.3f}, ROC-AUC={fold_metrics[-1]['roc_auc']:.3f}, "
              f"Threshold={best_threshold:.2f}")
    
    # Select best model
    best_idx = np.argmax([m['roc_auc'] for m in fold_metrics])
    best_model = fold_models[best_idx]
    print(f"\nBest model: Fold {best_idx+1}, ROC-AUC={fold_metrics[best_idx]['roc_auc']:.3f}")
    
    return best_model, fold_metrics


In [4]:
X_train_std, y_train, X_train_smote_std, y_train_smote, X_train_smotetomek_std, y_train_smotetomek, X_train_cc_std, y_train_cc, X_test_std, y_test = data_pipeline_v1('onehot')

Loading data
Splitting data
Total unique accounts: 45985. Starting to find cutoff point
Cutoff month where CDF reaches 80%: -10

=== Split based on CDF 80% cutoff ===
Cutoff month: -10 (10 months ago)
Old accounts (≤ month -10): 37,210 (80.9%)
New accounts (> month -10): 8,775 (19.1%)
Ratio (old/new): 4.2405
Splitting raw credit records
Cleaning old accounts credit records - [Length: 996586]


  final_df = df.groupby(['id', 'origination_month']).apply(lambda x: pd.Series({


Cleaning new accounts credit records - [Length: 51989]


  final_df = df.groupby(['id', 'origination_month']).apply(lambda x: pd.Series({
  df_dropped=df_sorted.groupby('id', group_keys=False).apply(keep_row)


Cleaning credit data completed
Splitting application dataset
Cleaning old accounts application records - [Length: (29264, 18)]
Cleaning new accounts appplication records, - [Length: (7193, 18)]
Encoding


  df_dropped=df_sorted.groupby('id', group_keys=False).apply(keep_row)


Encoders: {'name_income_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'name_education_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'name_family_status': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'name_housing_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False), 'occupation_type': OneHotEncoder(handle_unknown='ignore', sparse_output=False)}
Encoding type: onehot
Merging data
Engineering target variable to label data
Completed old accounts labelling
Completed new accounts labelling
Old accounts: (37210, 3)
New accounts: (8775, 3)
Old threshold: 0.2008258258258258
New threshold: 0.5786182336182336
Merging cleaned application and credit records
Train shape: (29264, 63)
Test shape: (7193, 63)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29264 entries, 0 to 29263
Data columns (total 63 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                        

In [1]:
# Example hyperparameter dictionary
hp_dict = {
    "hidden_dims": [256, 128],
    "dropout": 0.3,
    "batch_size": 64,
    "lr": 5e-4,
    "n_epochs": 30
}

best_model, metrics = train_mlp_cv_hp(X_train_smote_std, y_train_smote, hp_dict)


NameError: name 'train_mlp_cv_hp' is not defined

In [10]:
def evaluate_mlp_on_test(best_model, X_test, y_test, batch_size=64):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1).to(device)
    
    best_model.eval()
    with torch.no_grad():
        y_test_logits = best_model(X_test_tensor)
        y_test_proba = torch.sigmoid(y_test_logits).cpu().numpy()
    
    # Tune threshold for best F1
    best_threshold, best_f1 = tune_threshold(y_test.values, y_test_proba)
    y_test_pred = (y_test_proba > best_threshold).astype(int)
    
    # Compute metrics
    test_metrics = {
        "accuracy": accuracy_score(y_test, y_test_pred),
        "f1": f1_score(y_test, y_test_pred),
        "roc_auc": roc_auc_score(y_test, y_test_proba),
        "best_threshold": best_threshold,
        "f1_at_threshold": best_f1
    }
    
    print(f"Test Results → Accuracy={test_metrics['accuracy']:.3f}, "
          f"F1={test_metrics['f1']:.3f}, ROC-AUC={test_metrics['roc_auc']:.3f}, "
          f"Threshold={best_threshold:.2f}, F1@Threshold={best_f1:.3f}")
    
    return test_metrics
best_model = model
test_metrics = evaluate_mlp_on_test(best_model, X_test_std, y_test)

Test Results → Accuracy=0.388, F1=0.230, ROC-AUC=0.551, Threshold=0.25, F1@Threshold=0.231
