In [1]:
import os

# Setup Imports
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, Markdown, Latex

# Baseline Imports
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import torch

import time

from tabpfn import TabPFNClassifier, TabPFNRegressor
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier, AutoTabPFNRegressor

if not torch.mps.is_available():
    raise SystemError('GPU device not found. For fast training, please enable GPU. See section above for instructions.')

SystemError: GPU device not found. For fast training, please enable GPU. See section above for instructions.

In [2]:
df = pd.read_csv(r'C:\Users\Hovsep\Desktop\Hovsep\German\HU studies\Semester 3\Applied Predictive Analytics\TABPFNCredit-APA25\data\pd\02 taiwan creditcard\taiwan_creditcard.csv')

In [3]:
def _preprocess_02_taiwan_creditcard(_data):

    # Drop ID and useless columns
    _data = _data.drop('ID', axis=1)

    # Transform
    _data['SEX'] = _data['SEX'].replace({'2': 1, '1': 0})

    # Split into covariates, labels
    y = _data['default.payment.next.month'].values.astype(int)
    x = _data.drop('default.payment.next.month', axis=1).values

    cols = list(_data.drop('default.payment.next.month', axis=1).columns)

    cols_cat = []
    cols_num = cols

    cols_cat_idx = [cols.index(col) for col in cols_cat if col in cols]
    cols_num_idx = [cols.index(col) for col in cols_num if col in cols]

    print("02_taiwan_creditcard preprocessed")
    print("x shape: ", x.shape)
    print("y shape: ", y.shape)

    return x, y, cols, cols_cat, cols_num, cols_cat_idx, cols_num_idx

X, y, cols, cols_cat, cols_num, cols_cat_idx, cols_num_idx = _preprocess_02_taiwan_creditcard(df)

02_taiwan_creditcard preprocessed
x shape:  (30000, 23)
y shape:  (30000,)


In [4]:
X = X[:1000]
y = y[:1000]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
from tabpfn_extensions.rf_pfn import (
    RandomForestTabPFNClassifier,
    RandomForestTabPFNRegressor,
)

from tabpfn_extensions import TabPFNClassifier, TabPFNRegressor

# ----------------------------
# Classification - Strategy 1: Random Forest Preprocessing
# ----------------------------

clf_base = TabPFNClassifier(
    ignore_pretraining_limits=False,
    inference_config = {"SUBSAMPLE_SAMPLES": 10000} # Needs to be set low so that not OOM on fitting intermediate nodes
)

tabpfn_tree_clf = RandomForestTabPFNClassifier(
    tabpfn=clf_base,
    verbose=1,
    max_predict_time=60, # Will fit for one minute
    fit_nodes=True, # Wheather or not to fit intermediate nodes
    adaptive_tree=True, # Whather or not to validate if adding a leaf helps or not
  )

In [7]:
# ----------------------------
# Classification - Strategy 2: Subsampled Ensemble using TabPFNClassifier
# ----------------------------
print("\n--- Classification: Strategy 1 (Subsampled Ensemble) ---")
tabpfn_subsample_clf = TabPFNClassifier(
    ignore_pretraining_limits=True,  # (bool) Allows the use of datasets larger than pretraining limits.
    n_estimators=32,                 # (int) Number of estimators for ensembling; improves accuracy with higher values.
    inference_config={
        "SUBSAMPLE_SAMPLES": 10000  # (int) Maximum number of samples per inference step to manage memory usage.
    },
)


--- Classification: Strategy 1 (Subsampled Ensemble) ---


In [8]:
# Compare different machine learning models by training each one multiple times
# on different parts of the data and averaging their performance scores for a
# more reliable performance estimate

assert len(np.unique(y)) <= 10 # Is classification?

print("\n Starting Training...")
start_time = time.time()


# Define models
models_class = [
    ('TabPFN RF', tabpfn_tree_clf),
#     ('TabPFN Subsample', tabpfn_subsample_clf),
#     ('XGBoost', XGBClassifier()),
#     ('CatBoost', CatBoostClassifier(random_state=42, verbose=0)),
#     ('RandomForest', RandomForestClassifier(random_state=42)),
]

# Calculate scores
cv = KFold(random_state=42, n_splits=3, shuffle=True)
scoring = 'roc_auc_ovr' if len(np.unique(y)) > 2 else 'roc_auc'
scores_raw_class = {name: cross_val_score(model, X, y, cv=cv, scoring=scoring, verbose=1)
          for name, model in models_class}
scores_class = {name: scores_raw_class[name].mean()
          for name, model in models_class}


end_time = time.time()
print(f"\n Training done in {(end_time - start_time)/60:.2f} minutes.")


 Starting Training...

 Training done in 3.19 minutes.


In [None]:
# Plot results
df = pd.DataFrame(list(scores_class.items()), columns=['Model', 'ROC AUC'])
colors = ['tab:blue' if 'RF' in name else ('tab:red' if 'sample' in name else 'tab:gray') for (name, _) in models_class]
ax = df.plot(x='Model', y='ROC AUC', kind='bar', figsize=(10, 6), color=colors)
ax.set_ylim(df['ROC AUC'].min() * 0.995, min(1.0, df['ROC AUC'].max() * 1.005))
ax.set_title('Model Comparison - 5-fold Cross-validation')

trying the tabpfn tuner with autotuning

In [12]:
from tabpfn_extensions.hpo import TunedTabPFNClassifier

# Create a tuned classifier with 50 optimization trials
tuned_clf = TunedTabPFNClassifier(
    n_trials=50,                    # Number of hyperparameter configurations to try
    metric='roc_auc',              # Metric to optimize
    categorical_feature_indices=[0, 2],  # Categorical features
    random_state=42                 # For reproducibility
)

# Fit will automatically find the best hyperparameters
tuned_clf.fit(X_train, y_train)

# Use like any scikit-learn estimator
y_pred = tuned_clf.predict(X_test)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]

  model, _, config_ = load_model_criterion_config(

  from .autonotebook import tqdm as notebook_tqdm

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client





Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



  2%|▉                                             | 1/50 [01:30<1:13:59, 90.60s/trial, best loss: -0.8283582089552238]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



  4%|█▉                                              | 2/50 [02:28<57:15, 71.58s/trial, best loss: -0.8283582089552238]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



  6%|██▉                                             | 3/50 [03:12<46:12, 59.00s/trial, best loss: -0.8283582089552238]

  model, _, config_ = load_model_criterion_config(

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



  8%|███▉                                             | 4/50 [04:20<47:52, 62.46s/trial, best loss: -0.835820895522388]

  model, _, config_ = load_model_criterion_config(

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client





Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 10%|████▉                                            | 5/50 [05:00<40:39, 54.20s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 12%|█████▉                                           | 6/50 [05:34<34:50, 47.51s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 14%|██████▊                                          | 7/50 [06:31<36:12, 50.53s/trial, best loss: -0.835820895522388]

  model, _, config_ = load_model_criterion_config(

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 16%|███████▊                                         | 8/50 [07:12<33:17, 47.57s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 18%|████████▊                                        | 9/50 [08:42<41:27, 60.68s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 20%|█████████▌                                      | 10/50 [09:54<42:53, 64.35s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client




Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 22%|██████████▌                                     | 11/50 [10:28<35:44, 54.98s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 24%|███████████▌                                    | 12/50 [10:53<28:56, 45.70s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 26%|████████████▍                                   | 13/50 [11:40<28:33, 46.30s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 28%|█████████████▍                                  | 14/50 [12:18<26:15, 43.77s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 30%|██████████████▍                                 | 15/50 [13:18<28:25, 48.72s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 32%|███████████████▎                                | 16/50 [13:53<25:16, 44.59s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 34%|████████████████▎                               | 17/50 [14:43<25:24, 46.20s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 36%|█████████████████▎                              | 18/50 [16:01<29:45, 55.81s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 38%|██████████████████▏                             | 19/50 [16:33<25:01, 48.43s/trial, best loss: -0.835820895522388]

  model, _, config_ = load_model_criterion_config(

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 40%|███████████████████▏                            | 20/50 [17:18<23:41, 47.38s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 42%|████████████████████▏                           | 21/50 [17:59<21:58, 45.48s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 44%|█████████████████████                           | 22/50 [19:19<26:05, 55.92s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 46%|██████████████████████                          | 23/50 [19:47<21:26, 47.65s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 48%|███████████████████████                         | 24/50 [20:44<21:48, 50.34s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 50%|████████████████████████                        | 25/50 [21:30<20:30, 49.20s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 52%|████████████████████████▉                       | 26/50 [22:12<18:47, 46.98s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 54%|█████████████████████████▉                      | 27/50 [23:23<20:44, 54.10s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 56%|██████████████████████████▉                     | 28/50 [24:05<18:33, 50.61s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 58%|███████████████████████████▊                    | 29/50 [24:26<14:31, 41.52s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 60%|████████████████████████████▊                   | 30/50 [24:40<11:06, 33.34s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 62%|█████████████████████████████▊                  | 31/50 [25:12<10:28, 33.08s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client





Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 64%|██████████████████████████████▋                 | 32/50 [25:46<09:59, 33.28s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 66%|███████████████████████████████▋                | 33/50 [26:05<08:13, 29.04s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 68%|████████████████████████████████▋               | 34/50 [26:18<06:27, 24.22s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 70%|█████████████████████████████████▌              | 35/50 [27:14<08:24, 33.65s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 72%|██████████████████████████████████▌             | 36/50 [28:33<11:00, 47.18s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 74%|███████████████████████████████████▌            | 37/50 [29:09<09:29, 43.81s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 76%|████████████████████████████████████▍           | 38/50 [29:52<08:43, 43.59s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 78%|█████████████████████████████████████▍          | 39/50 [30:41<08:16, 45.18s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 80%|██████████████████████████████████████▍         | 40/50 [32:17<10:04, 60.48s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client





Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 82%|███████████████████████████████████████▎        | 41/50 [32:52<07:55, 52.79s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 84%|████████████████████████████████████████▎       | 42/50 [34:02<07:44, 58.05s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 86%|█████████████████████████████████████████▎      | 43/50 [35:38<08:05, 69.35s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 88%|██████████████████████████████████████████▏     | 44/50 [36:31<06:26, 64.40s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client





Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 90%|███████████████████████████████████████████▏    | 45/50 [37:51<05:45, 69.09s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 92%|████████████████████████████████████████████▏   | 46/50 [39:15<04:54, 73.56s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 94%|█████████████████████████████████████████████   | 47/50 [40:08<03:22, 67.66s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 96%|██████████████████████████████████████████████  | 48/50 [41:11<02:12, 66.07s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



 98%|███████████████████████████████████████████████ | 49/50 [42:37<01:12, 72.21s/trial, best loss: -0.835820895522388]

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client



100%|████████████████████████████████████████████████| 50/50 [42:58<00:00, 51.57s/trial, best loss: -0.835820895522388]


Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client


In [25]:
 tuned_clf.predict_proba(X_test)

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client


array([[0.73074007, 0.26925993],
       [0.8795687 , 0.1204313 ],
       [0.88711888, 0.11288112],
       [0.93638292, 0.06361708],
       [0.86568275, 0.13431725],
       [0.88322412, 0.11677588],
       [0.87651786, 0.12348214],
       [0.40098746, 0.59901254],
       [0.83786645, 0.16213355],
       [0.8866648 , 0.1133352 ],
       [0.89190365, 0.10809635],
       [0.8335162 , 0.1664838 ],
       [0.8518206 , 0.1481794 ],
       [0.92616078, 0.07383922],
       [0.61038816, 0.38961184],
       [0.8934478 , 0.1065522 ],
       [0.86036568, 0.13963432],
       [0.85813031, 0.14186969],
       [0.85381824, 0.14618176],
       [0.64784149, 0.35215851],
       [0.89268423, 0.10731577],
       [0.64605457, 0.35394543],
       [0.88978739, 0.11021261],
       [0.47542359, 0.52457641],
       [0.89107564, 0.10892436],
       [0.88750876, 0.11249124],
       [0.90212709, 0.09787291],
       [0.91504431, 0.08495569],
       [0.87895426, 0.12104574],
       [0.85724623, 0.14275377],
       [0.

In [14]:
df_gmsc = pd.read_csv(r'C:\Users\Hovsep\Desktop\Hovsep\German\HU studies\Semester 3\Applied Predictive Analytics\TABPFNCredit-APA25\data\pd\01 kaggle_give me some credit\gmsc.csv')

In [15]:
df_gmsc

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [17]:
def _preprocess_01_gmsc(_data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, list[str], list[str], list[str], list[int], list[int]]:

    y = _data['SeriousDlqin2yrs'].values.astype(int)
    x = _data.drop('SeriousDlqin2yrs', axis=1).values

    cols = list(_data.drop('SeriousDlqin2yrs', axis=1).columns)

    cols_cat = []
    cols_num = cols

    cols_cat_idx = []
    cols_num_idx = list(range(len(cols)))

    print("01_gmsc preprocessed")
    print("x shape: ", x.shape)
    print("y shape: ", y.shape)

    return x, y, cols, cols_cat, cols_num, cols_cat_idx, cols_num_idx

x, y, cols, cols_cat, cols_num, cols_cat_idx, cols_num_idx = _preprocess_01_gmsc(df_gmsc)

01_gmsc preprocessed
x shape:  (150000, 10)
y shape:  (150000,)


### Introducing imbalance

In [7]:
import numpy as np

def induce_class_imbalance(x, y, imbalance_ratio=0.1, random_state=0):
    """
    Downsample the minority class to achieve a given class imbalance.

    Parameters
    ----------
    x : np.ndarray
        Feature matrix.
    y : np.ndarray
        Target vector (1D, integer or categorical).
    imbalance_ratio : float
        Desired ratio of minority class in the output data (e.g. 0.1 for 10%).
    random_state : int
        Random seed.

    Returns
    -------
    x_new, y_new : np.ndarray
        Feature matrix and labels with induced class imbalance.
    """

    # Count class occurrences
    unique, counts = np.unique(y, return_counts=True)
    print(f"[INFO] Class distribution BEFORE imbalance: {dict(zip(unique, counts))}")

    # Find majority and minority classes
    class_counts = dict(zip(unique, counts))
    majority_class = unique[np.argmax(counts)]
    minority_class = unique[np.argmin(counts)]
    print(f"[INFO] Majority class: {majority_class}, Minority class: {minority_class}")

    idx_major = np.where(y == majority_class)[0]
    idx_minor = np.where(y == minority_class)[0]
    n_major = len(idx_major)

    # Calculate how many minority samples to keep
    n_minor_new = int(n_major * imbalance_ratio / (1 - imbalance_ratio))
    n_minor_new = min(len(idx_minor), n_minor_new)
    print(f"[INFO] Will keep {n_minor_new} of {len(idx_minor)} minority samples (ratio {imbalance_ratio})")

    # Randomly sample minority indices
    rng = np.random.RandomState(random_state)
    idx_minor_sampled = rng.choice(idx_minor, size=n_minor_new, replace=False)

    # Combine indices and shuffle
    idx_combined = np.concatenate([idx_major, idx_minor_sampled])
    rng.shuffle(idx_combined)

    # Final data
    x_new = x[idx_combined]
    y_new = y[idx_combined]

    # Print new distribution
    unique_new, counts_new = np.unique(y_new, return_counts=True)
    print(f"[INFO] Class distribution AFTER imbalance: {dict(zip(unique_new, counts_new))}")
    print(f"[INFO] Total samples: {len(y_new)}\n")
    return x_new, y_new


In [10]:
x = np.random.randn(1000, 5)
y = np.array([0]*500 + [1]*500)

x_imb, y_imb = induce_class_imbalance(x, y, imbalance_ratio=0.1, random_state=42)


[INFO] Class distribution BEFORE imbalance: {0: 500, 1: 500}
[INFO] Majority class: 0, Minority class: 0
[INFO] Will keep 55 of 500 minority samples (ratio 0.1)
[INFO] Class distribution AFTER imbalance: {0: 555}
[INFO] Total samples: 555



In [13]:
x = np.random.randn(1000, 5)
y = np.array([0]*900 + [1]*100)

x_imb, y_imb = induce_class_imbalance(x, y, imbalance_ratio=0.05, random_state=42)


[INFO] Class distribution BEFORE imbalance: {0: 900, 1: 100}
[INFO] Majority class: 0, Minority class: 1
[INFO] Will keep 47 of 100 minority samples (ratio 0.05)
[INFO] Class distribution AFTER imbalance: {0: 900, 1: 47}
[INFO] Total samples: 947



[INFO] Class distribution BEFORE imbalance: {0: 900, 1: 100}
[INFO] Majority class: 0, Minority class: 1
[INFO] Will keep 100 of 100 minority samples (ratio 0.1)
[INFO] Class distribution AFTER imbalance: {0: 900, 1: 100}
[INFO] Total samples: 1000

Class 0: 900 Class 1: 100


In [18]:
from collections import Counter
from imblearn.over_sampling import SMOTE


def _introduce_class_imbalance(x, y, imbalance_ratio=0.1, random_state=0):
    """
    Create artificial class imbalance in the dataset. Downsamples or oversamples
    the minority class to reach the desired imbalance ratio.

    Parameters
    ----------
    x : np.ndarray
        Feature matrix.
    y : np.ndarray
        Target vector.
    imbalance_ratio : float
        Desired minority class ratio (e.g., 0.1 for 10% minority).
    random_state : int
        Random seed.

    Returns
    -------
    x_new, y_new : np.ndarray
        New dataset with induced class imbalance.
    """
    unique, counts = np.unique(y, return_counts=True)
    print(f"Class distribution BEFORE imbalance: {dict(zip(unique, counts))}")

    # Identify majority and minority classes
    class_counts = dict(zip(unique, counts))
    majority_class = unique[np.argmax(counts)]
    minority_class = unique[np.argmin(counts)]
    idx_major = np.where(y == majority_class)[0]
    idx_minor = np.where(y == minority_class)[0]

    current_ratio = len(idx_minor) / (len(idx_major) + len(idx_minor))
    print(f"Current minority ratio: {current_ratio:.4f}, Desired: {imbalance_ratio}")

    rng = np.random.RandomState(random_state)

    # CASE 1: UNDERSAMPLING (if desired ratio < current)
    if imbalance_ratio < current_ratio:
        n_major = len(idx_major)
        n_minor_new = int(n_major * imbalance_ratio / (1 - imbalance_ratio))
        n_minor_new = min(len(idx_minor), n_minor_new)

        idx_minor_sampled = rng.choice(idx_minor, size=n_minor_new, replace=False)
        idx_combined = np.concatenate([idx_major, idx_minor_sampled])
        rng.shuffle(idx_combined)
        x_new, y_new = x[idx_combined], y[idx_combined]

    # CASE 2: OVERSAMPLING (if desired ratio > current)
    else:
        # Use SMOTE or other oversampler to create synthetic minority points
        smote = SMOTE(sampling_strategy=imbalance_ratio, random_state=random_state)
        x_new, y_new = smote.fit_resample(x, y)

    # Log final distribution
    final_counts = dict(Counter(y_new))
    print(f"Class distribution AFTER imbalance: {final_counts}")
    print(f"Total samples: {len(y_new)}\n")
    return x_new, y_new


In [21]:
x = np.random.randn(1600, 10)
y = np.array([0]*1500 + [1]*100)
x_imb, y_imb = _introduce_class_imbalance(x, y, imbalance_ratio=0.1)
print('Class 0:', np.sum(y_imb == 0), 'Class 1:', np.sum(y_imb == 1))



Class distribution BEFORE imbalance: {0: 1500, 1: 100}
Current minority ratio: 0.0625, Desired: 0.1
Class distribution AFTER imbalance: {0: 1500, 1: 150}
Total samples: 1650

Class 0: 1500 Class 1: 150
