In [None]:
import sys
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from utils_ml import (StatisticalDataNormalizer, Trainer, MetricCalculator)


## Make Dataset

In [None]:
# prepare: dataset location, label column and the seed list handed to Trainer

# Placeholders -- fill both in before running the notebook.
df_path = ''
label_col = ''

all_seed_list = [666, 777, 888, 999, 111]

# Load the full dataset from the CSV at df_path.
df = pd.read_csv(df_path)

## Train Model

In [None]:
# Hyper-parameters forwarded to Trainer as keyword arguments.
max_depth = -1
n_estimators = 100

trainer = Trainer(
    df,
    all_seed_list,
    label_col=label_col,
    enable_shap_plot=True,
    debug=False,
    model_name_list=['LR', 'LGBM'],
    max_depth=max_depth,
    n_estimators=n_estimators,
)

# cross_validate() yields three results: outputs, fitted models, feature importances.
cv_results = trainer.cross_validate()
df_outputs, all_model_dict, df_feat_importances = cv_results

# Derive the metrics table from the models and the cross-validation outputs.
metric_calculator = MetricCalculator(all_model_dict, df_outputs, label_col=label_col)
df_metrics = metric_calculator.calculate()

In [None]:
# Mean of every df_metrics column within each (model, phase) group.
df_metrics.groupby(by=['model', 'phase']).mean()

In [None]:
# Show floats with eight decimals (this is a global pandas display option).
pd.set_option('display.float_format', lambda value: '%.8f' % value)

# Per-model mean of each column, one row per column after transposing,
# ordered by the 'LogisticRegression' column.
# NOTE(review): assumes Trainer labels the 'LR' model as 'LogisticRegression' -- confirm.
mean_importances = df_feat_importances.groupby('model').mean()
mean_importances.T.sort_values(by='LogisticRegression')

In [None]:
# Ask the trainer to save the 'LR' model trained with all data
# (where and how it is stored is decided inside Trainer, not shown here).
trainer.save_model_trained_with_all_data(model_name='LR')

# Calibration

In [None]:
import seaborn as sns

# Work on a copy: a plain `df = df_outputs` would alias the cross-validation
# outputs, so adding the helper columns below would silently modify df_outputs.
# NOTE: from here on `df` holds the cross-validation outputs, not the CSV
# loaded at the top of the notebook.
df = df_outputs.copy()

# Placeholder -- set to the name of the df_outputs column holding the
# predictions that should be calibrated.
pred_col = ''

# Uniform column names used by the calibration cells below.
df['pred'] = df[pred_col]
df['gt'] = df[label_col]

In [None]:
# Share of predictions falling into each of ten equal-width bins over [0, 1].
predictions = df['pred']

n_bins = 10
# Integer edges divided by n_bins give clean 0.0, 0.1, ..., 1.0 boundaries;
# repeatedly adding 0.1 drifts (e.g. 0.30000000000000004).
bin_edges = np.arange(n_bins + 1) / n_bins

# np.histogram closes the last bin on the right, so predictions equal to
# exactly 1.0 are counted instead of falling outside every bin.
counts = np.histogram(np.asarray(predictions, dtype=float), bins=bin_edges)[0]

x = [f'{lower:.1f}' for lower in bin_edges[:-1]]
# Same denominator as before: the total number of predictions.
y = (counts / predictions.shape[0]).tolist()

# x / y must be passed by keyword: seaborn >= 0.12 rejects them positionally.
ax = sns.barplot(x=x, y=y)
ax.set_xlabel('Predicted probability (lower bin edge)')
ax.set_ylabel('Fraction of predictions')
# This chart is the distribution of the predictions, not a reliability diagram.
ax.set_title('Distribution of predicted probabilities')
plt.show()

In [None]:
from sklearn.isotonic import IsotonicRegression

class SigmoidCalibrator:
    """Calibrate probabilities with a sigmoid fitted to a binned reliability curve.

    A straight line is fitted from ``prob_pred`` to the logit of ``prob_true``;
    ``calibrate`` evaluates that line on new probabilities and maps the result
    back through the logistic function.
    """

    def __init__(self, prob_pred, prob_true):
        """Fit the calibrator.

        Args:
            prob_pred: 1-D sequence of predicted probabilities (one per bin).
            prob_true: 1-D sequence of observed positive fractions, same length.

        Raises:
            ValueError: if the inputs differ in length, or if no entry of
                ``prob_true`` lies strictly inside (0, 1).
        """
        prob_pred, prob_true = self._filter_out_of_domain(prob_pred, prob_true)
        if prob_pred.size == 0:
            raise ValueError(
                "SigmoidCalibrator needs at least one bin whose observed "
                "positive fraction lies strictly inside (0, 1)"
            )
        # Logit transform; finite because 0 and 1 were filtered out above.
        logit_true = np.log(prob_true / (1 - prob_true))
        self.regressor = LinearRegression().fit(
            prob_pred.reshape(-1, 1), logit_true.reshape(-1, 1)
        )

    def calibrate(self, probabilities):
        """Return calibrated probabilities as a 1-D array.

        Args:
            probabilities: array-like of raw predicted probabilities
                (list, ndarray or pandas Series).
        """
        probabilities = np.asarray(probabilities, dtype=float)
        logits = self.regressor.predict(probabilities.reshape(-1, 1)).flatten()
        return 1 / (1 + np.exp(-logits))

    def _filter_out_of_domain(self, prob_pred, prob_true):
        """Keep only the pairs whose ``prob_true`` lies strictly inside (0, 1).

        The logit is undefined at 0 and 1, so those pairs cannot be used for
        the fit.

        Returns:
            Array of shape (2, k): row 0 is the kept ``prob_pred`` values,
            row 1 the kept ``prob_true`` values. ``k`` is 0 when nothing
            survives, so the result can always be unpacked into two arrays.
        """
        prob_pred = np.asarray(prob_pred, dtype=float).ravel()
        prob_true = np.asarray(prob_true, dtype=float).ravel()
        if prob_pred.shape != prob_true.shape:
            raise ValueError("prob_pred and prob_true must have the same length")
        keep = (prob_true > 0) & (prob_true < 1)
        return np.array([prob_pred[keep], prob_true[keep]])


class IsotonicCalibrator:
    """Calibrate probabilities with an isotonic regression fitted to a reliability curve."""

    def __init__(self, prob_pred, prob_true):
        """Fit an isotonic regression from ``prob_pred`` to ``prob_true``.

        The regressor is created with ``out_of_bounds="clip"``.
        """
        isotonic = IsotonicRegression(out_of_bounds="clip")
        isotonic.fit(prob_pred, prob_true)
        self.regressor = isotonic

    def calibrate(self, probabilities):
        """Return the fitted regressor's prediction for ``probabilities``."""
        calibrated = self.regressor.predict(probabilities)
        return calibrated

In [None]:
from sklearn.calibration import calibration_curve

# Reliability curve of the raw predictions: per-bin fraction of positives
# (prob_true) against per-bin mean predicted probability (prob_pred).
prob_true, prob_pred = calibration_curve(df['gt'], predictions, n_bins=10)

# Both calibrators are fitted on that binned curve.
calibrators = {}
calibrators["sigmoid"] = SigmoidCalibrator(prob_pred, prob_true)
calibrators["isotonic"] = IsotonicCalibrator(prob_pred, prob_true)

# NOTE(review): the calibrators are evaluated on the same predictions they
# were fitted on; a held-out split would give a less optimistic picture.
for k, v in calibrators.items():
    calibrated_preds = v.calibrate(np.array(predictions))
    # Separate names so the fit-time curve (prob_true / prob_pred) is not overwritten.
    calibrated_true, calibrated_pred = calibration_curve(df['gt'], calibrated_preds, n_bins=10)

    # One labelled reliability diagram per calibrator; the dashed diagonal
    # marks perfect calibration.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='perfectly calibrated')
    plt.plot(calibrated_pred, calibrated_true, marker='o', label=k)
    plt.xlabel('Mean predicted probability')
    plt.ylabel('Fraction of positives')
    plt.title(f'Reliability diagram ({k} calibration)')
    plt.legend()
    plt.show()