# Тестирование сжатия WOE-значений в классе DataSamples

## Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import string
import ipytest
import pytest
import sys

sys.path.append("/mnt/d/repo/packages/")
import vtb_mlkit

ipytest.autoconfig()

# Show floats with five decimal places and never truncate the list of
# displayed columns.
pd.options.display.float_format = "{:.5f}".format
pd.options.display.max_columns = None


(CVXPY) Jul 21 12:21:53 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.6.2534). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Jul 21 12:21:53 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.6.2534). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')


In [2]:
# Check that the installed framework is the expected version
assert vtb_mlkit.__version__ == "1.0.0"

## Загружаем данные

In [3]:
# Load the weather dataset from CSV, parsing the "Date" column as datetimes.
temp = pd.read_csv("./data/weatherAUS.csv", parse_dates=["Date"])

In [4]:
temp.describe()


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,143975.0,144199.0,142199.0,82670.0,75625.0,135197.0,143693.0,142398.0,142806.0,140953.0,130395.0,130432.0,89572.0,86102.0,143693.0,141851.0
mean,12.19403,23.22135,2.36092,5.46823,7.61118,40.03523,14.04343,18.66266,68.88083,51.53912,1017.64994,1015.25589,4.44746,4.50993,16.99063,21.68339
std,6.39849,7.11905,8.47806,4.1937,3.78548,13.60706,8.91538,8.8098,19.02916,20.7959,7.10653,7.03741,2.88716,2.72036,6.48875,6.93665
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.9,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7


In [5]:
temp.isna().mean().sort_values(ascending=False)


Sunshine        0.48010
Evaporation     0.43167
Cloud3pm        0.40807
Cloud9am        0.38422
Pressure9am     0.10357
Pressure3pm     0.10331
WindDir9am      0.07264
WindGustDir     0.07099
WindGustSpeed   0.07056
Humidity3pm     0.03098
WindDir3pm      0.02907
Temp3pm         0.02481
RainTomorrow    0.02246
Rainfall        0.02242
RainToday       0.02242
WindSpeed3pm    0.02105
Humidity9am     0.01825
Temp9am         0.01215
WindSpeed9am    0.01215
MinTemp         0.01021
MaxTemp         0.00867
Location        0.00000
Date            0.00000
dtype: float64

In [6]:
# Drop the rows where the target column "RainTomorrow" is missing.
temp = temp.dropna(subset=["RainTomorrow"])

In [7]:
# Encode the target as integers: "No" -> 0, "Yes" -> 1.
temp["RainTomorrow"] = temp["RainTomorrow"].replace({"No": 0, "Yes": 1}).astype("int")

In [8]:
# Work on a 10,000-row random subsample to keep the binning fast.
# The seed is fixed so that the test data (and therefore the fitted bins and
# the WOE values checked below) are reproducible between runs.
temp = temp.sample(10000, random_state=42)


In [9]:
# Build the DataSamples container from the sampled frame, with "Date" as the
# time column and "RainTomorrow" as the target.
# NOTE(review): the empty samples_split / bootstrap_split dicts presumably
# select the library defaults - the captured log reports a 0.7/0.3 train/Test
# split and 100 bootstrap samples; confirm against the vtb_mlkit docs.
ds = vtb_mlkit.scorekit.DataSamples(
    samples={"train": temp.copy()},
    time_column="Date",
    target="RainTomorrow",
    result_folder="rain_tomorrow_output",
    samples_split={},
    bootstrap_split={},
    cat_columns=[],
)


[INFO] [2023-07-21 12:22:27] ---------------------------------------------------------------- Creating DataSamples ----------------------------------------------------------------
[INFO] [2023-07-21 12:22:27] Selected 16 features: ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
[INFO] [2023-07-21 12:22:27] 100 bootstrap samples with 10000 observation each and 0.2268 mean target rate were created
[INFO] [2023-07-21 12:22:27] Actual parts of samples after samples split:
       train    Test
part 0.70000 0.30000
[INFO] [2023-07-21 12:22:27] DataSamples stats:
                                                 train                                       Test                             Bootstrap base
amount                                            7000                                       3000                              

## Проверка, что при round_woe=3 будет тип float32

In [10]:
# WOE binning configured with round_woe=3; the cells below check that the
# transformed WOE columns come out as float32.
binning = vtb_mlkit.scorekit.WOE(
    ds,  # DataSamples object for which the binnings are calculated
    features=None,  # list of features. If None, ds.features is used
    scorecard=None,  # path to an Excel file or a DataFrame with ready-made binnings to import
    round_digits=3,  # number of decimal places used to round the bin boundaries.
    # Rounding is checked against the share of migrating observations. If rounding would make a large share of observations migrate,
    # round_digits is increased until that share drops below rounding_migration_coef
    round_woe=3,  # number of decimal places used to round the WOE values
    rounding_migration_coef=0.005,  # maximum allowed share of observations migrating between bins because of rounding
    # ---Parameters for the WOE calculation---
    simple=True,  # if True, WOE is calculated on the train sample, otherwise the mean over the folds is taken
    n_folds=5,  # number of folds for the WOE calculation when simple=False
    woe_adjust=0.5,  # adjustment parameter for calculating EventRate_i in bin i
    alpha=0,  # regularization coefficient for the WOE calculation
    alpha_range=None,  # if alpha=None, the optimal alpha is searched for in alpha_range. If None, range(10, 100, 10) is used
    alpha_scoring="neg_log_loss",  # metric used to optimize alpha
    alpha_best_criterion="min",  # 'min' - minimize the alpha_scoring metric, 'max' - maximize it
    missing_process="max_or_separate",  # how missing values are handled:
    #     'separate' - put them into a separate bin
    #     'min' - merge with the bin that has the minimum WOE
    #     'max' - merge with the bin that has the maximum WOE
    #     'nearest' - merge with the bin that is nearest by WOE
    #     'min_or_separate' - if the share of missing values is below missing_min_part, merge with the minimum-WOE bin, otherwise put them into a separate bin
    #     'max_or_separate' - if the share of missing values is below missing_min_part, merge with the maximum-WOE bin, otherwise put them into a separate bin
    #     'nearest_or_separate' - if the share of missing values is below missing_min_part, merge with the nearest-by-WOE bin, otherwise put them into a separate bin
    missing_min_part=0.01,  # minimum share of missing values required for a separate bin when missing_process is 'min_or_separate', 'max_or_separate' or 'nearest_or_separate'
    others="missing_or_max",  # how values that did not fall into the binning are handled:
    #     'min': the other values get the minimum WOE
    #     'max': the other values get the maximum WOE
    #     'missing_or_min': if there is a bucket with missing values, the other values get its WOE, otherwise the minimum WOE
    #     'missing_or_max': if there is a bucket with missing values, the other values get its WOE, otherwise the maximum WOE
    #     float: absent values get the given fixed WOE
    opposite_sign_to_others=False,  # when a continuous feature has only one sign on the development sample,
    # treat all values of the opposite sign as others
)

In [11]:
# Run automatic binning with one explicitly specified parameter set
# (features=None - presumably all features of the DataSamples; confirm).
# NOTE(review): the parameter semantics are defined by vtb_mlkit and are not
# visible here. The captured log reports a parameter space of size 1, 16/16
# features binned and 16 cross features found.
binning.auto_fit(
    features=None,
    autofit_folder="auto_fit",
    plot_flag=-1,
    verbose=False,
    params_space=None,
    woe_best_samples=None,
    method="opt",
    max_n_bins=10,
    min_bin_size=0.05,
    criterion="entropy",
    scoring="neg_log_loss",
    max_depth=5,
    solver="cp",
    divergence="iv",
    WOEM_on=True,
    WOEM_woe_threshold=0.05,
    WOEM_with_missing=False,
    SM_on=False,
    SM_target_threshold=5,
    SM_size_threshold=100,
    BL_on=True,
    BL_allow_Vlogic_to_increase_gini=10,
    G_on=False,
    G_gini_threshold=5,
    G_with_test=False,
    G_gini_decrease_threshold=0.2,
    G_gini_increase_restrict=False,
    WOEO_on=True,
    WOEO_all_samples=False,
    cross_features_first_level=None,
    cross_num_second_level=1,
)

[INFO] [2023-07-21 12:22:41] ------------------------------------------------------------------------ SFA -------------------------------------------------------------------------
[INFO] [2023-07-21 12:22:41] Performing autobinning with parameters space of size 1...
[INFO] [2023-07-21 12:22:41] Using parameters set 1/1: {'method': 'opt', 'max_n_bins': 10, 'min_bin_size': 0.05, 'criterion': 'entropy', 'scoring': 'neg_log_loss', 'max_depth': 5, 'solver': 'cp', 'divergence': 'iv', 'WOEM_on': True, 'WOEM_woe_threshold': 0.05, 'WOEM_with_missing': False, 'SM_on': False, 'SM_target_threshold': 5, 'SM_size_threshold': 100, 'G_on': False, 'G_gini_threshold': 5, 'G_gini_decrease_threshold': 0.2, 'G_gini_increase_restrict': False, 'G_with_test': False, 'BL_on': True, 'BL_allow_Vlogic_to_increase_gini': 10, 'WOEO_on': True, 'WOEO_all_samples': False, 'verbose': False}
[INFO] [2023-07-21 12:22:41] Processing 16 features on 1 CPU...


100%|██████████| 16/16 [00:34<00:00,  2.17s/it]


[INFO] [2023-07-21 12:23:16] Performing autobinning for cross features with parameters space of size 1...
[INFO] [2023-07-21 12:23:16] Finding the best pairs to first-level features...


100%|██████████| 16/16 [00:01<00:00,  8.30it/s]

[INFO] [2023-07-21 12:23:18] Creating feature_crosses...



100%|██████████| 16/16 [00:09<00:00,  1.72it/s]

[INFO] [2023-07-21 12:23:27] Using parameters set 1/1: {'method': 'opt', 'max_n_bins': 10, 'min_bin_size': 0.05, 'criterion': 'entropy', 'scoring': 'neg_log_loss', 'max_depth': 5, 'solver': 'cp', 'divergence': 'iv', 'WOEM_on': True, 'WOEM_woe_threshold': 0.05, 'WOEM_with_missing': False, 'SM_on': False, 'SM_target_threshold': 5, 'SM_size_threshold': 100, 'G_on': False, 'G_gini_threshold': 5, 'G_gini_decrease_threshold': 0.2, 'G_gini_increase_restrict': False, 'G_with_test': False, 'BL_on': True, 'BL_allow_Vlogic_to_increase_gini': 10, 'WOEO_on': True, 'WOEO_all_samples': False, 'verbose': False}
[INFO] [2023-07-21 12:23:27] Processing 16 first level features on 1 CPU...



100%|██████████| 16/16 [02:52<00:00, 10.78s/it]


[INFO] [2023-07-21 12:26:30] Scorecard saved to the file rain_tomorrow_output/auto_fit_scorecard.xlsx
[INFO] [2023-07-21 12:26:30] All done! 16/16 features successfully binned. Found 16 cross features.


In [12]:
# Apply the fitted binning and take the transformed train sample.
ds_t = binning.transform(ds).samples["train"]

# Select the WOE columns by their "_WOE" suffix rather than by substring:
# the membership check further down recovers the feature name with
# `col.name[:-4]`, which is only correct for names that end in "_WOE".
woe_columns = [col for col in ds_t.columns if col.endswith("_WOE")]


In [13]:
ds_t[woe_columns]

Unnamed: 0,cross_MaxTemp&MinTemp_WOE,WindSpeed3pm_WOE,cross_WindSpeed3pm&Temp3pm_WOE,Pressure9am_WOE,cross_Rainfall&MaxTemp_WOE,Pressure3pm_WOE,Humidity3pm_WOE,WindGustSpeed_WOE,cross_Pressure9am&Pressure3pm_WOE,MaxTemp_WOE,cross_Humidity9am&MinTemp_WOE,cross_Humidity3pm&Temp9am_WOE,WindSpeed9am_WOE,cross_WindSpeed9am&Temp3pm_WOE,cross_Cloud3pm&MinTemp_WOE,cross_MinTemp&Temp3pm_WOE,Cloud3pm_WOE,cross_Temp9am&Pressure9am_WOE,cross_Evaporation&MaxTemp_WOE,Humidity9am_WOE,cross_Pressure3pm&Pressure9am_WOE,Evaporation_WOE,cross_Temp3pm&WindSpeed9am_WOE,Temp3pm_WOE,cross_WindGustSpeed&MaxTemp_WOE,Temp9am_WOE,cross_Sunshine&Temp9am_WOE,cross_Cloud9am&Temp9am_WOE,Sunshine_WOE,Cloud9am_WOE,Rainfall_WOE,MinTemp_WOE
101577,0.31800,0.10500,0.05200,-0.08700,0.46600,-0.15100,0.91100,0.09100,-0.05200,-0.07200,-0.55300,1.44700,0.10400,0.06800,0.67500,0.21100,0.80800,-0.08300,0.20300,-0.81500,-0.02300,0.04300,0.06200,-0.06000,0.35000,-0.01500,0.31700,-1.10900,0.07200,-1.21500,0.57200,-0.00400
131654,-1.42200,0.10500,-0.29900,-1.28600,-1.82800,-1.28200,-2.36900,0.56400,-1.45100,-0.69600,-0.79600,-2.04000,0.10400,-0.20900,0.16500,-0.73200,0.19600,-1.28300,-0.60200,-0.81500,-1.20900,-0.34500,-0.60700,-0.47200,-0.03500,-0.01500,-1.13400,0.14400,-1.81200,0.14400,-1.10400,-0.00400
143387,0.97900,0.10500,0.41400,-0.54800,-0.58000,-0.59700,-0.02900,0.09100,-0.48600,0.86500,-1.10800,0.14800,0.10400,0.22500,-0.18400,-0.32800,0.19600,-0.53900,1.06000,-0.34800,-0.41900,0.34200,0.17600,0.01400,1.69500,-0.01500,0.16400,-0.78300,0.00200,-0.38900,-0.70800,-0.44100
102334,1.37400,0.10500,0.05200,0.44400,0.46600,0.48500,-0.30700,0.56400,0.47000,-0.07200,0.87400,-0.30700,0.10400,0.06800,0.16500,0.21100,0.19600,0.46300,-0.21400,-0.34800,0.60300,-0.08700,0.06200,-0.06000,0.60800,-0.01500,-0.04200,0.57900,0.00200,0.50300,0.57200,-0.00400
51956,1.07100,0.10500,0.37700,-0.54800,0.74900,-1.28200,1.88900,0.09100,-1.19800,0.24900,-0.44500,1.97100,0.10400,0.36500,0.16500,0.94700,0.19600,-0.53900,0.47400,-0.34800,-1.20900,-0.08700,0.35500,0.32100,0.35000,-0.01500,-0.04200,0.14400,0.00200,0.14400,0.57200,-0.00400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97187,-0.58400,0.10500,0.05200,0.44400,0.46600,0.15700,0.91100,0.09100,0.47000,-0.07200,1.01800,0.75100,0.10400,0.06800,0.16500,-0.13300,0.19600,0.46300,0.00700,1.01400,0.44100,0.77400,0.06200,-0.06000,0.35000,-0.01500,0.31700,0.14400,0.07200,0.14400,0.57200,-0.00400
46736,-0.58400,0.10500,0.05200,1.00500,0.46600,0.48500,0.24400,0.56400,1.00400,-0.07200,-0.44500,0.17000,0.10400,0.06800,0.67500,-0.13300,0.80800,0.94500,0.20300,-0.34800,0.60300,0.04300,0.06200,-0.06000,0.60800,-0.01500,0.31700,-1.10900,0.07200,-1.21500,0.57200,-0.00400
70905,0.31800,0.10500,0.05200,-0.08700,0.46600,-0.59700,0.24400,0.56400,-0.60300,-0.07200,-0.79600,0.17000,0.10400,0.06800,-0.64500,-0.13300,-0.67900,-0.08300,-0.20700,-0.81500,-0.41900,-0.34500,0.06200,-0.06000,0.60800,-0.01500,-2.08600,-0.29400,-1.81200,-0.38900,0.36100,-0.00400
94925,0.57100,-0.34400,-0.06100,0.44400,0.74900,0.48500,0.24400,0.09100,0.47000,0.24900,1.01800,0.17000,0.10400,0.36500,1.45400,0.94700,1.35000,0.46300,0.27200,1.01400,0.60300,0.34200,0.35500,0.32100,0.35000,-0.01500,1.95200,1.07000,1.70800,0.98000,0.57200,-0.00400


In [14]:
ds_t[woe_columns].dtypes.unique()

array([dtype('float32')], dtype=object)

Тип соответствует float32, как и ожидалось

In [15]:
# Export the fitted scorecard into a variable; the preview below shows one
# row per feature bin together with its WOE value.
# NOTE(review): out=None presumably means "return the table without writing
# a file" - confirm against the vtb_mlkit docs.
scorecard = binning.export_scorecard(out=None, features=None, full=True, history=False)


In [16]:
scorecard.head(20)

Unnamed: 0,feature,categorical_type,group,values,woe,missing,n,n0,n1,target_rate,sample_part,n0_part,n1_part,iteration
0,MinTemp,,0,"[-inf, 4.35]",0.778,1,803.0,708.0,95.0,0.118,0.115,0.131,0.06,4.0
1,MinTemp,,1,"[4.35, 19.65]",-0.004,0,5229.0,4039.0,1190.0,0.228,0.747,0.746,0.749,4.0
2,MinTemp,,2,"[19.65, inf]",-0.441,0,968.0,665.0,303.0,0.313,0.138,0.123,0.191,4.0
3,MinTemp,,others,all others,0.778,0,,,,,,,,4.0
4,MaxTemp,,0,"[-inf, 15.25]",-0.696,0,902.0,568.0,334.0,0.37,0.129,0.105,0.21,3.0
5,MaxTemp,,1,"[15.25, 24.25]",-0.072,0,3114.0,2368.0,746.0,0.24,0.445,0.438,0.47,3.0
6,MaxTemp,,2,"[24.25, 33.45]",0.249,0,2371.0,1930.0,441.0,0.186,0.339,0.357,0.278,3.0
7,MaxTemp,,3,"[33.45, inf]",0.865,1,613.0,546.0,67.0,0.109,0.088,0.101,0.042,3.0
8,MaxTemp,,others,all others,0.865,0,,,,,,,,3.0
9,Rainfall,,-1,,-0.723,1,72.0,45.0,27.0,0.375,0.01,0.008,0.017,1.0


Переведем woe-значения из скор-карты в тип string, все woe-значения добавим в словарь, где ключом будет соответствующий столбец

In [43]:
# Stringify the scorecard WOE values, then collect the set of distinct WOE
# strings for each feature (feature name -> set of WOE strings).
scorecard["woe"] = scorecard["woe"].astype(str)
woe_values_round_3_dict = {
    feature: set(woe_values)
    for feature, woe_values in scorecard.groupby("feature")["woe"]
}


In [44]:
woe_values_round_3_dict

{'Cloud3pm': {'-0.284',
  '-0.679',
  '-1.485',
  '0.196',
  '0.197',
  '0.808',
  '1.35',
  '3.817'},
 'Cloud9am': {'-0.389', '-1.215', '0.06', '0.144', '0.503', '0.98', '2.38'},
 'Evaporation': {'-0.087', '-0.345', '0.043', '0.342', '0.774'},
 'Humidity3pm': {'-0.029',
  '-0.307',
  '-0.782',
  '-1.051',
  '-2.369',
  '0.244',
  '0.911',
  '1.325',
  '1.889'},
 'Humidity9am': {'-0.192', '-0.348', '-0.815', '0.36', '1.014'},
 'MaxTemp': {'-0.072', '-0.696', '0.249', '0.865'},
 'MinTemp': {'-0.004', '-0.441', '0.778'},
 'Pressure3pm': {'-0.151',
  '-0.597',
  '-1.282',
  '0.008',
  '0.157',
  '0.485',
  '0.987'},
 'Pressure9am': {'-0.087', '-0.548', '-1.286', '0.021', '0.444', '1.005'},
 'Rainfall': {'-0.31',
  '-0.708',
  '-0.723',
  '-1.104',
  '-1.787',
  '0.361',
  '0.572'},
 'Sunshine': {'-0.687',
  '-0.991',
  '-1.812',
  '0.002',
  '0.072',
  '0.637',
  '1.708'},
 'Temp3pm': {'-0.06', '-0.472', '-1.042', '0.014', '0.321', '0.952'},
 'Temp9am': {'-0.015', '0.266'},
 'WindGustSpee

Проверка, что все наблюдаемые woe-значения есть в скор-карте

In [66]:
# For every WOE column, the set of stringified values observed after the
# transform must be a subset of the scorecard WOE strings of its feature.
# `col.name[:-4]` drops the last four characters of the column name (the
# "_WOE" suffix) to obtain the dictionary key.
assert (
    ds_t[woe_columns]
    .astype("str")
    .apply(lambda col: set(col) <= woe_values_round_3_dict[col.name[:-4]])
).all()


## Проверка, что при round_woe=2 будет тип float16

In [67]:
# Same WOE configuration as the first binning, but with round_woe=2; the
# cells below check that the transformed WOE columns come out as float16.
binning_2 = vtb_mlkit.scorekit.WOE(
    ds,  # DataSamples object for which the binnings are calculated
    features=None,  # list of features. If None, ds.features is used
    scorecard=None,  # path to an Excel file or a DataFrame with ready-made binnings to import
    round_digits=3,  # number of decimal places used to round the bin boundaries.
    # Rounding is checked against the share of migrating observations. If rounding would make a large share of observations migrate,
    # round_digits is increased until that share drops below rounding_migration_coef
    round_woe=2,  # number of decimal places used to round the WOE values
    rounding_migration_coef=0.005,  # maximum allowed share of observations migrating between bins because of rounding
    # ---Parameters for the WOE calculation---
    simple=True,  # if True, WOE is calculated on the train sample, otherwise the mean over the folds is taken
    n_folds=5,  # number of folds for the WOE calculation when simple=False
    woe_adjust=0.5,  # adjustment parameter for calculating EventRate_i in bin i
    alpha=0,  # regularization coefficient for the WOE calculation
    alpha_range=None,  # if alpha=None, the optimal alpha is searched for in alpha_range. If None, range(10, 100, 10) is used
    alpha_scoring="neg_log_loss",  # metric used to optimize alpha
    alpha_best_criterion="min",  # 'min' - minimize the alpha_scoring metric, 'max' - maximize it
    missing_process="max_or_separate",  # how missing values are handled:
    #     'separate' - put them into a separate bin
    #     'min' - merge with the bin that has the minimum WOE
    #     'max' - merge with the bin that has the maximum WOE
    #     'nearest' - merge with the bin that is nearest by WOE
    #     'min_or_separate' - if the share of missing values is below missing_min_part, merge with the minimum-WOE bin, otherwise put them into a separate bin
    #     'max_or_separate' - if the share of missing values is below missing_min_part, merge with the maximum-WOE bin, otherwise put them into a separate bin
    #     'nearest_or_separate' - if the share of missing values is below missing_min_part, merge with the nearest-by-WOE bin, otherwise put them into a separate bin
    missing_min_part=0.01,  # minimum share of missing values required for a separate bin when missing_process is 'min_or_separate', 'max_or_separate' or 'nearest_or_separate'
    others="missing_or_max",  # how values that did not fall into the binning are handled:
    #     'min': the other values get the minimum WOE
    #     'max': the other values get the maximum WOE
    #     'missing_or_min': if there is a bucket with missing values, the other values get its WOE, otherwise the minimum WOE
    #     'missing_or_max': if there is a bucket with missing values, the other values get its WOE, otherwise the maximum WOE
    #     float: absent values get the given fixed WOE
    opposite_sign_to_others=False,  # when a continuous feature has only one sign on the development sample,
    # treat all values of the opposite sign as others
)

In [68]:
# Run automatic binning for the round_woe=2 configuration with the same
# explicit parameter set as the first run.
# NOTE(review): autofit_folder is the same as in the first run, and the
# captured log shows the scorecard saved to the same path
# (rain_tomorrow_output/auto_fit_scorecard.xlsx), so the earlier file is
# presumably overwritten - confirm this is intended.
binning_2.auto_fit(
    features=None,
    autofit_folder="auto_fit",
    plot_flag=-1,
    verbose=False,
    params_space=None,
    woe_best_samples=None,
    method="opt",
    max_n_bins=10,
    min_bin_size=0.05,
    criterion="entropy",
    scoring="neg_log_loss",
    max_depth=5,
    solver="cp",
    divergence="iv",
    WOEM_on=True,
    WOEM_woe_threshold=0.05,
    WOEM_with_missing=False,
    SM_on=False,
    SM_target_threshold=5,
    SM_size_threshold=100,
    BL_on=True,
    BL_allow_Vlogic_to_increase_gini=10,
    G_on=False,
    G_gini_threshold=5,
    G_with_test=False,
    G_gini_decrease_threshold=0.2,
    G_gini_increase_restrict=False,
    WOEO_on=True,
    WOEO_all_samples=False,
    cross_features_first_level=None,
    cross_num_second_level=1,
)

[INFO] [2023-07-21 13:47:03] ------------------------------------------------------------------------ SFA -------------------------------------------------------------------------
[INFO] [2023-07-21 13:47:03] Performing autobinning with parameters space of size 1...
[INFO] [2023-07-21 13:47:03] Using parameters set 1/1: {'method': 'opt', 'max_n_bins': 10, 'min_bin_size': 0.05, 'criterion': 'entropy', 'scoring': 'neg_log_loss', 'max_depth': 5, 'solver': 'cp', 'divergence': 'iv', 'WOEM_on': True, 'WOEM_woe_threshold': 0.05, 'WOEM_with_missing': False, 'SM_on': False, 'SM_target_threshold': 5, 'SM_size_threshold': 100, 'G_on': False, 'G_gini_threshold': 5, 'G_gini_decrease_threshold': 0.2, 'G_gini_increase_restrict': False, 'G_with_test': False, 'BL_on': True, 'BL_allow_Vlogic_to_increase_gini': 10, 'WOEO_on': True, 'WOEO_all_samples': False, 'verbose': False}
[INFO] [2023-07-21 13:47:03] Processing 16 features on 1 CPU...


100%|██████████| 16/16 [00:39<00:00,  2.44s/it]


[INFO] [2023-07-21 13:47:42] Performing autobinning for cross features with parameters space of size 1...
[INFO] [2023-07-21 13:47:42] Finding the best pairs to first-level features...


100%|██████████| 16/16 [00:02<00:00,  7.49it/s]

[INFO] [2023-07-21 13:47:44] Creating feature_crosses...



100%|██████████| 16/16 [00:09<00:00,  1.61it/s]

[INFO] [2023-07-21 13:47:54] Using parameters set 1/1: {'method': 'opt', 'max_n_bins': 10, 'min_bin_size': 0.05, 'criterion': 'entropy', 'scoring': 'neg_log_loss', 'max_depth': 5, 'solver': 'cp', 'divergence': 'iv', 'WOEM_on': True, 'WOEM_woe_threshold': 0.05, 'WOEM_with_missing': False, 'SM_on': False, 'SM_target_threshold': 5, 'SM_size_threshold': 100, 'G_on': False, 'G_gini_threshold': 5, 'G_gini_decrease_threshold': 0.2, 'G_gini_increase_restrict': False, 'G_with_test': False, 'BL_on': True, 'BL_allow_Vlogic_to_increase_gini': 10, 'WOEO_on': True, 'WOEO_all_samples': False, 'verbose': False}
[INFO] [2023-07-21 13:47:54] Processing 16 first level features on 1 CPU...



100%|██████████| 16/16 [03:16<00:00, 12.30s/it]


[INFO] [2023-07-21 13:51:24] Scorecard saved to the file rain_tomorrow_output/auto_fit_scorecard.xlsx
[INFO] [2023-07-21 13:51:24] All done! 16/16 features successfully binned. Found 16 cross features.


In [69]:
# Apply the round_woe=2 binning and take the transformed train sample.
ds_t2 = binning_2.transform(ds).samples["train"]


In [70]:
# Select the WOE columns by their "_WOE" suffix rather than by substring:
# the membership check further down recovers the feature name with
# `col.name[:-4]`, which is only correct for names that end in "_WOE".
woe_columns_2 = [col for col in ds_t2.columns if col.endswith("_WOE")]


In [71]:
ds_t2[woe_columns_2].astype("str")


Unnamed: 0,cross_MaxTemp&MinTemp_WOE,WindSpeed3pm_WOE,cross_WindSpeed3pm&Temp3pm_WOE,Pressure9am_WOE,cross_Rainfall&MaxTemp_WOE,Pressure3pm_WOE,Humidity3pm_WOE,WindGustSpeed_WOE,cross_Pressure9am&Pressure3pm_WOE,MaxTemp_WOE,cross_Humidity9am&MinTemp_WOE,cross_Humidity3pm&Temp9am_WOE,WindSpeed9am_WOE,cross_WindSpeed9am&Temp3pm_WOE,cross_Cloud3pm&MinTemp_WOE,cross_MinTemp&Temp3pm_WOE,Cloud3pm_WOE,cross_Temp9am&Pressure9am_WOE,cross_Evaporation&MaxTemp_WOE,Humidity9am_WOE,cross_Pressure3pm&Pressure9am_WOE,Evaporation_WOE,cross_Temp3pm&WindSpeed9am_WOE,Temp3pm_WOE,cross_WindGustSpeed&MaxTemp_WOE,Temp9am_WOE,cross_Sunshine&Temp9am_WOE,cross_Cloud9am&Temp9am_WOE,Sunshine_WOE,Cloud9am_WOE,Rainfall_WOE,MinTemp_WOE
101577,0.32,0.11,0.05,-0.09,0.47,-0.15,0.91,0.09,-0.05,-0.07,-0.55,1.45,0.1,0.07,0.68,0.21,0.81,-0.08,0.2,-0.81,-0.02,0.04,0.06,-0.06,0.35,-0.02,0.32,-1.11,0.07,-1.22,0.57,-0.0
131654,-1.42,0.11,-0.3,-1.29,-1.83,-1.28,-2.37,0.56,-1.45,-0.7,-0.8,-2.04,0.1,-0.21,0.17,-0.73,0.2,-1.28,-0.6,-0.81,-1.21,-0.34,-0.61,-0.47,-0.04,-0.02,-1.13,0.14,-1.81,0.14,-1.1,-0.0
143387,0.98,0.11,0.41,-0.55,-0.58,-0.6,-0.03,0.09,-0.49,0.87,-1.11,0.15,0.1,0.23,-0.18,-0.33,0.2,-0.54,1.06,-0.35,-0.42,0.34,0.18,0.01,1.7,-0.02,0.16,-0.78,0.0,-0.39,-0.71,-0.44
102334,1.37,0.11,0.05,0.44,0.47,0.48,-0.31,0.56,0.47,-0.07,0.87,-0.31,0.1,0.07,0.17,0.21,0.2,0.46,-0.21,-0.35,0.6,-0.09,0.06,-0.06,0.61,-0.02,-0.04,0.58,0.0,0.5,0.57,-0.0
51956,1.07,0.11,0.38,-0.55,0.75,-1.28,1.89,0.09,-1.2,0.25,-0.44,1.97,0.1,0.37,0.17,0.95,0.2,-0.54,0.47,-0.35,-1.21,-0.09,0.36,0.32,0.35,-0.02,-0.04,0.14,0.0,0.14,0.57,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97187,-0.58,0.11,0.05,0.44,0.47,0.16,0.91,0.09,0.47,-0.07,1.02,0.75,0.1,0.07,0.17,-0.13,0.2,0.46,0.01,1.01,0.44,0.77,0.06,-0.06,0.35,-0.02,0.32,0.14,0.07,0.14,0.57,-0.0
46736,-0.58,0.11,0.05,1.0,0.47,0.48,0.24,0.56,1.0,-0.07,-0.44,0.17,0.1,0.07,0.68,-0.13,0.81,0.95,0.2,-0.35,0.6,0.04,0.06,-0.06,0.61,-0.02,0.32,-1.11,0.07,-1.22,0.57,-0.0
70905,0.32,0.11,0.05,-0.09,0.47,-0.6,0.24,0.56,-0.6,-0.07,-0.8,0.17,0.1,0.07,-0.64,-0.13,-0.68,-0.08,-0.21,-0.81,-0.42,-0.34,0.06,-0.06,0.61,-0.02,-2.09,-0.29,-1.81,-0.39,0.36,-0.0
94925,0.57,-0.34,-0.06,0.44,0.75,0.48,0.24,0.09,0.47,0.25,1.02,0.17,0.1,0.37,1.45,0.95,1.35,0.46,0.27,1.01,0.6,0.34,0.36,0.32,0.35,-0.02,1.95,1.07,1.71,0.98,0.57,-0.0


In [72]:
ds_t2[woe_columns_2].dtypes.unique()

array([dtype('float16')], dtype=object)

In [73]:
# Export the scorecard of the round_woe=2 binning into a variable.
# NOTE(review): out=None presumably means "return the table without writing
# a file" - confirm against the vtb_mlkit docs.
scorecard2 = binning_2.export_scorecard(
    out=None, features=None, full=True, history=False
)


In [74]:
# Stringify the scorecard WOE values, then collect the set of distinct WOE
# strings for each feature (feature name -> set of WOE strings).
scorecard2["woe"] = scorecard2["woe"].astype(str)
woe_values_round_2_dict = {
    feature: set(woe_values)
    for feature, woe_values in scorecard2.groupby("feature")["woe"]
}


In [75]:
woe_values_round_2_dict

{'Cloud3pm': {'-0.28', '-0.68', '-1.49', '0.2', '0.81', '1.35', '3.82'},
 'Cloud9am': {'-0.39', '-1.22', '0.06', '0.14', '0.5', '0.98', '2.38'},
 'Evaporation': {'-0.09', '-0.34', '0.04', '0.34', '0.77'},
 'Humidity3pm': {'-0.03',
  '-0.31',
  '-0.78',
  '-1.05',
  '-2.37',
  '0.24',
  '0.91',
  '1.33',
  '1.89'},
 'Humidity9am': {'-0.19', '-0.35', '-0.81', '0.36', '1.01'},
 'MaxTemp': {'-0.07', '-0.7', '0.25', '0.87'},
 'MinTemp': {'-0.0', '-0.44', '0.78'},
 'Pressure3pm': {'-0.15', '-0.6', '-1.28', '0.01', '0.16', '0.48', '0.99'},
 'Pressure9am': {'-0.09', '-0.55', '-1.29', '0.02', '0.44', '1.0'},
 'Rainfall': {'-0.31', '-0.71', '-0.72', '-1.1', '-1.79', '0.36', '0.57'},
 'Sunshine': {'-0.69', '-0.99', '-1.81', '0.0', '0.07', '0.64', '1.71'},
 'Temp3pm': {'-0.06', '-0.47', '-1.04', '0.01', '0.32', '0.95'},
 'Temp9am': {'-0.02', '0.27'},
 'WindGustSpeed': {'-0.11', '-0.42', '-0.88', '-1.23', '0.09', '0.56'},
 'WindSpeed3pm': {'-0.34', '-0.65', '0.11'},
 'WindSpeed9am': {'-0.09', '-0.5

In [76]:
# For every WOE column of the round_woe=2 transform, the set of stringified
# observed values must be a subset of the scorecard WOE strings of its
# feature. The columns are taken from `woe_columns_2`, the list built from
# `ds_t2` itself, not from the list built for the first binning.
# `col.name[:-4]` drops the trailing "_WOE" to obtain the dictionary key.
assert (
    ds_t2[woe_columns_2]
    .astype("str")
    .apply(lambda col: set(col) <= woe_values_round_2_dict[col.name[:-4]])
).all()
