In [1]:
import optbinning
import numpy as np
from pandas import DataFrame, Series, to_datetime

def random_dates(start, end, n=10):

    start = to_datetime(start)
    end = to_datetime(end)
    start_u = start.value//10**9
    end_u = end.value//10**9

    return to_datetime(np.random.randint(start_u, end_u, n), unit='s')

# random data
df = DataFrame({
    'date': random_dates('2020-01-01', '2023-06-01', n=1000),
    'id': np.random.choice(['a', 'b', 'c'], size=1000),
    'num1': np.random.normal(loc=100, scale=10, size=1000),
    'num2': np.random.normal(loc=100, scale=10, size=1000),
    'num3': np.random.normal(loc=100, scale=10, size=1000),
    'num4': np.random.normal(loc=100, scale=10, size=1000),
    'num5': np.random.normal(loc=100, scale=10, size=1000),
    'cat': np.random.choice(['1', '2', '3'], size=1000),
    'target': np.random.choice([0, 1], p=[0.85, 0.15], size=1000)
})

(CVXPY) May 01 01:28:42 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.6.2534). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) May 01 01:28:42 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.6.2534). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')


In [153]:
df.isna().sum()

date      0
id        0
num1      0
num2      0
num3      0
num4      0
num5      0
cat       0
target    0
dtype: int64

In [4]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler 

sampler = RandomOverSampler()
X = df.drop(['date', 'id', 'target'], axis=1)
y = df[['target']]

X_resample, y_resample = sampler.fit_resample(X, y)

In [5]:
from pandas import concat

concat([X_resample, y_resample], axis=1)

Unnamed: 0,num1,num2,num3,num4,num5,cat,target
0,120.500648,93.967088,95.171604,104.456628,97.526229,2,1
1,88.052175,106.227801,91.090535,110.405582,98.284713,1,0
2,95.171007,89.980837,125.059743,98.070855,80.034791,2,0
3,79.659561,100.981797,91.451116,106.985178,94.078506,1,1
4,100.378915,108.800250,111.512297,94.392485,114.629672,2,0
...,...,...,...,...,...,...,...
1691,103.137564,110.842081,100.837295,89.253919,117.570541,1,1
1692,88.148905,110.825553,72.924125,118.974806,101.476731,3,1
1693,99.216231,81.517102,88.352876,88.118607,83.533430,3,1
1694,102.566477,110.790237,95.622396,103.575648,81.396106,1,1


In [155]:
from pandas import concat
from optbinning import OptimalBinning

# get dtypes
KEYS = ['date', 'id']
TARGET_COL = 'target'
var_types = {}
binning_tables = DataFrame(columns=['variable', 'Bin', 'Count (%)', 'Non-event', 'Event', 'WoE', 'IV',' JS'])

for (col, dtype) in zip(df.drop(KEYS+[TARGET_COL], axis=1).columns, df.drop(KEYS+[TARGET_COL], axis=1).dtypes):
    if dtype not in ('object','str'):
        var_types[col] = 'numerical'
    else:
        var_types[col] = 'categorical'

print(var_types)
for variable, dtype in var_types.items():


    optb = OptimalBinning(name=variable, dtype=dtype, solver="cp",
                      monotonic_trend="auto", max_n_prebins=100,
                      min_prebin_size=0.001, time_limit=200, )


    values = df[variable].values
    target = df.target.values
    optb.fit(values, target)

    binning_table = DataFrame(optb.binning_table.build()).assign(variable=variable)

    binning_tables = concat([binning_tables, binning_table], axis=0, ignore_index=True)
    df[variable] = optb.transform(values, metric='woe')

{'num1': 'numerical', 'num2': 'numerical', 'num3': 'numerical', 'num4': 'numerical', 'num5': 'numerical', 'cat': 'categorical'}


In [156]:
binning_tables

Unnamed: 0,variable,Bin,Count (%),Non-event,Event,WoE,IV,JS,Count,Event rate,JS.1
0,num1,"(-inf, 85.86)",0.074,61,13,-0.157642,0.001941,,74.0,0.175676,0.000242
1,num1,"[85.86, 91.06)",0.104,89,15,0.077019,0.000601,,104.0,0.144231,7.5e-05
2,num1,"[91.06, 95.37)",0.154,133,21,0.14226,0.002966,,154.0,0.136364,0.00037
3,num1,"[95.37, 96.55)",0.045,43,2,1.364486,0.051633,,45.0,0.044444,0.005996
4,num1,"[96.55, 98.51)",0.071,62,9,0.226343,0.00336,,71.0,0.126761,0.000419
5,num1,"[98.51, 105.04)",0.244,205,39,-0.044118,0.000482,,244.0,0.159836,6e-05
6,num1,"[105.04, 108.87)",0.126,104,22,-0.150218,0.002993,,126.0,0.174603,0.000374
7,num1,"[108.87, inf)",0.182,149,33,-0.196128,0.007485,,182.0,0.181319,0.000934
8,num1,Special,0.0,0,0,0.0,0.0,,0.0,0.0,0.0
9,num1,Missing,0.0,0,0,0.0,0.0,,0.0,0.0,0.0


In [157]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import InstanceHardnessThreshold

def undersample_features(train, val, target_col_name='target', keys=['date', 'id'], undersample_params={}):
    
    exclude_cols = keys + [target_col_name]
    undersampler = InstanceHardnessThreshold(**undersample_params) 
    train_resample = undersampler.fit_resample(train.drop(exclude_cols, axis=1), train[target_col_name])



    return undersampler,train_resample


In [158]:
train = df
val = df

In [159]:
params = {
    'sampling_strategy': 'auto',
    'cv': 5,
}

undersampler, train_resample = undersample_features(train, val, undersample_params=params)

In [160]:
undersampler.resample

AttributeError: 'InstanceHardnessThreshold' object has no attribute 'resample'

In [161]:
from sklearn.feature_selection import VarianceThreshold

In [162]:
v = VarianceThreshold(threshold=0.001)

v_fit = v.fit(train.drop(['target', 'date', 'id'], axis=1), train[['target']].values)
v_fit

In [163]:
from sklearn.feature_selection import SelectKBest

In [164]:
fs = SelectKBest(k=3)

In [165]:
fs.fit(train.drop(['target', 'date', 'id'], axis=1), train[['target']].values)

  y = column_or_1d(y, warn=True)


In [170]:
fs.transform(train.drop(['target', 'date', 'id'], axis=1))

array([[-0.1249005 , -0.15491374,  0.18097445],
       [-0.06041779, -0.09412884, -0.12922016],
       [-0.06041779, -0.15491374, -0.12922016],
       ...,
       [-0.06041779, -0.15491374,  0.67906104],
       [-0.12311638, -0.09412884, -0.12922016],
       [-0.06041779, -0.15491374,  0.25652803]])

In [172]:
fs.get_feature_names_out()

array(['num3', 'num4', 'num5'], dtype=object)