In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score
from scipy.stats import chi2_contingency
from scipy.sparse import csc_matrix

### Objective Function

##### Maximize IV value of the bins

IV = ∑((Good% - Bad%) * ln(Good% / Bad%)), where Good% and Bad% are the proportions of non-default and default observations, respectively, in each bin.

##### Constraints:

- The default rate per bin should be monotonically increasing or decreasing.
- The bins should be statistically different (determined by the Chi-square test or another suitable hypothesis test with a chosen significance level).
- The minimum number of observations per bin should be met.
- The minimum and maximum number of bins should be within the specified range.

##### Variables:

- Bins as a set of ordinal categories.
- Default rate: The proportion of default observations (default_flag = 1) in each bin.

A similar problem is solved here: https://github.com/guillermo-navas-palencia/optbinning/blob/master/optbinning/binning/cp.py

**ChatGPT prompt one could use**

Create CP model using ortools that solves the optimisation problem, where an input is a pandas dataframe consisting of a discrete explanatory variable with 50 bins and a binary target variable (default or not a default). The model should combine adjacent bins that maximise IV given that all constraints are satisified. IV must be calculated for every proposed combination of existing bins from stratch. The model should return a mapping table where all 50 bins are mapped to new categories. Focus onmake the code run.

##### Maximize IV value of the bins

IV = ∑((Good% - Bad%) * ln(Good% / Bad%)), where Good% and Bad% are the proportions of non-default and default observations, respectively, in each bin.

##### Constraints:

- The event rate (default rate) per bin should be monotonically increasing or decreasing.
- The bins should be statistically different (determined by the Chi-square test or another suitable hypothesis test with a chosen significance level).
- The minimum number of observations per bin should be met.
- The minimum and maximum number of bins should be within the specified range.

##### Variables:

- Bins as a set of ordinal categories.
- Default rate: The proportion of default observations (default_flag = 1) in each bin.


##### Helper functions:

from ortools.sat.python import cp_model

def calculate_iv(df, binned_column, target_column):
    bin_summary = df.groupby(binned_column)[target_column].agg(['count', 'sum'])
    bin_summary['non_target'] = bin_summary['count'] - bin_summary['sum']
    
    bin_summary['target_dist'] = bin_summary['sum'] / bin_summary['sum'].sum()
    bin_summary['non_target_dist'] = bin_summary['non_target'] / bin_summary['non_target'].sum()
    
    bin_summary['woe'] = np.log(bin_summary['target_dist'] / bin_summary['non_target_dist'])
    bin_summary['iv'] = (bin_summary['target_dist'] - bin_summary['non_target_dist']) * bin_summary['woe']
    
    return bin_summary['iv'].sum()


def check_monotonic_constraint(df, binned_column, target_column):
    # Calculate the default rate for each bin
    default_rate = df.groupby(binned_column)[target_column].mean()

    # Check if the difference between adjacent bins' default rates is either positive or negative
    diff = np.diff(default_rate)

    # Check if the constraint is satisfied (either all positive or all negative differences)
    is_monotonic = np.all(diff >= 0) or np.all(diff <= 0)

    return is_monotonic


def calculate_statistical_difference(df, binned_column, target_column, significance_level=0.05):
    bin_pairs_p_values = []
    
    unique_bins = df[binned_column].unique()
    for i in range(len(unique_bins) - 1):
        bin1 = unique_bins[i]
        bin2 = unique_bins[i + 1]
        
        contingency_table = pd.crosstab(df[df[binned_column].isin([bin1, bin2])][binned_column], df[target_column])
        _, p, _, _ = chi2_contingency(contingency_table)
        
        bin_pairs_p_values.append((bin1, bin2, p))
    
    return pd.DataFrame(bin_pairs_p_values, columns=['Bin1', 'Bin2', 'P-value'])

### A. Create test data

In [26]:
np.random.seed(42)

def generate_data(n):
    years = np.random.choice([2016, 2017, 2018, 2019, 2020, 2021], n)
    economic_cycle = np.random.rand(n)
    explanatory_variable = np.random.rand(n) * 10

    # Add noise to the relationship between explanatory_variable and economic_cycle
    noise = np.random.normal(0, 0.1, n)

    # Define a non-trivial and random relationship between target and explanatory variable
    z = (explanatory_variable / 10) * economic_cycle + noise
    target_proba = 1 / (1 + np.exp(-z))  # Apply logistic function
    target_variable = (target_proba > 0.5).astype(int)

    data = {
        'explanatory_variable': explanatory_variable,
        'target_variable': target_variable,
        'year': years,
        'economic_cycle': economic_cycle
    }

    return pd.DataFrame(data)

# Choose the number of observations and number of bins
n = 200_000 
num_bins = 50

# Create dataset and digitize outcome
df = generate_data(n)
df['binned_column'] = pd.qcut(df['explanatory_variable'], num_bins, labels=False)

print(df.head())

   explanatory_variable  target_variable  year  economic_cycle  binned_column
0              6.678090                0  2019        0.140130             33
1              8.521948                1  2020        0.914430             42
2              4.038421                1  2018        0.783868             20
3              3.281745                1  2020        0.119363             16
4              8.264844                1  2020        0.568544             41


In [14]:
np.max(df['explanatory_variable'])

9.999969099974354

### B. Ancilliary functions

In [3]:
def calculate_iv(df, binned_column, target_column):
    bin_summary = df.groupby(binned_column)[target_column].agg(['count', 'sum'])
    bin_summary['non_target'] = bin_summary['count'] - bin_summary['sum']
    
    bin_summary['target_dist'] = bin_summary['sum'] / bin_summary['sum'].sum()
    bin_summary['non_target_dist'] = bin_summary['non_target'] / bin_summary['non_target'].sum()
    
    bin_summary['woe'] = np.log(bin_summary['target_dist'] / bin_summary['non_target_dist'])
    bin_summary['iv'] = (bin_summary['target_dist'] - bin_summary['non_target_dist']) * bin_summary['woe']
    
    return bin_summary['iv'].sum()


def check_monotonic_constraint(df, binned_column, target_column):
    # Calculate the default rate for each bin
    default_rate = df.groupby(binned_column)[target_column].mean()

    # Check if the difference between adjacent bins' default rates is either positive or negative
    diff = np.diff(default_rate)

    # Check if the constraint is satisfied (either all positive or all negative differences)
    is_monotonic = np.all(diff >= 0) or np.all(diff <= 0)

    return is_monotonic


def calculate_statistical_difference(df, binned_column, target_column, significance_level=0.05):
    bin_pairs_p_values = []
    
    unique_bins = df[binned_column].unique()
    for i in range(len(unique_bins) - 1):
        bin1 = unique_bins[i]
        bin2 = unique_bins[i + 1]
        
        contingency_table = pd.crosstab(df[df[binned_column].isin([bin1, bin2])][binned_column], df[target_column])
        _, p, _, _ = chi2_contingency(contingency_table)
        
        bin_pairs_p_values.append((bin1, bin2, p))
    
    return pd.DataFrame(bin_pairs_p_values, columns=['Bin1', 'Bin2', 'P-value'])

### C. Implement using OR-tools

**This is the gpt-generated output. The task is to fix it.**

In [28]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from ortools.sat.python import cp_model


from ortools.sat.python import cp_model

# Define a custom constraint callback function
def merged_bins_satisfy_constraints_callback(df, binned_column, target_column, significance_level):
    def callback(vars):
        merged_bins = {i: vars[i].Value() for i in range(len(vars))}
        return merged_bins_satisfy_constraints(df, binned_column, target_column, merged_bins, significance_level)
    return callback

def optimize_bins(df, binned_column, target_column, min_bins, max_bins, min_observations_per_bin, significance_level=0.05):
    model = cp_model.CpModel()

    # Create a variable for each bin and its corresponding merged bin
    bin_vars = {i: model.NewIntVar(0, max_bins - 1, f"bin_{i}") for i in range(50)}

    # Add constraint: minimum number of observations per bin
    for i in range(50):
        model.Add(sum(df[binned_column] == i) >= min_observations_per_bin)

    # Add custom constraint for monotonic and statistical difference using a callback function
    model.AddConstraint(merged_bins_satisfy_constraints_callback(df, binned_column, target_column, significance_level), list(bin_vars.values()))

    # Define the objective function (IV value)
    iv_expression = []
    for i in range(50):
        df_temp = df[df[binned_column] == i]
        iv_value = calculate_iv(df_temp, binned_column, target_column)
        iv_expression.append(iv_value * bin_vars[i])

    model.Maximize(sum(iv_expression))

    # Solve the model
    solver = cp_model.CpSolver()
    status = solver.Solve(model)

    # Return the mapping table
    if status == cp_model.OPTIMAL:
        new_categories = {i: solver.Value(bin_vars[i]) for i in range(50)}
        return new_categories
    else:
        return None

# Example usage
min_bins = 10
max_bins = 20
min_observations_per_bin = 100
significance_level = 0.05

new_categories = optimize_bins(df, 'binned_column', 'target_column', min_bins, max_bins, min_observations_per_bin, significance_level)

new_categories

AttributeError: 'CpModel' object has no attribute 'AddConstraint'

In [None]:
# Inspect the monotinicity of the solution
df.groupby(['bin']).agg({'target_variable':['mean','count']})

In [None]:
xxxxxxxxxx

### Appendices

To determine whether adjacent bins are statistically different, you can perform a hypothesis test, such as the Chi-square test for independence. 

Here's a step-by-step process for doing this:
- Create a contingency table for the adjacent bins, showing the frequency distribution of the binary default flag (0 and 1) for each pair of adjacent bins.
- Calculate the expected frequency for each cell in the contingency table under the assumption that the bins are independent. To do this, multiply the row total (sum of binary default flags in a bin) by the column total (sum of observations in the two adjacent bins) and divide by the total number of observations.
- Calculate the Chi-square statistic. For each cell, find the difference between the observed frequency and the expected frequency, square the result, and divide by the expected frequency. Add up all these values to get the Chi-square statistic.
- Determine the degrees of freedom for the Chi-square test. For a 2x2 contingency table (2 rows, one for each binary default flag, and 2 columns, one for each adjacent bin), the degrees of freedom are (number of rows - 1) * (number of columns - 1) = 1.
- Determine the critical value and the p-value for the calculated Chi-square statistic using the Chi-square distribution table or a statistical software with the corresponding degrees of freedom.
- Set a significance level (commonly 0.05). If the p-value is less than the chosen significance level, reject the null hypothesis that the adjacent bins are independent, which means there is a statistically significant difference between the bins. If the p-value is greater than the chosen significance level, you cannot reject the null hypothesis, and there is no evidence to suggest a statistically significant difference between the bins.

Repeat these steps for each pair of adjacent bins to determine whether there is a statistically significant difference between them.

In [8]:
import pandas as pd
from scipy.stats import chi2_contingency

# Sample data
data = {
    'bin': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D', 'E', 'E', 'E'],
    'default_flag': [0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1]
}

df = pd.DataFrame(data)

# Perform Chi-square test for independence on adjacent bins
significance_level = 0.05

for i in range(len(df['bin'].unique()) - 1):
    bin1 = df['bin'].unique()[i]
    bin2 = df['bin'].unique()[i + 1]
    contingency_table = pd.crosstab(df[df['bin'].isin([bin1, bin2])]['bin'], df['default_flag'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    
    print(f"Chi-square test for {bin1} and {bin2}:")
    print(f"Chi2 = {chi2:.4f}, p-value = {p:.4f}")
    
    if p < significance_level:
        print(f"The difference between bins {bin1} and {bin2} is statistically significant.\n")
    else:
        print(f"There is no evidence to suggest a statistically significant difference between bins {bin1} and {bin2}.\n")


Chi-square test for A and B:
Chi2 = 0.0000, p-value = 1.0000
There is no evidence to suggest a statistically significant difference between bins A and B.

Chi-square test for B and C:
Chi2 = 0.0000, p-value = 1.0000
There is no evidence to suggest a statistically significant difference between bins B and C.

Chi-square test for C and D:
Chi2 = 0.0000, p-value = 1.0000
There is no evidence to suggest a statistically significant difference between bins C and D.

Chi-square test for D and E:
Chi2 = 0.0000, p-value = 1.0000
There is no evidence to suggest a statistically significant difference between bins D and E.



**This is a collection of related, but not working, code**

In [None]:
# Initialize model
model = cp_model.CpModel()

# Decision variables
x, y, t, d, u, bin_size_diff = self.decision_variables(model, n)

# Objective function
total_records = int(n_records.sum())
regularization = int(np.ceil(M * self.gamma / total_records))
pmax = model.NewIntVar(0, total_records, "pmax")
pmin = model.NewIntVar(0, total_records, "pmin")

model.Maximize(sum([(V[i][i] * x[i, i]) +
               sum([(V[i][j] - V[i][j+1]) * x[i, j]
                    for j in range(i)]) for i in range(n)]) -
               regularization * (pmax - pmin))

# Constraint: unique assignment
self.add_constraint_unique_assignment(model, n, x)

# Constraint: min / max bins
self.add_constraint_min_max_bins(model, n, x, d)

# Constraint: min / max bin size
self.add_constraint_min_max_bin_size(model, n, x, u, n_records,
                                     bin_size_diff)


In [None]:
def model_data(divergence, n_nonevent, n_event, max_pvalue, max_pvalue_policy,
               min_event_rate_diff, scale=None, return_nonevent_event=False):
    n = len(n_nonevent)

    t_n_event = n_event.sum()
    t_n_nonevent = n_nonevent.sum()

    D = []
    V = []

    E = []
    NE = []

    for i in range(1, n + 1):
        s_event = n_event[:i][::-1].cumsum()[::-1]
        s_nonevent = n_nonevent[:i][::-1].cumsum()[::-1]
        rate = s_event / (s_nonevent + s_event)

        p = s_event / t_n_event
        q = s_nonevent / t_n_nonevent

        if divergence == "iv":
            iv = jeffrey(p, q)
        elif divergence == "js":
            iv = jensen_shannon(p, q)
        elif divergence == "hellinger":
            iv = hellinger(p, q)
        elif divergence == "triangular":
            iv = triangular(p, q)

        if scale is not None:
            rate *= scale
            iv *= scale

            D.append(rate.astype(np.int64))
            V.append(iv.astype(np.int64))
        else:
            D.append(rate)
            V.append(iv)

        if max_pvalue is not None or return_nonevent_event:
            E.append(s_event)
            NE.append(s_nonevent)

    if max_pvalue is not None:
        pvalue_violation_indices = find_pvalue_violation_indices(
            n, E, NE, max_pvalue, max_pvalue_policy)
    else:
        pvalue_violation_indices = []

    if min_event_rate_diff > 0:
        if scale is not None:
            min_diff = int(min_event_rate_diff * scale)
        else:
            min_diff = min_event_rate_diff

        min_diff_violation_indices = find_min_diff_violation_indices(
            n, D, min_diff)
    else:
        min_diff_violation_indices = []

    if return_nonevent_event:
        return D, V, NE, E, pvalue_violation_indices

    return D, V, pvalue_violation_indices, min_diff_violation_indices

def build_model(self, divergence, n_nonevent, n_event, trend_change):
        # Parameters
        M = int(1e6)
        (D, V, pvalue_violation_indices,
         min_diff_violation_indices) = model_data(
            divergence, n_nonevent, n_event, self.max_pvalue,
            self.max_pvalue_policy, self.min_event_rate_diff, M)

        n = len(n_nonevent)
        n_records = n_nonevent + n_event

        # Initialize model
        model = cp_model.CpModel()

        # Decision variables
        x, y, t, d, u, bin_size_diff = self.decision_variables(model, n)

        # Objective function
        if self.gamma:
            total_records = int(n_records.sum())
            regularization = int(np.ceil(M * self.gamma / total_records))
            pmax = model.NewIntVar(0, total_records, "pmax")
            pmin = model.NewIntVar(0, total_records, "pmin")

            model.Maximize(sum([(V[i][i] * x[i, i]) +
                           sum([(V[i][j] - V[i][j+1]) * x[i, j]
                                for j in range(i)]) for i in range(n)]) -
                           regularization * (pmax - pmin))
        else:
            model.Maximize(sum([(V[i][i] * x[i, i]) +
                           sum([(V[i][j] - V[i][j+1]) * x[i, j]
                                for j in range(i)]) for i in range(n)]))

        # Constraint: min / max bins
        self.add_constraint_min_max_bins(model, n, x, d)
        
def jeffrey(x, y, return_sum=False):
    """Calculate the Jeffrey's divergence between two distributions.

    Parameters
    ----------
    x : array-like
        Discrete probability distribution.

    y : array-like
        Discrete probability distribution.

    return_sum : bool
        Return sum of jeffrey values.

    Returns
    -------
    jeffrey : float or numpy.ndarray
    """
    x, y = _check_x_y(x, y)

    j = special.xlogy(x - y, x / y)

    if return_sum:
        return j.sum()
    else:
        return j