In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score
from scipy.stats import chi2_contingency

### Objective Function

##### Maximize IV value of the bins

IV = ∑((Good% - Bad%) * ln(Good% / Bad%)), where Good% and Bad% are the proportions of non-default and default observations, respectively, in each bin.

##### Constraints:

The default rate per bin should be monotonically increasing or decreasing.
The bins should be statistically different (determined by the Chi-square test or another suitable hypothesis test with a chosen significance level).
The minimum number of observations per bin should be met.
The minimum and maximum number of bins should be within the specified range.

##### Variables:

Bins: A set of ordinal categories.
Default rate: The proportion of default observations (default_flag = 1) in each bin.

##### Sequential Least Squares Quadratic Programming

SLSQP stands for Sequential Least Squares Quadratic Programming. It is an optimization algorithm available in the scipy.optimize module. The SLSQP method is a gradient-based optimization algorithm used for solving nonlinear optimization problems with equality and inequality constraints. It is particularly suitable for solving problems that have a smooth objective function and smooth constraints. The method takes an objective function, an initial guess for the solution, bounds on the variables, and constraints as input. It then tries to find the optimal solution that minimizes the objective function while satisfying the constraints.

Here's a brief overview of how the SLSQP algorithm works:

- The algorithm starts with an initial guess for the solution and calculates the objective function value, its gradient (first-order derivatives), and the constraint values and their gradients at this initial point.
- It then iteratively improves the solution by approximating the objective function and constraints with quadratic functions (hence the name Quadratic Programming) using the calculated gradient information. The algorithm solves a sequence of quadratic subproblems subject to linearized constraints to find a new solution.
- The new solution is checked for feasibility (i.e., whether it satisfies the constraints) and optimality (i.e., whether the objective function value has improved). If necessary, the algorithm adjusts the solution using a line search method to satisfy the constraints.
- The algorithm proceeds iteratively, updating the solution and recalculating the objective function, gradients, and constraint values until convergence is achieved, or a stopping criterion is met.

SLSQP is a good choice for problems with a moderate number of variables and constraints where the objective function and constraints are smooth (i.e., differentiable). However, it may not be the best choice for large-scale, non-smooth, or non-convex problems. In such cases, other optimization algorithms might be more suitable.

### A. Create test data

In [2]:
np.random.seed(42)

def generate_data(n):
    years = np.random.choice([2016, 2017, 2018, 2019, 2020, 2021], n)
    economic_cycle = np.random.rand(n)
    explanatory_variable = np.random.rand(n) * 10

    # Add noise to the relationship between explanatory_variable and economic_cycle
    noise = np.random.normal(0, 0.1, n)

    # Define a non-trivial and random relationship between target and explanatory variable
    z = (explanatory_variable / 10) * economic_cycle + noise
    target_proba = 1 / (1 + np.exp(-z))  # Apply logistic function
    target_variable = (target_proba > 0.5).astype(int)

    data = {
        'explanatory_variable': explanatory_variable,
        'target_variable': target_variable,
        'year': years,
        'economic_cycle': economic_cycle
    }

    return pd.DataFrame(data)

# Choose the number of observations
n = 200_000 

# Create dataset
df = generate_data(n)
print(df.head())

   explanatory_variable  target_variable  year  economic_cycle
0              6.678090                0  2019        0.140130
1              8.521948                1  2020        0.914430
2              4.038421                1  2018        0.783868
3              3.281745                1  2020        0.119363
4              8.264844                1  2020        0.568544


### B. Create optimal pooling/calibration option

In [3]:
def calculate_iv(df, bin_edges):
    binned_data = pd.cut(df['explanatory_variable'], bins=bin_edges, include_lowest=True)
    bin_summary = df.groupby(binned_data)['target_variable'].agg(['count', 'sum'])
    bin_summary['non_target'] = bin_summary['count'] - bin_summary['sum']
    
    bin_summary['target_dist'] = bin_summary['sum'] / bin_summary['sum'].sum()
    bin_summary['non_target_dist'] = bin_summary['non_target'] / bin_summary['non_target'].sum()
    
    bin_summary['woe'] = np.log(bin_summary['target_dist'] / bin_summary['non_target_dist'])
    bin_summary['iv'] = (bin_summary['target_dist'] - bin_summary['non_target_dist']) * bin_summary['woe']
    
    return bin_summary['iv'].sum()

def calculate_event_rates(df, bin_edges):
    annual_event_rates = []

    for year in df['year'].unique():
        df_year = df[df['year'] == year]
        bin_indices = np.digitize(df_year['explanatory_variable'], bin_edges)
        bin_counts = np.bincount(bin_indices)[1:len(bin_edges)]
        event_counts = np.bincount(bin_indices, weights=df_year['target_variable'])[1:len(bin_edges)]
        annual_event_rates.append(event_counts / bin_counts)

    average_annual_event_rates = np.mean(annual_event_rates, axis=0)
    return average_annual_event_rates

In [4]:
def optimize_iv(df, min_bins=2, max_bins=10, min_bin_size=0.01, monotonic=True):
    best_bin_edges = None
    best_objective_value = np.inf
    
    def iv_objective(bin_edges):
        extended_edges = [df['explanatory_variable'].min()] + list(bin_edges) + [df['explanatory_variable'].max()]
        print(f'Number of edges: {len(bin_edges)}, IV: {calculate_iv(df, extended_edges):.2%}')
        return -calculate_iv(df, extended_edges)

    def monotonic_constraint(bin_edges):
        extended_edges = [df['explanatory_variable'].min()] + list(bin_edges) + [df['explanatory_variable'].max()]
        event_rates = calculate_event_rates(df, extended_edges)
        
        if monotonic:
            if np.all(np.diff(event_rates) >= 0) or np.all(np.diff(event_rates) <= 0):
                return 0
            else:
                return np.inf
        else:
            return 0
        
    def bins_statistical_difference(bin_edges, df, significance_level=0.05):
        extended_edges = [df['explanatory_variable'].min()] + list(bin_edges) + [df['explanatory_variable'].max()]
        df['temp_bin'] = pd.cut(df['explanatory_variable'], bins=extended_edges, labels=False, include_lowest=True)

        for i in range(len(extended_edges) - 2):
            contingency_table = pd.crosstab(df[df['temp_bin'].isin([i, i + 1])]['temp_bin'], df['target_variable'])
            _, p, _, _ = chi2_contingency(contingency_table)
            # print(f'p-value: {p}')

            if p > significance_level:
                return np.inf

        return 0

    for num_bins in range(min_bins, max_bins + 1):
        initial_bin_edges = np.linspace(df['explanatory_variable'].min(), df['explanatory_variable'].max(), num_bins + 1)[1:-1]

        constraints = [{'type': 'ineq', 'fun': monotonic_constraint},
                       {'type': 'ineq', 'fun': lambda x: bins_statistical_difference(x, df)}
                      ]
        bounds = [(min_bin_size, None) for _ in range(num_bins - 1)]

        result = minimize(iv_objective, initial_bin_edges, method='SLSQP', bounds=bounds, constraints=constraints)

        if result.success and -result.fun < best_objective_value:
            best_objective_value = -result.fun
            best_bin_edges = [df['explanatory_variable'].min()] + list(result.x) + [df['explanatory_variable'].max()]

    return best_bin_edges

optimal_bin_edges = optimize_iv(df, min_bins=3, max_bins=6)
print("Optimal bin edges:", optimal_bin_edges)

# Create a new column in the dataset that represents the bin assignments
df['bin'] = pd.cut(df['explanatory_variable'], bins=optimal_bin_edges, labels=False, include_lowest=True)

# Calculate ROC AUC scores
roc_auc_non_binned = roc_auc_score(df['target_variable'], df['explanatory_variable'])
roc_auc_binned = roc_auc_score(df['target_variable'], df['bin'])

print(f"ROC AUC score (non-binned): {roc_auc_non_binned:.4f}")
print(f"ROC AUC score (binned): {roc_auc_binned:.4f}")


Number of edges: 2, IV: 64.38%
Number of edges: 2, IV: 64.38%
Number of edges: 2, IV: 64.38%
Number of edges: 3, IV: 71.44%
Number of edges: 3, IV: 71.44%
Number of edges: 3, IV: 71.44%
Number of edges: 3, IV: 71.44%
Number of edges: 4, IV: 75.07%
Number of edges: 4, IV: 75.07%
Number of edges: 4, IV: 75.07%
Number of edges: 4, IV: 75.07%
Number of edges: 4, IV: 75.07%
Number of edges: 5, IV: 76.85%
Number of edges: 5, IV: 76.85%
Number of edges: 5, IV: 76.85%
Number of edges: 5, IV: 76.85%
Number of edges: 5, IV: 76.85%
Number of edges: 5, IV: 76.85%
Optimal bin edges: [3.339410015512634e-05, 3.333345296058221, 6.666657198016288, 9.999969099974354]
ROC AUC score (non-binned): 0.7423
ROC AUC score (binned): 0.7056


In [5]:
xxxxxxxxxx

NameError: name 'xxxxxxxxxx' is not defined

### Appendix

To determine whether adjacent bins are statistically different, you can perform a hypothesis test, such as the Chi-square test for independence. 

Here's a step-by-step process for doing this:
- Create a contingency table for the adjacent bins, showing the frequency distribution of the binary default flag (0 and 1) for each pair of adjacent bins.
- Calculate the expected frequency for each cell in the contingency table under the assumption that the bins are independent. To do this, multiply the row total (sum of binary default flags in a bin) by the column total (sum of observations in the two adjacent bins) and divide by the total number of observations.
- Calculate the Chi-square statistic. For each cell, find the difference between the observed frequency and the expected frequency, square the result, and divide by the expected frequency. Add up all these values to get the Chi-square statistic.
- Determine the degrees of freedom for the Chi-square test. For a 2x2 contingency table (2 rows, one for each binary default flag, and 2 columns, one for each adjacent bin), the degrees of freedom are (number of rows - 1) * (number of columns - 1) = 1.
- Determine the critical value and the p-value for the calculated Chi-square statistic using the Chi-square distribution table or a statistical software with the corresponding degrees of freedom.
- Set a significance level (commonly 0.05). If the p-value is less than the chosen significance level, reject the null hypothesis that the adjacent bins are independent, which means there is a statistically significant difference between the bins. If the p-value is greater than the chosen significance level, you cannot reject the null hypothesis, and there is no evidence to suggest a statistically significant difference between the bins.

Repeat these steps for each pair of adjacent bins to determine whether there is a statistically significant difference between them.

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Sample data
data = {
    'bin': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D', 'E', 'E', 'E'],
    'default_flag': [0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1]
}

df = pd.DataFrame(data)

# Perform Chi-square test for independence on adjacent bins
significance_level = 0.05

for i in range(len(df['bin'].unique()) - 1):
    bin1 = df['bin'].unique()[i]
    bin2 = df['bin'].unique()[i + 1]
    contingency_table = pd.crosstab(df[df['bin'].isin([bin1, bin2])]['bin'], df['default_flag'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    
    print(f"Chi-square test for {bin1} and {bin2}:")
    print(f"Chi2 = {chi2:.4f}, p-value = {p:.4f}")
    
    if p < significance_level:
        print(f"The difference between bins {bin1} and {bin2} is statistically significant.\n")
    else:
        print(f"There is no evidence to suggest a statistically significant difference between bins {bin1} and {bin2}.\n")


In [None]:
import numpy as np
from scipy.stats import chi2_contingency

data = np.array([
    [60, 30],
    [20, 50],
])

chi2, p_value, dof, expected = chi2_contingency(data)

print(chi2, p_value, dof, expected)