In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import Row
from optbinning import OptimalBinning
import numpy as np
import pandas as pd

# Initialize the SparkSession and reuse the SparkContext it owns, so that the
# session and the context are guaranteed to refer to the same Spark application.
spark = SparkSession.builder.appName("OptBinningExample").getOrCreate()
sc = spark.sparkContext

# Binary target, seeded so the synthetic data set is reproducible
np.random.seed(42)
observations = 1_000
target = np.random.randint(0, 2, observations)

# Per-variable scale applied to standard-normal noise; the scale depends on
# the target, which is what ties var1..var5 to it.
target_scales = {
    'var1': target + 0.2,
    'var2': target + 0.3,
    'var3': -target + 0.2,
    'var4': target + 0.1,
    'var5': target + 0.25,
}

# NOTE: the draw order (var1..var5 normal, then var6..var10 uniform) must stay
# as-is, otherwise the seeded values change.
columns = {}
for column_name, scale in target_scales.items():
    columns[column_name] = scale * np.random.randn(observations)

# Uniform noise columns that do not depend on the target
for column_name in ('var6', 'var7', 'var8', 'var9', 'var10'):
    columns[column_name] = np.random.rand(observations)

columns['target'] = target
data = pd.DataFrame(columns)

# Ship every column (as plain Python lists keyed by column name) to the
# executors once, via a broadcast variable.
columns_as_lists = data.to_dict(orient='list')
broadcast_data = sc.broadcast(columns_as_lists)

def binning_function(var):
    """Fit an optimal binning for one variable and summarise the result.

    The column named ``var`` and the ``'target'`` column are read from the
    module-level ``broadcast_data`` broadcast variable, so this function can
    be used directly as the callable of an RDD ``map``.

    Args:
        var: Name of the column to bin; must be a key of
            ``broadcast_data.value``.

    Returns:
        A dict with three keys: ``'variable'`` (the name passed in),
        ``'iv'`` (the ``iv`` attribute of the built binning table) and
        ``'splits'`` (the ``splits`` attribute of the binning table).
    """
    columns = broadcast_data.value

    # The variable is treated as numerical and solved with the "cp" solver.
    optb = OptimalBinning(name=var, dtype="numerical", solver="cp")
    optb.fit(np.array(columns[var]), np.array(columns['target']))

    # The table is built before its iv and splits attributes are read.
    binning_table = optb.binning_table
    binning_table.build()

    return {
        'variable': var,
        'iv': binning_table.iv,
        'splits': binning_table.splits,
    }

# Every column except the target is a candidate variable for binning
feature_names = data.columns.difference(['target'])

# Distribute the variable names, bin each one on the executors, and bring the
# per-variable summaries back to the driver.
feature_rdd = sc.parallelize(feature_names)
binned_results = feature_rdd.map(binning_function).collect()

# Report one line per variable
for result in binned_results:
    print(f"Variable: {result['variable']}, IV: {result['iv']:0.3f}, Splits: {result['splits']}")

Variable: var1, IV: 3.870, Splits: [-0.43191871 -0.28221758 -0.16728848 -0.02888381  0.03069652  0.14377302
  0.22032083  0.35922444  0.51510692]
Variable: var10, IV: 0.057, Splits: [0.11854626 0.24119402 0.31879368 0.37129214 0.73273483 0.87061062
 0.92411289]
Variable: var2, IV: 2.386, Splits: [-0.53381741 -0.378314   -0.24581693 -0.10081048 -0.03216861  0.2386419
  0.45663874  0.63207525]
Variable: var3, IV: 2.751, Splits: [-0.4505313  -0.33470391 -0.24247514 -0.15137108 -0.07585113  0.1479005
  0.21693115  0.36852024]
Variable: var4, IV: 3.637, Splits: [-0.16165832 -0.09186092 -0.02178705  0.00238991  0.03475822  0.12112483
  0.19900268]
Variable: var5, IV: 3.181, Splits: [-0.55031744 -0.31986973 -0.2296852  -0.14298058 -0.07637362  0.23498204
  0.3138652   0.46967517]
Variable: var6, IV: 0.023, Splits: [0.09464763 0.15141959 0.24033074 0.90484789]
Variable: var7, IV: 0.035, Splits: [0.12587149 0.17060392 0.32343714 0.59561449]
Variable: var8, IV: 0.033, Splits: [0.3757185  0.51269