In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import Row
from optbinning import OptimalBinning
import numpy as np
import pandas as pd

### A. Create a dummy PySpark dataframe

In [2]:
# Start (or reuse) the Spark session for this notebook and grab its context
spark = (
    SparkSession.builder
    .appName("OptBinningExample")
    .getOrCreate()
)
sc = SparkContext.getOrCreate()

# Binary target, seeded so the notebook is reproducible
np.random.seed(42)
observations = 1_000
target = np.random.randint(0, 2, observations)

# Multipliers that tie var1..var5 to the target: the spread of each variable
# depends on the target value.
signal_scales = {
    'var1': target + 0.2,
    'var2': target + 0.3,
    'var3': -target + 0.2,
    'var4': target + 0.1,
    'var5': target + 0.25,
}

# The dict is evaluated left to right, so the random draws happen in the order
# var1..var5 (normal noise), then var6..var10 (uniform noise).
data_pandas = pd.DataFrame({
    **{name: scale * np.random.randn(observations) for name, scale in signal_scales.items()},
    **{f'var{idx}': np.random.rand(observations) for idx in range(6, 11)},
    'target': target,
})

# Hand the pandas frame over to Spark
data_pyspark = spark.createDataFrame(data_pandas)

### B. Create optimal WoE bins

In [3]:
# Re-acquire the SparkContext for this cell. A context was already obtained in
# the previous cell, and getOrCreate() hands back the active context rather than
# starting a second one, so re-running this line is harmless.
sc = SparkContext.getOrCreate()

def binning_function(var, broadcast_chunk):
    """Fit an optimal binning for a single variable and summarise the result.

    Parameters
    ----------
    var : str
        Name of the column to bin; also passed to the binning object as its name.
    broadcast_chunk : broadcast-like object
        Object whose ``.value`` is a mapping containing the values of ``var``
        and of the ``'target'`` column.

    Returns
    -------
    dict
        ``{'variable': var, 'iv': ..., 'splits': ...}`` where ``iv`` and
        ``splits`` are read from the fitted binning table.
    """
    # Numerical binning with at most five bins, solved with the "cp" solver
    binner = OptimalBinning(name=var, dtype="numerical", solver="cp", max_n_bins=5)

    chunk_columns = broadcast_chunk.value
    feature_values = np.array(chunk_columns[var])
    target_values = np.array(chunk_columns['target'])
    binner.fit(feature_values, target_values)

    # The table is built before its information value and splits are read
    table = binner.binning_table
    table.build()

    return dict(variable=var, iv=table.iv, splits=table.splits)

def process_in_chunks(data, data_columns, n):
    """Bin variables in groups of at most ``n``, yielding one result list per group.

    For every group of column names, the matching columns of ``data`` (plus
    ``'target'``) are broadcast through the module-level ``sc``, and
    ``binning_function`` is mapped over the group's variables. The broadcast is
    unpersisted before the group's results are yielded, including when the
    Spark job fails.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``'target'`` column and every column in ``data_columns``.
    data_columns : sequence of str
        Names of the variables to bin (a list or a pandas Index).
    n : int
        Maximum number of variables per chunk; must be at least 1.

    Yields
    ------
    list
        The collected ``binning_function`` results for one chunk, in chunk order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. The check runs when iteration starts,
        because this function is a generator.
    """
    if n < 1:
        # A zero step would fail inside range(); a negative one would silently
        # produce no chunks at all. Reject both explicitly.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    for start in range(0, len(data_columns), n):
        # Plain list of names: used for column selection and as the RDD payload
        chunk_vars = list(data_columns[start:start + n])
        chunk_data = data[['target'] + chunk_vars]

        # Broadcast the chunk's columns as a dict of lists
        broadcast_chunk = sc.broadcast(chunk_data.to_dict(orient='list'))
        try:
            # One task per variable; collect() runs the job eagerly
            chunk_results = (
                sc.parallelize(chunk_vars)
                .map(lambda var: binning_function(var, broadcast_chunk))
                .collect()
            )
        finally:
            # Release the broadcast even if the job raised
            broadcast_chunk.unpersist()

        yield chunk_results

# Bring the Spark dataframe onto the driver as a pandas dataframe
data = data_pyspark.toPandas()

# Every column other than the target is a candidate variable
variables = data.columns.difference(['target'])

# Upper bound on how many variables are broadcast and binned together
n = 3

# Flatten the per-chunk result lists into a single list
final_results = []
for chunk_result in process_in_chunks(data, variables, n):
    final_results += chunk_result

# Report the information value and split points found for each variable
for result in final_results:
    print(f"Variable: {result['variable']}, IV: {result['iv']:0.3f}, Splits: {result['splits']}")

Variable: var1, IV: 3.807, Splits: [-0.43191871 -0.28221758  0.22032083  0.51510692]
Variable: var10, IV: 0.054, Splits: [0.11854626 0.31879368 0.37129214 0.92411289]
Variable: var2, IV: 2.298, Splits: [-0.53381741 -0.24581693  0.45663874  0.63207525]
Variable: var3, IV: 2.675, Splits: [-0.4505313  -0.24247514  0.1479005   0.36852024]
Variable: var4, IV: 3.571, Splits: [-0.16165832 -0.09186092  0.12112483  0.19900268]
Variable: var5, IV: 3.132, Splits: [-0.55031744 -0.14298058  0.23498204  0.46967517]
Variable: var6, IV: 0.023, Splits: [0.09464763 0.15141959 0.24033074 0.90484789]
Variable: var7, IV: 0.035, Splits: [0.12587149 0.17060392 0.32343714 0.59561449]
Variable: var8, IV: 0.032, Splits: [0.3757185  0.51269495 0.5759775  0.94276121]
Variable: var9, IV: 0.075, Splits: [0.23872217 0.30109173 0.3597669  0.91622782]
