In [7]:
# ============================================================================
# MDLPC DISCRETIZATION - CONVERT CONTINUOUS TO CATEGORICAL
# ============================================================================

import pandas as pd
import numpy as np
import os
import math
from scipy import stats

In [8]:
# Define the MDLPDiscretizer class inline for notebook use.
# (Alternatively, the class definition can be copied or imported from
# mdlpc_discretization.py.)

class MDLPDiscretizer:
    """Supervised discretizer that turns continuous features into categorical bins.

    Each feature is split recursively: every midpoint between two consecutive
    distinct values is a candidate cut, the candidate with the highest MDLPC
    score (information gain minus the MDL acceptance threshold) is kept if that
    score is positive, and the two resulting halves are processed the same way.

    NOTE(review): the cut is chosen by maximising ``gain - threshold`` rather
    than the gain alone; this appears to differ from the usual Fayyad & Irani
    formulation (pick the max-gain cut, then test it) -- confirm it is intended.
    """

    def __init__(self):
        # Maps feature name -> sorted list of accepted cut points (floats).
        self.cut_points = {}
        # Feature names seen by the most recent call to fit().
        self.feature_names = []

    @staticmethod
    def _as_float_matrix(X):
        """Return ``X`` as a 2-D float array (a 1-D input becomes one column)."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        An empty ``y`` has entropy 0. No smoothing term is needed: every
        probability comes from an observed count, so it is strictly positive.
        """
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # sum(p * log2(1/p)) == -sum(p * log2(p)), but yields +0.0 for pure sets.
        return float(np.sum(probabilities * np.log2(1.0 / probabilities)))

    def _information_gain(self, x, y, cut_point):
        """Return the entropy reduction obtained by splitting at ``cut_point``.

        Samples with ``x <= cut_point`` go left, the others go right.
        Returns 0 when ``y`` is empty.
        """
        n_total = len(y)
        if n_total == 0:
            return 0

        left_y = y[x <= cut_point]
        right_y = y[x > cut_point]

        entropy_after = (len(left_y) / n_total) * self._entropy(left_y) \
            + (len(right_y) / n_total) * self._entropy(right_y)
        return self._entropy(y) - entropy_after

    def _mdlpc_criterion(self, x, y, cut_point):
        """Return ``gain - threshold`` for a cut; a positive value means "accept".

        Returns ``-inf`` when the split is not usable (fewer than two samples,
        or no positive information gain).
        """
        n = len(y)
        # log2(n - 1) below is undefined for n == 1, and a split needs two samples.
        if n < 2:
            return -np.inf

        left_y = y[x <= cut_point]
        right_y = y[x > cut_point]

        # Entropies are computed once and reused for both the gain and the threshold.
        entropy_all = self._entropy(y)
        entropy_left = self._entropy(left_y)
        entropy_right = self._entropy(right_y)

        gain = entropy_all - (
            (len(left_y) / n) * entropy_left + (len(right_y) / n) * entropy_right
        )
        if gain <= 0:
            return -np.inf

        k = len(np.unique(y))
        k_left = len(np.unique(left_y))
        k_right = len(np.unique(right_y))

        # MDL acceptance threshold: log2(n-1)/n + [log2(3^k - 2) - (k*H - k1*H1 - k2*H2)]/n
        delta = (math.log2(n - 1) / n) + (math.log2(3**k - 2) / n) - \
                (k * entropy_all - k_left * entropy_left - k_right * entropy_right) / n

        return gain - delta

    def _find_cut_points(self, x, y):
        """Recursively collect the accepted cut points for one feature.

        Args:
            x: 1-D array of feature values (no NaN expected; fit() filters them).
            y: 1-D array of class labels aligned with ``x``.

        Returns:
            Sorted list of cut points as Python floats (possibly empty).
        """
        x = np.asarray(x)
        y = np.asarray(y)

        distinct_values = np.unique(x)  # sorted
        if len(distinct_values) <= 1 or len(np.unique(y)) <= 1:
            return []

        # Candidates are the midpoints between consecutive distinct values.
        candidates = (distinct_values[:-1] + distinct_values[1:]) / 2

        best_criterion = -np.inf
        best_cut = None
        for cut_point in candidates:
            criterion = self._mdlpc_criterion(x, y, cut_point)
            # Strict '>' keeps the lowest cut on ties (candidates are ascending).
            if criterion > best_criterion:
                best_criterion = criterion
                best_cut = cut_point

        if best_criterion <= 0:
            return []

        left_mask = x <= best_cut
        right_mask = x > best_cut

        left_cuts = self._find_cut_points(x[left_mask], y[left_mask])
        right_cuts = self._find_cut_points(x[right_mask], y[right_mask])

        # Plain floats keep the stored/printed cut points independent of numpy's repr.
        return sorted(left_cuts + [float(best_cut)] + right_cuts)

    def fit(self, X, y):
        """Learn cut points for every column of ``X`` against labels ``y``.

        Args:
            X: DataFrame (column names become feature names) or array-like of
               shape (n_samples, n_features); non-DataFrame columns are named
               ``feature_<i>``. Values must be convertible to float.
            y: array-like of n_samples class labels.

        Returns:
            self, so calls can be chained. Any previous fit is discarded.

        Raises:
            ValueError: if ``X`` and ``y`` have different numbers of rows.

        Prints one summary line per feature (plus the cut points if any).
        """
        if isinstance(X, pd.DataFrame):
            feature_names = X.columns.tolist()
            X = self._as_float_matrix(X.values)
        else:
            X = self._as_float_matrix(X)
            feature_names = [f'feature_{i}' for i in range(X.shape[1])]

        y = np.asarray(y)
        if len(y) != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {len(y)} entries"
            )

        # Start from a clean state so a refit never keeps stale features.
        self.feature_names = feature_names
        self.cut_points = {}

        for i, feature_name in enumerate(self.feature_names):
            x = X[:, i]
            # Rows with a missing feature value are ignored for this feature only.
            valid_mask = ~np.isnan(x)
            x_clean = x[valid_mask]
            y_clean = y[valid_mask]

            if len(x_clean) == 0:
                self.cut_points[feature_name] = []
                continue

            cut_points = self._find_cut_points(x_clean, y_clean)
            self.cut_points[feature_name] = cut_points

            print(f"Feature {feature_name}: Found {len(cut_points)} cut point(s)")
            if len(cut_points) > 0:
                print(f"  Cut points: {cut_points}")

        return self

    def transform(self, X):
        """Map continuous values to bin labels using the fitted cut points.

        Args:
            X: DataFrame (columns are looked up by name) or array-like whose
               columns follow the order seen in fit().

        Returns:
            DataFrame of string labels with one column per feature:
            ``'bin_<k>'`` when the feature has cut points (a value equal to a
            cut point falls in the lower bin), ``'all'`` when it has none, and
            ``'missing'`` for NaN in either case. A DataFrame input keeps its
            index so the result can be aligned with the original rows.
        """
        if isinstance(X, pd.DataFrame):
            feature_names = X.columns.tolist()
            index = X.index
            X = self._as_float_matrix(X.values)
        else:
            feature_names = self.feature_names
            index = None
            X = self._as_float_matrix(X)

        transformed_data = {}

        for i, feature_name in enumerate(feature_names):
            x = X[:, i]
            missing = np.isnan(x)
            cut_points = self.cut_points.get(feature_name, [])

            if len(cut_points) == 0:
                labels = ['missing' if is_missing else 'all' for is_missing in missing]
            else:
                # side='left' (default): x == cut goes to the lower bin, matching
                # the 'x <= cut_point' split used while fitting.
                bin_indices = np.searchsorted(cut_points, x)
                labels = [
                    'missing' if is_missing else f'bin_{bin_idx}'
                    for is_missing, bin_idx in zip(missing, bin_indices)
                ]
            transformed_data[feature_name] = labels

        return pd.DataFrame(transformed_data, index=index)



In [9]:

# ============================================================================
# APPLY MDLPC DISCRETIZATION
# ============================================================================

print("=" * 80, "MDLPC DISCRETIZATION", "=" * 80, sep="\n")

# The raw dataset is always (re)read here, so this cell does not rely on a `df`
# left over from an earlier cell. skiprows=[1] drops the line right after the header.
df = pd.read_csv('../../Data/raw/new_Base_CDM_balanced_V2.csv', sep=';', skiprows=[1])

# Column roles
target = 'Y'
continuous_vars = ['X1', 'X2', 'X3', 'X4', 'X6']
nominal_vars = ['X5', 'X7']

# Force the continuous columns to numeric; anything unparseable becomes NaN
for col in continuous_vars:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Learn the cut points from the continuous columns against the target
print("\nFitting MDLPC discretizer...")
X_continuous = df[continuous_vars]
y = df[target]
discretizer = MDLPDiscretizer()
discretizer.fit(X_continuous, y)

# Replace each continuous value by its bin label
print("\nTransforming continuous variables to categorical...")
X_categorical = discretizer.transform(X_continuous)

# Assemble the final frame: discretized columns (suffixed '_cat'), then the
# nominal columns unchanged, then the target.
categorical_columns = {f'{name}_cat': X_categorical[name] for name in continuous_vars}
categorical_columns.update({name: df[name] for name in nominal_vars})
categorical_columns[target] = df[target]
df_categorical = pd.DataFrame(categorical_columns)

print(f"\n✓ Categorical dataset created: {df_categorical.shape}")
print(f"  Columns: {list(df_categorical.columns)}")

# Per-feature summary: original range, number of bins, and the five most
# frequent bins with their share of all rows.
print("\n" + "=" * 80, "DISCRETIZATION SUMMARY", "=" * 80, sep="\n")

n_rows = len(df_categorical)
for col in continuous_vars:
    binned = df_categorical[f'{col}_cat']
    lowest, highest = df[col].min(), df[col].max()
    print(f"\n{col}:")
    print(f"  Original range: [{lowest:.2f}, {highest:.2f}]")
    print(f"  Categories created: {binned.nunique()}")
    print(f"  Distribution:")
    for cat, count in binned.value_counts().head(5).items():
        print(f"    {cat}: {count} ({count/n_rows*100:.2f}%)")


MDLPC DISCRETIZATION

Fitting MDLPC discretizer...


Feature X1: Found 6 cut point(s)
  Cut points: [4.51, 7.3100000000000005, 9.375, 13.195, 28.5, 47.5]
Feature X2: Found 7 cut point(s)
  Cut points: [8.375, 33.545, 67.59, 92.39500000000001, 139.34, 213.635, 587.515]
Feature X3: Found 18 cut point(s)
  Cut points: [28722.5, 32738.0, 33144.0, 33585.0, 38646.5, 43828.5, 44406.0, 45848.5, 46307.5, 51511.0, 51544.0, 57174.5, 57374.5, 85611.0, 89561.5, 94439.5, 109168.5, 109746.5]
Feature X4: Found 17 cut point(s)
  Cut points: [11.5, 24.5, 25.5, 26.5, 27.5, 29.0, 31.0, 32.5, 33.5, 40.5, 41.5, 42.5, 43.5, 47.0, 56.5, 62.0, 68.0]
Feature X6: Found 12 cut point(s)
  Cut points: [15.5, 52.5, 55.0, 59.495000000000005, 74.5, 141.68, 287.5, 289.0, 478.5, 702.5, 926.5, 2047.0]

Transforming continuous variables to categorical...

✓ Categorical dataset created: (25782, 8)
  Columns: ['X1_cat', 'X2_cat', 'X3_cat', 'X4_cat', 'X6_cat', 'X5', 'X7', 'Y']

DISCRETIZATION SUMMARY

X1:
  Original range: [1.00, 1475.00]
  Categories created: 7
  Distribution

In [10]:

# Write the discretized dataset to disk as a ';'-separated CSV without the row index.
# NOTE(review): this directory is relative to the notebook's working directory,
# while the raw data is read from '../../Data/raw/...' -- confirm the two roots
# are meant to differ.
output_dir = 'Data/processed/categorical'
output_path = os.path.join(output_dir, 'data_categorical_mdlpc.csv')

# Create the target folder on first run; a pre-existing folder is fine.
os.makedirs(output_dir, exist_ok=True)
df_categorical.to_csv(output_path, sep=';', index=False)
print(f"\n✓ Saved categorical data to: {output_path}")

print("\n" + "=" * 80, "DISCRETIZATION COMPLETE!", "=" * 80, sep="\n")


✓ Saved categorical data to: Data/processed/categorical\data_categorical_mdlpc.csv

DISCRETIZATION COMPLETE!
