In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from CMI_torch import compute_all_cmi_methods, estimate_CMI_soft_kronecker_gaussian, estimate_CMI_gumbel_softmax_kernel, \
    estimate_CMI_separate_kernel
import sys
import torch

In [2]:
adult_data_train = pd.read_csv(r'C:\Users\admin\Desktop\projects\Imputation_Fairness\Data\adult\adult.data',
                         header=None, 
                        delimiter=',')
adult_data_test= pd.read_csv(r'C:\Users\admin\Desktop\projects\Imputation_Fairness\Data\adult\adult.test',
                         header=None, 
                        delimiter=',', skiprows=1)
adult_data = pd.concat([adult_data_train, adult_data_test], axis=0).reset_index(drop=True)

In [3]:
adult_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Replace all ' ?' values with np.nan
adult_data = adult_data.replace(' ?', np.nan)
adult_data.isna().sum()

0        0
1     2799
2        0
3        0
4        0
5        0
6     2809
7        0
8        0
9        0
10       0
11       0
12       0
13     857
14       0
dtype: int64

In [5]:
missing_percentage = (adult_data.isnull().sum() / len(adult_data)) * 100
print(missing_percentage)

0     0.000000
1     5.730724
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     5.751198
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    0.000000
12    0.000000
13    1.754637
14    0.000000
dtype: float64


In [6]:
adult_data = adult_data.dropna()

In [7]:
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       45222 non-null  int64 
 1   1       45222 non-null  object
 2   2       45222 non-null  int64 
 3   3       45222 non-null  object
 4   4       45222 non-null  int64 
 5   5       45222 non-null  object
 6   6       45222 non-null  object
 7   7       45222 non-null  object
 8   8       45222 non-null  object
 9   9       45222 non-null  object
 10  10      45222 non-null  int64 
 11  11      45222 non-null  int64 
 12  12      45222 non-null  int64 
 13  13      45222 non-null  object
 14  14      45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [8]:
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
adult_data.columns = columns

In [9]:
adult_data.nunique()

age                  74
workclass             7
fnlwgt            26741
education            16
education-num        16
marital-status        7
occupation           14
relationship          6
race                  5
sex                   2
capital-gain        121
capital-loss         97
hours-per-week       96
native-country       41
income                4
dtype: int64

In [10]:
print("age",adult_data['age'].unique())
print("workclass",adult_data['workclass'].unique())
print("fnlwgt",adult_data['fnlwgt'].unique())
print("education",adult_data['education'].unique())
print("education-num",adult_data['education-num'].unique())
print("marital-status",adult_data['marital-status'].unique())
print("occupation",adult_data['occupation'].unique())
print("relationship",adult_data['relationship'].unique())
print("race",adult_data['race'].unique())
print("sex",adult_data['sex'].unique())
print("capital-gain",adult_data['capital-gain'].unique())
print("capital-loss",adult_data['capital-loss'].unique())
print("hours-per-week",adult_data['hours-per-week'].unique())
print("native-country",adult_data['native-country'].unique())
print("income",adult_data['income'].unique())

age [39 50 38 53 28 37 49 52 31 42 30 23 32 34 25 43 40 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 47 46 36 79 27 18 33 76 55 61 70 64 71 66 51 58
 26 17 60 90 75 65 77 62 63 67 74 72 69 68 73 81 78 88 80 84 83 85 82 86
 89 87]
workclass [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay']
fnlwgt [ 77516  83311 215646 ... 173449  89686 350977]
education [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' 7th-8th' ' Doctorate' ' Assoc-voc' ' Prof-school'
 ' 5th-6th' ' 10th' ' Preschool' ' 12th' ' 1st-4th']
education-num [13  9  7 14  5 10 12  4 16 11 15  3  6  1  8  2]
marital-status [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
occupation [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Transport-moving' ' Farming-fishing'
 ' Machine-op-inspct' ' Tech-support' ' Craft-

In [11]:
# Drop unnecessary columns:
# 'education' is redundant because 'education-num' already encodes it numerically.
# 'fnlwgt' represents sampling weights and does not contribute to feature relationships.

adult_data = adult_data.drop(columns=['education', 'fnlwgt'])

In [12]:
adult_data.head(5)

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
categorical_columns = [
    'workclass', 
    'marital-status', 
    'occupation', 
    'relationship', 
    'race', 
    'sex', 
    'native-country', 
    'income'
] 
for col in categorical_columns:
    encoder = LabelEncoder()
    not_null_idx = adult_data[col].notnull()
    adult_data.loc[not_null_idx, col] = encoder.fit_transform(adult_data.loc[not_null_idx, col])

In [14]:
adult_data.head(5)

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,5,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,13,2,3,0,4,1,0,0,13,38,0
2,38,2,9,0,5,1,4,1,0,0,40,38,0
3,53,2,7,2,5,0,2,1,0,0,40,38,0
4,28,2,13,2,9,5,2,0,0,0,40,4,0


In [15]:
# List of (X_index, Y_index, Z_index) triplets for CMI calculation
# Column indices after dropping 'education' and 'fnlwgt':

# 0: age
# 1: workclass
# 2: education-num
# 3: marital-status
# 4: occupation
# 5: relationship
# 6: race
# 7: sex
# 8: capital-gain
# 9: capital-loss
# 10: hours-per-week
# 11: native-country
# 12: income

cmi_triplets = [
    (0, 12, 2),  # (age, income, education-num)
    (6, 12, 2),  # (race, income, education-num)
    (7, 12, 2),  # (sex, income, education-num)
    (11, 12, 2), # (native-country, income, education-num)
    (0, 10, 4),  # (age, hours-per-week, occupation)
    (6, 10, 4),  # (race, hours-per-week, occupation)
    (7, 10, 3),  # (sex, hours-per-week, marital-status)
    (11, 10, 4), # (native-country, hours-per-week, occupation)
]


In [16]:
# continuous and discrete column indices
discrete_columns = [1, 3, 4, 5, 6, 7, 11, 12]
continuous_columns = [0, 2, 8, 9, 10]

# Create a copy of the original data to preserve the raw values
adult_data_scaled = adult_data.copy()

# Normalize only the continuous columns using MinMaxScaler
scaler = MinMaxScaler()
adult_data_scaled.iloc[:, continuous_columns] = scaler.fit_transform(adult_data_scaled.iloc[:, continuous_columns])

In [17]:
# Sampling strategy (Test vs Full)

#adult_data_sampled = adult_data_scaled

sample_indices = np.random.choice(adult_data_scaled.shape[0], size=100, replace=False)

adult_data_sampled = adult_data_scaled.iloc[sample_indices]
data_np = adult_data_sampled.values.astype(float)
data_tensor= torch.tensor(data_np,dtype=torch.float32)

In [18]:
#Encode and combine data

encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(adult_data_sampled.iloc[:, discrete_columns])
continuous_part = adult_data_sampled.iloc[:, continuous_columns].values
data_processed = np.concatenate([one_hot_encoded, continuous_part], axis=1)

one_hot_encoded_tensor = torch.tensor(one_hot_encoded, dtype=torch.float32)
continuous_part_tensor = torch.tensor(continuous_part, dtype=torch.float32)
data_processed_tensor = torch.cat([one_hot_encoded_tensor, continuous_part_tensor], dim=1)


In [19]:
# GRID SERACH
#BOOTSTRAP

In [20]:
sys.path.append(r"C:\Users\admin\Desktop\Imputation_Fairness") 

import Utils
import Inject_Missing_Values
import RR_imputer
import Sinkhorn_CMI
import SinkhornImputation
import SoftImpute


In [21]:
for triplet in cmi_triplets:
    cmi1, cmi2, cmi3 = compute_all_cmi_methods(data_tensor, data_processed_tensor, one_hot_encoded_tensor, continuous_part_tensor,
                            triplet, encoder, discrete_columns, continuous_columns
    )

    print(f"Triplet {triplet} :")
    print("  CMI (Soft Kronecker + Gaussian):", cmi1)
    print("  CMI (Gumbel-Softmax + Gaussian):", cmi2)
    print("  CMI (Gumbel on Discrete + Separate Kernels):", cmi3)
    print()

Triplet (0, 12, 2) :
  CMI (Soft Kronecker + Gaussian): tensor(0.1072)
  CMI (Gumbel-Softmax + Gaussian): tensor(0.2180)
  CMI (Gumbel on Discrete + Separate Kernels): tensor(0.1479)

Triplet (6, 12, 2) :
  CMI (Soft Kronecker + Gaussian): tensor(0.0563)
  CMI (Gumbel-Softmax + Gaussian): tensor(0.3706)
  CMI (Gumbel on Discrete + Separate Kernels): tensor(0.2692)

Triplet (7, 12, 2) :
  CMI (Soft Kronecker + Gaussian): tensor(0.0435)
  CMI (Gumbel-Softmax + Gaussian): tensor(0.0967)
  CMI (Gumbel on Discrete + Separate Kernels): tensor(0.1631)

Triplet (11, 12, 2) :
  CMI (Soft Kronecker + Gaussian): tensor(0.1053)
  CMI (Gumbel-Softmax + Gaussian): tensor(0.5995)
  CMI (Gumbel on Discrete + Separate Kernels): tensor(0.6196)

Triplet (0, 10, 4) :
  CMI (Soft Kronecker + Gaussian): tensor(0.1170)
  CMI (Gumbel-Softmax + Gaussian): tensor(0.1174)
  CMI (Gumbel on Discrete + Separate Kernels): tensor(0.1423)

Triplet (6, 10, 4) :
  CMI (Soft Kronecker + Gaussian): tensor(0.0709)
  CMI (G

# Injection of 25% MCAR Missingness

In [22]:
from Inject_Missing_Values import Inject_Missing_Values

# Step 1: Split the full dataset into features (X) and target (Y)
X = adult_data_sampled.iloc[:, :-1]
Y = adult_data_sampled.iloc[:, -1]

# Step 2: Inject 25% MCAR missingness into the feature set X
generator_mcar25 = Inject_Missing_Values()
X_miss_mcar25, index_mcar25 = generator_mcar25.MCAR(X, missing_rate=25)

# Step 3: Report total missing percentage
total_missing_percentage = X_miss_mcar25.isnull().sum().sum() / X_miss_mcar25.size * 100
print(f"Total Missing Percentage (MCAR 25%): {total_missing_percentage:.2f}%")

# Step 4: Report missing percentage per column
missing_per_column = (X_miss_mcar25.isnull().sum() / len(X_miss_mcar25)) * 100
print(missing_per_column)


Total Missing Percentage (MCAR 25%): 25.00%
age               31.0
workclass         24.0
education-num     20.0
marital-status    26.0
occupation        27.0
relationship      23.0
race              18.0
sex               23.0
capital-gain      28.0
capital-loss      28.0
hours-per-week    23.0
native-country    29.0
dtype: float64


In [23]:
X_miss_with_Y = X_miss_mcar25.copy()
X_miss_with_Y["target"] = Y


X_numpy = X.to_numpy(dtype=float)
X_tensor = torch.tensor(X_numpy, dtype=torch.float32)


Y_numpy = Y.to_numpy(dtype=int)
Y_tensor = torch.tensor(Y_numpy, dtype=torch.float32)

X_miss_mcar25_numpy = X_miss_mcar25.to_numpy(dtype=float)
X_miss_mcar25_tensor = torch.tensor(X_miss_mcar25_numpy, dtype=torch.float32) #converting to tensor

X_miss_with_Y_numpy = X_miss_with_Y.to_numpy(dtype=float)
X_miss_with_Y_tensor = torch.tensor(X_miss_with_Y_numpy, dtype=torch.float32)


In [24]:
#Sinkhorn 

from Utils import pick_epsilon, MAE, RMSE, nanmean
from SinkhornImputation import SinkhornImputation

# Get shape of the data
n_mcar25, d_mcar25 = X_miss_mcar25_tensor.shape

# Set Sinkhorn hyperparameters
batchsize = 28  #128
lr = 1e-2
epsilon_mcar25 = pick_epsilon(X_miss_mcar25_tensor) # Determines Sinkhorn regularization strength

# Create a binary mask indicating missing positions (1.0 for missing, 0.0 otherwise)
mask_mcar25 = torch.isnan(X_miss_mcar25_tensor).double()

# Initialize and run Sinkhorn-based imputation
sinkhorn_imputer = SinkhornImputation(eps=epsilon_mcar25, batchsize=batchsize, lr=lr, niter=50)
sinkhorn_filled_tensor, maes_mcar25, rmses_mcar25 = sinkhorn_imputer.fit_transform(
    X_miss_mcar25_tensor,
    verbose=True,
    report_interval=50,
    X_true= X_tensor # for tracking MAE/RMSE during training   
)

# Convert imputed tensor to NumPy format for CMI analysis
#sinkhorn_filled_np = sinkhorn_filled_tensor.detach().cpu().numpy()

# Evaluate final MAE and RMSE only on originally missing positions
mae_final = MAE(sinkhorn_filled_tensor, X_tensor, mask_mcar25)
rmse_final = RMSE(sinkhorn_filled_tensor, X_tensor, mask_mcar25)
print(f"Final MAE (Sinkhorn, MCAR-25): {mae_final:.4f}")
print(f"Final RMSE (Sinkhorn, MCAR-25): {rmse_final:.4f}")


#Y_numpy = Y.to_numpy(dtype=int).reshape(-1,1)
Y_tensor_reshaped = Y_tensor.view(-1, 1) 
full_data_with_y_sinkhorn = torch.cat([sinkhorn_filled_tensor, Y_tensor_reshaped], dim=1)
#full_data_with_y_sinkhorn = np.concatenate([sinkhorn_filled_tensor, Y_tensor], axis=1)

# Estimate (CMI) after imputation
# for each predefined triplet using all three CMI estimation methods
for triplet in cmi_triplets:
    cmi1, cmi2, cmi3 = compute_all_cmi_methods(full_data_with_y_sinkhorn,
        data_processed_tensor, # One-hot encoded + scaled continuous
        one_hot_encoded_tensor, # Only the discrete part encoded
        continuous_part_tensor, # Only the continuous part
        triplet, # (X, Y, Z)
        encoder,
        discrete_columns,
        continuous_columns
    )

    print(f"[Sinkhorn Imputation] Triplet {triplet} :")
    print(" CMI (Soft Kronecker + Gaussian):", cmi1)
    print(" CMI (Gumbel-Softmax + Gaussian):", cmi2)
    print(" CMI (Gumbel on Discrete + Separate Kernels):", cmi3)
    print()


[GRAD] Sinkhorn grad norm: 0.185108
[GRAD] Sinkhorn grad norm: 1.418087
[GRAD] Sinkhorn grad norm: 0.224703
[GRAD] Sinkhorn grad norm: 1.794153
[GRAD] Sinkhorn grad norm: 0.213148
[GRAD] Sinkhorn grad norm: 0.226163
[GRAD] Sinkhorn grad norm: 0.161849
[GRAD] Sinkhorn grad norm: 0.205938
[GRAD] Sinkhorn grad norm: 0.257763
[GRAD] Sinkhorn grad norm: 0.726907
[GRAD] Sinkhorn grad norm: 0.178395
[GRAD] Sinkhorn grad norm: 0.600177
[GRAD] Sinkhorn grad norm: 1.199256
[GRAD] Sinkhorn grad norm: 0.832735
[GRAD] Sinkhorn grad norm: 0.209026
[GRAD] Sinkhorn grad norm: 1.519117
[GRAD] Sinkhorn grad norm: 0.169885
[GRAD] Sinkhorn grad norm: 0.584966
[GRAD] Sinkhorn grad norm: 0.311819
[GRAD] Sinkhorn grad norm: 0.162106
[GRAD] Sinkhorn grad norm: 0.230676
[GRAD] Sinkhorn grad norm: 0.316927
[GRAD] Sinkhorn grad norm: 0.500906
[GRAD] Sinkhorn grad norm: 0.236348
[GRAD] Sinkhorn grad norm: 0.210665
[GRAD] Sinkhorn grad norm: 0.205805
[GRAD] Sinkhorn grad norm: 0.832807
[GRAD] Sinkhorn grad norm: 0

In [25]:
#sinkhorn cmi

from Sinkhorn_CMI import SinkhornImputation_CMI
from Utils import pick_epsilon, MAE, RMSE

n, d = X_miss_with_Y_tensor.shape
batchsize = 28
lr = 1e-2
epsilon = pick_epsilon(X_miss_with_Y_tensor)
mask_mcar25 = torch.isnan(X_miss_with_Y_tensor).double()

results = []

for cmi_index in [0, 1, 2]:
    print(f"\n\n=========================== Running Sinkhorn_CMI with CMI Method {cmi_index + 1} ===========================")

    for triplet_index in [-1] + list(range(len(cmi_triplets))):
        sk_imputer = SinkhornImputation_CMI(
            eps=epsilon,
            batchsize=batchsize,
            lr=lr,
            niter=10,
            highest_lamda_cmi=500,
            cmi_index=cmi_index
        )

        fit_kwargs = dict(
            X=X_miss_with_Y_tensor,
            verbose=False,
            report_interval=50,
            X_true=torch.cat([X_tensor, Y_tensor.reshape(-1, 1)], dim=1),
            X_cols=[cmi_triplets[triplet_index][0]] if triplet_index != -1 else [t[0] for t in cmi_triplets],
            Y_cols=[cmi_triplets[triplet_index][1]] if triplet_index != -1 else [t[1] for t in cmi_triplets],
            Z_cols=[cmi_triplets[triplet_index][2]] if triplet_index != -1 else [t[2] for t in cmi_triplets],
            encoder=encoder,
            discrete_columns=discrete_columns,
            continuous_columns=continuous_columns
        )

        if cmi_index in [1, 2]:
            fit_kwargs["Y"] = Y_tensor

        sk_imp, maes, rmses, history = sk_imputer.fit_transform(**fit_kwargs)
        
        # --- Compute CMI for each triplet after imputation ---
        print(f"\n[CMI Calculation after Sinkhorn_CMI - Method {cmi_index + 1}]")
        full_data_with_y = sk_imp  

        for triplet in cmi_triplets:
            cmi1, cmi2, cmi3 = compute_all_cmi_methods(
            full_data_with_y,
            data_processed_tensor,
            one_hot_encoded_tensor,
            continuous_part_tensor,
            triplet,
            encoder,
            discrete_columns,
            continuous_columns
            )

            print(f"[Sinkhorn_CMI - CMI Method {cmi_index + 1}] Triplet {triplet} :")
            print(" CMI (Soft Kronecker + Gaussian):", cmi1)
            print(" CMI (Gumbel-Softmax + Gaussian):", cmi2)
            print(" CMI (Gumbel + Separate Kernels):", cmi3)
            print()

        print("\n--- Final MAE / RMSE ---")
        with torch.no_grad():
            mask = torch.isnan(X_miss_with_Y_tensor).double()
            final_mae = MAE(sk_imp, torch.cat([X_tensor, Y_tensor.reshape(-1, 1)], dim=1), mask).item()
            final_rmse = RMSE(sk_imp, torch.cat([X_tensor, Y_tensor.reshape(-1, 1)], dim=1), mask).item()
            print(f"Final MAE: {final_mae:.4f}")
            print(f"Final RMSE: {final_rmse:.4f}")



[GRAD] CMI-1 grad norm: 1.426078
[GRAD] CMI-1 grad norm: 0.514196
[GRAD] CMI-1 grad norm: 0.370556
[GRAD] CMI-1 grad norm: 0.557836
[GRAD] CMI-1 grad norm: 2.082607
[GRAD] CMI-1 grad norm: 0.748652
[GRAD] CMI-1 grad norm: 0.592317
[GRAD] CMI-1 grad norm: 0.783409
[GRAD] CMI-1 grad norm: 1.444320
[GRAD] CMI-1 grad norm: 0.471525
[GRAD] CMI-1 grad norm: 0.482404
[GRAD] CMI-1 grad norm: 0.609008
[GRAD] CMI-1 grad norm: 2.176094
[GRAD] CMI-1 grad norm: 0.999244
[GRAD] CMI-1 grad norm: 0.577301
[GRAD] CMI-1 grad norm: 0.778857
[GRAD] CMI-1 grad norm: 1.749595
[GRAD] CMI-1 grad norm: 0.742837
[GRAD] CMI-1 grad norm: 0.409143
[GRAD] CMI-1 grad norm: 0.593615
[GRAD] CMI-1 grad norm: 2.381479
[GRAD] CMI-1 grad norm: 0.681137
[GRAD] CMI-1 grad norm: 0.602518
[GRAD] CMI-1 grad norm: 0.682393
[GRAD] CMI-1 grad norm: 1.103637
[GRAD] CMI-1 grad norm: 0.514185
[GRAD] CMI-1 grad norm: 0.246849
[GRAD] CMI-1 grad norm: 0.481809
[GRAD] CMI-1 grad norm: 2.172309
[GRAD] CMI-1 grad norm: 0.464204
[GRAD] C

[GRAD] CMI-1 grad norm: 1.202474
[GRAD] CMI-1 grad norm: 0.478592
[GRAD] CMI-1 grad norm: 1.701055
[GRAD] CMI-1 grad norm: 0.500532
[GRAD] CMI-1 grad norm: 0.565811
[GRAD] CMI-1 grad norm: 0.366270
[GRAD] CMI-1 grad norm: 0.428999
[GRAD] CMI-1 grad norm: 0.482201

[CMI Calculation after Sinkhorn_CMI - Method 1]
[Sinkhorn_CMI - CMI Method 1] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0829, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1550)
 CMI (Gumbel + Separate Kernels): tensor(0.1993)

[Sinkhorn_CMI - CMI Method 1] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1325, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.2828)
 CMI (Gumbel + Separate Kernels): tensor(0.3839)

[Sinkhorn_CMI - CMI Method 1] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1359, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1200)
 CMI (Gumbel + Separate Kernels): tensor(0.1430)

[Sinkhorn_CMI - CM

[Sinkhorn_CMI - CMI Method 1] Triplet (11, 10, 4) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1177, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.3257)
 CMI (Gumbel + Separate Kernels): tensor(0.3240)


--- Final MAE / RMSE ---
Final MAE: 0.7991
Final RMSE: 1.7348
[GRAD] CMI-1 grad norm: 0.592317
[GRAD] CMI-1 grad norm: 0.607706
[GRAD] CMI-1 grad norm: 1.263560
[GRAD] CMI-1 grad norm: 0.606343
[GRAD] CMI-1 grad norm: 1.775590
[GRAD] CMI-1 grad norm: 0.569045
[GRAD] CMI-1 grad norm: 0.720293
[GRAD] CMI-1 grad norm: 0.711407
[GRAD] CMI-1 grad norm: 0.693647
[GRAD] CMI-1 grad norm: 0.770995

[CMI Calculation after Sinkhorn_CMI - Method 1]
[Sinkhorn_CMI - CMI Method 1] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0800, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1550)
 CMI (Gumbel + Separate Kernels): tensor(0.1993)

[Sinkhorn_CMI - CMI Method 1] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1367, grad_fn=<Su

[GRAD] CMI-2 grad norm: 0.192469
[GRAD] CMI-2 grad norm: 1.125561
[GRAD] CMI-2 grad norm: 0.230346
[GRAD] CMI-2 grad norm: 1.614228
[GRAD] CMI-2 grad norm: 0.194893
[GRAD] CMI-2 grad norm: 0.220441
[GRAD] CMI-2 grad norm: 0.163861
[GRAD] CMI-2 grad norm: 0.200818
[GRAD] CMI-2 grad norm: 0.234452

[CMI Calculation after Sinkhorn_CMI - Method 2]
[Sinkhorn_CMI - CMI Method 2] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0799, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1403)
 CMI (Gumbel + Separate Kernels): tensor(0.2408)

[Sinkhorn_CMI - CMI Method 2] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1368, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.3039)
 CMI (Gumbel + Separate Kernels): tensor(0.3819)

[Sinkhorn_CMI - CMI Method 2] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1337, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1436)
 CMI (Gumbel + Separate Kernels): t

[GRAD] CMI-2 grad norm: 1.125561
[GRAD] CMI-2 grad norm: 0.230346
[GRAD] CMI-2 grad norm: 1.614228
[GRAD] CMI-2 grad norm: 0.194893
[GRAD] CMI-2 grad norm: 0.220441
[GRAD] CMI-2 grad norm: 0.163861
[GRAD] CMI-2 grad norm: 0.200818
[GRAD] CMI-2 grad norm: 0.234452

[CMI Calculation after Sinkhorn_CMI - Method 2]
[Sinkhorn_CMI - CMI Method 2] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0799, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1403)
 CMI (Gumbel + Separate Kernels): tensor(0.2408)

[Sinkhorn_CMI - CMI Method 2] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1368, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.3039)
 CMI (Gumbel + Separate Kernels): tensor(0.3819)

[Sinkhorn_CMI - CMI Method 2] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1337, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1436)
 CMI (Gumbel + Separate Kernels): tensor(0.1344)

[Sinkhorn_CMI - CM

[GRAD] CMI-3 grad norm: 0.192469
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 1.125561
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.230346
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 1.614228
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI-3 grad norm: 0.000000
[GRAD] CMI

[GRAD] CMI-3 grad norm: 0.192469
[GRAD] CMI-3 grad norm: 1.125561
[GRAD] CMI-3 grad norm: 0.230346
[GRAD] CMI-3 grad norm: 1.614228
[GRAD] CMI-3 grad norm: 0.194893
[GRAD] CMI-3 grad norm: 0.220441
[GRAD] CMI-3 grad norm: 0.163861
[GRAD] CMI-3 grad norm: 0.200818
[GRAD] CMI-3 grad norm: 0.234452

[CMI Calculation after Sinkhorn_CMI - Method 3]
[Sinkhorn_CMI - CMI Method 3] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0799, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1403)
 CMI (Gumbel + Separate Kernels): tensor(0.2408)

[Sinkhorn_CMI - CMI Method 3] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1368, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.3039)
 CMI (Gumbel + Separate Kernels): tensor(0.3819)

[Sinkhorn_CMI - CMI Method 3] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1337, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1436)
 CMI (Gumbel + Separate Kernels): t

[GRAD] CMI-3 grad norm: 1.125561
[GRAD] CMI-3 grad norm: 0.230346
[GRAD] CMI-3 grad norm: 1.614228
[GRAD] CMI-3 grad norm: 0.194893
[GRAD] CMI-3 grad norm: 0.220441
[GRAD] CMI-3 grad norm: 0.163861
[GRAD] CMI-3 grad norm: 0.200818
[GRAD] CMI-3 grad norm: 0.234452

[CMI Calculation after Sinkhorn_CMI - Method 3]
[Sinkhorn_CMI - CMI Method 3] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0799, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1403)
 CMI (Gumbel + Separate Kernels): tensor(0.2408)

[Sinkhorn_CMI - CMI Method 3] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1368, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.3039)
 CMI (Gumbel + Separate Kernels): tensor(0.3819)

[Sinkhorn_CMI - CMI Method 3] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1337, grad_fn=<SubBackward0>)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1436)
 CMI (Gumbel + Separate Kernels): tensor(0.1344)

[Sinkhorn_CMI - CM

In [26]:
# Mean Imputation

# --- Step 1: Mean Imputation on X only (without Y) ---
from sklearn.impute import SimpleImputer

# X only (without Y), as NumPy and Tensor
X_miss_np = X_miss_mcar25.to_numpy(dtype=float)
X_miss_tensor = torch.tensor(X_miss_np, dtype=torch.float32)

# Groundtruth X only (without Y)
X_true_only = X_tensor[:, :X_miss_tensor.shape[1]]

# Mask over X only
mask_mcar25 = torch.isnan(X_miss_tensor).double()

# Run Mean Imputation
mean_imp_np = SimpleImputer().fit_transform(X_miss_tensor)
mean_imp_torch = torch.tensor(mean_imp_np, dtype=torch.float32)

# Step 2: Evaluate MAE only on X (exclude Y from groundtruth comparison)
mean_mcae_mcar25 = MAE(mean_imp_torch, X_true_only, mask_mcar25)
print("Final MAE (Mean Imputation):", mean_mcae_mcar25.item())

# Step 3: Append Y to imputed X for CMI computation
Y_tensor_reshaped = Y_tensor.view(-1, 1)
full_data_with_y_mean = torch.cat([mean_imp_torch, Y_tensor_reshaped], dim=1)

# --- Step 4: Compute CMI for all triplets ---
for triplet in cmi_triplets:
    cmi1, cmi2, cmi3 = compute_all_cmi_methods(
        full_data_with_y_mean,
        data_processed_tensor,           # One-hot + scaled continuous
        one_hot_encoded_tensor,          # Encoded discrete
        continuous_part_tensor,          # Continuous only
        triplet,
        encoder,
        discrete_columns,
        continuous_columns
    )

    print(f"[Mean Imputation] Triplet {triplet} :")
    print(" CMI (Soft Kronecker + Gaussian):", cmi1)
    print(" CMI (Gumbel-Softmax + Gaussian):", cmi2)
    print(" CMI (Gumbel + Separate Kernels):", cmi3)
    print()

Final MAE (Mean Imputation): 0.8183895945549011
[Mean Imputation] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0786)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.2033)
 CMI (Gumbel + Separate Kernels): tensor(0.2360)

[Mean Imputation] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0863)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.3747)
 CMI (Gumbel + Separate Kernels): tensor(0.4076)

[Mean Imputation] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0925)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1481)
 CMI (Gumbel + Separate Kernels): tensor(0.1730)

[Mean Imputation] Triplet (11, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0934)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.6425)
 CMI (Gumbel + Separate Kernels): tensor(0.5771)

[Mean Imputation] Triplet (0, 10, 4) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0569)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1332)
 CMI (Gumbel + Separate Kernels): tensor(0.1376)

[Mean Imputation

In [27]:
# --- Step 1: Imputation by Chained Equations on X only (without Y) ---
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


X_miss_np = X_miss_mcar25.to_numpy(dtype=float)
X_miss_tensor = torch.tensor(X_miss_np, dtype=torch.float32)
X_true_only = X_tensor[:, :X_miss_tensor.shape[1]]
mask_mcar25 = torch.isnan(X_miss_tensor).double()

# Run MICE / ICE
ice_imp_np = IterativeImputer(random_state=0, max_iter=500).fit_transform(X_miss_tensor)
ice_imp_torch = torch.tensor(ice_imp_np, dtype=torch.float32)

# Step 2: Evaluate MAE only on X
ice_mae_mcar25 = MAE(ice_imp_torch, X_true_only, mask_mcar25)
print("Final MAE (Imputation by Chained Equations):", ice_mae_mcar25.item())

# Step 3: Append Y to imputed X
Y_tensor_reshaped = Y_tensor.view(-1, 1)
full_data_with_y_Chained_Equations = torch.cat([ice_imp_torch, Y_tensor_reshaped], dim=1)

# --- Step 4: Compute CMI for all triplets ---
for triplet in cmi_triplets:
    cmi1, cmi2, cmi3 = compute_all_cmi_methods(
        full_data_with_y_Chained_Equations,
        data_processed_tensor,           # One-hot + scaled continuous
        one_hot_encoded_tensor,          # Encoded discrete
        continuous_part_tensor,          # Continuous only
        triplet,
        encoder,
        discrete_columns,
        continuous_columns
    )

    print(f"[Imputation by Chained Equations] Triplet {triplet} :")
    print(" CMI (Soft Kronecker + Gaussian):", cmi1)
    print(" CMI (Gumbel-Softmax + Gaussian):", cmi2)
    print(" CMI (Gumbel + Separate Kernels):", cmi3)
    print()

Final MAE (Imputation by Chained Equations): 0.7974743843078613
[Imputation by Chained Equations] Triplet (0, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0897)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1619)
 CMI (Gumbel + Separate Kernels): tensor(0.2229)

[Imputation by Chained Equations] Triplet (6, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0908)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.2530)
 CMI (Gumbel + Separate Kernels): tensor(0.3171)

[Imputation by Chained Equations] Triplet (7, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1292)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.1422)
 CMI (Gumbel + Separate Kernels): tensor(0.1767)

[Imputation by Chained Equations] Triplet (11, 12, 2) :
 CMI (Soft Kronecker + Gaussian): tensor(0.1038)
 CMI (Gumbel-Softmax + Gaussian): tensor(0.7148)
 CMI (Gumbel + Separate Kernels): tensor(0.5750)

[Imputation by Chained Equations] Triplet (0, 10, 4) :
 CMI (Soft Kronecker + Gaussian): tensor(0.0753)
 CMI (Gumbel-Softma

In [None]:
# --- Step 1: SoftImpute only on X (exclude Y) ---
from SoftImpute import cv_softimpute, softimpute


X_miss_np = X_miss_mcar25.to_numpy(dtype=float)
X_true_only = X_tensor[:, :X_miss_np.shape[1]]
mask_mcar25 = torch.isnan(torch.tensor(X_miss_np, dtype=torch.float32)).double()

# Run CV to select lambda
cv_error_mcar25, grid_lambda_mcar25 = cv_softimpute(X_miss_np, grid_len=15)
lbda_mcar25 = grid_lambda_mcar25[np.argmin(cv_error_mcar25)]

# Run soft imputation
soft_imp_mcar25 = softimpute(X_miss_np, lbda_mcar25)[1]
soft_imp_mcar25_torch = torch.tensor(soft_imp_mcar25, dtype=torch.float32)

# Step 2: Evaluate MAE
soft_mae_mcar25 = MAE(soft_imp_mcar25_torch, X_true_only, mask_mcar25)
print("Final MAE (Soft Imputation):", soft_mae_mcar25.item())

# Step 3: Concatenate Y
Y_tensor_reshaped = Y_tensor.view(-1, 1)
full_data_with_y_soft = torch.cat([soft_imp_mcar25_torch, Y_tensor_reshaped], dim=1)

# --- Step 4: Compute CMI for all triplets ---
for triplet in cmi_triplets:
    cmi1, cmi2, cmi3 = compute_all_cmi_methods(
        full_data_with_y_soft,
        data_processed_tensor,
        one_hot_encoded_tensor,
        continuous_part_tensor,
        triplet,
        encoder,
        discrete_columns,
        continuous_columns
    )

    print(f"[Soft Imputation] Triplet {triplet} :")
    print(" CMI (Soft Kronecker + Gaussian):", cmi1)
    print(" CMI (Gumbel-Softmax + Gaussian):", cmi2)
    print(" CMI (Gumbel + Separate Kernels):", cmi3)
    print()

# ______________________________________