In [14]:
import pandas as pd
import numpy as np
import torch
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn.metrics import mean_squared_error
import math

# Iris dataset (150 samples, 4 features)
iris = load_iris()
data = iris.data

# Convert to DataFrame
df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(data.shape[1])])

# Print the original data before introducing missing values
print("Original Data:")
print(df.head())

# Store a copy of the original data for later comparison
ground_values = df.copy()

# Introduce artificial missing values
missing_fraction = 0.3  # Adjust the fraction of missing values
mask = np.random.rand(*data.shape) < missing_fraction
df[mask] = np.nan

# Print the input dataset with missing values
print("\nInput Data with Missing Values:")
print(df.head())

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = df.columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print('\nAfter Normalizing Input Data:')
print(df.head())

# Fill missing values with zero placeholder
df[numeric_cols] = df[numeric_cols].fillna(0)

print("\nFilled Missing Values with Zero Placeholder:")
print(df.head())

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    n_d=32,  # Increase the number of decision steps
    n_a=32   # Increase the number of features shared
)

max_epochs = 50
pretrained_model.fit(
    df[numeric_cols].values,
    max_epochs=max_epochs
)
print("\nPretrained TabNet Model:")

# Define the tabnet_recon function
def tabnet_recon(df, network, df_mean=0, df_std=1):
    df_train = df.copy()
    df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])
    
    # Convert input data to tensors for use in the TabNet network
    input_data = torch.tensor(df_train[numeric_cols].values, dtype=torch.float32)
    
    # Pass the input data through the TabNet network
    results = network.predict(input_data)
    
    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple
    
    # Denormalize the reconstructed data
    df_na_tab = (results * df_std) + df_mean
    
    # Patch the reconstructed data into the original data with missing values
    df_rec_tab = df.copy()
    df_rec_tab.update(df_na_tab)
    
    return df_rec_tab

# Extract true missing values before filling
true_missing_values = df[numeric_cols].values

print('\nTrue Missing Values:')
print(true_missing_values)

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(df, network=pretrained_model)

print('\nReconstructed Data:')
print(reconstructed_data.head())

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

print('\nImputed Values:')
print(imputed_values)

# Denormalize the reconstructed data
reconstructed_data[numeric_cols] = scaler.inverse_transform(reconstructed_data[numeric_cols])

print("\nDenormalized Reconstructed Data:")
print(reconstructed_data.head())

# Print the ground values again (before processing) at the end
print("\nGround Values (Before Processing, At the End):")
print(ground_values.head())

# Compare the denormalized reconstructed data with the original input data
# Calculate RMSE
rmse = math.sqrt(mean_squared_error(ground_values[numeric_cols].values, reconstructed_data[numeric_cols].values))
print("\nRoot Mean Squared Error (RMSE):", rmse)


Original Data:
   feature_0  feature_1  feature_2  feature_3
0        5.1        3.5        1.4        0.2
1        4.9        3.0        1.4        0.2
2        4.7        3.2        1.3        0.2
3        4.6        3.1        1.5        0.2
4        5.0        3.6        1.4        0.2

Input Data with Missing Values:
   feature_0  feature_1  feature_2  feature_3
0        5.1        3.5        1.4        0.2
1        4.9        3.0        NaN        0.2
2        4.7        NaN        1.3        0.2
3        4.6        NaN        NaN        0.2
4        NaN        3.6        NaN        NaN

After Normalizing Input Data:
   feature_0  feature_1  feature_2  feature_3
0  -0.769578   0.938227  -1.418931  -1.211122
1  -1.011204  -0.182044        NaN  -1.211122
2  -1.252829        NaN  -1.477258  -1.211122
3  -1.373642        NaN        NaN  -1.211122
4        NaN   1.162281        NaN        NaN

Filled Missing Values with Zero Placeholder:
   feature_0  feature_1  feature_2  feature_3
0