# Iris Data

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn.metrics import mean_squared_error
import math

# Iris dataset (150 samples, 4 features)
iris = load_iris()
data = iris.data

# Convert to DataFrame
df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(data.shape[1])])

# Print the original data before introducing missing values
print("Original Data:")
print(df.head())

# Store a copy of the original data for later comparison
ground_values = df.copy()

# Introduce artificial missing values
missing_fraction = 0.3  # Adjust the fraction of missing values
mask = np.random.rand(*data.shape) < missing_fraction
df[mask] = np.nan

# Print the input dataset with missing values
print("\nInput Data with Missing Values:")
print(df.head())

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = df.columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print('\nAfter Normalizing Input Data:')
print(df.head())

# Fill missing values with zero placeholder
df[numeric_cols] = df[numeric_cols].fillna(0)

print("\nFilled Missing Values with Zero Placeholder:")
print(df.head())

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    n_d=32,  # Increase the number of decision steps
    n_a=32   # Increase the number of features shared
)

max_epochs = 100
pretrained_model.fit(
    df[numeric_cols].values,
    max_epochs=max_epochs
)
print("\nPretrained TabNet Model:")

def tabnet_recon(df, network, df_mean=0, df_std=1):
    df_train = df.copy()
    df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])

    # Convert input data to tensors for use in the TabNet network
    input_data = torch.tensor(df_train[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(input_data)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # Denormalize the reconstructed data
    df_na_tab = (results * df_std) + df_mean

    # Initialize a new DataFrame with the same structure as df
    df_rec_tab = df.copy()

    # Convert numeric_cols to a list
    numeric_cols_list = list(numeric_cols)

    for col in numeric_cols_list:
        df_rec_tab[col] = df_na_tab[:, numeric_cols_list.index(col)]

    return df_rec_tab

# Extract true missing values before filling
true_missing_values = df[numeric_cols].values

print('\nTrue Missing Values:')
print(true_missing_values)

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(df, network=pretrained_model)

print('\nReconstructed Data:')
print(reconstructed_data.head())

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

print('\nImputed Values:')
print(imputed_values)

# Denormalize the reconstructed data
reconstructed_data[numeric_cols] = scaler.inverse_transform(reconstructed_data)

print("\nDenormalized Reconstructed Data:")
print(reconstructed_data.head())

# Print the ground values again (before processing) at the end
print("\nGround Values (Before Processing, At the End):")
print(ground_values.head())

# Compare the denormalized reconstructed data with the original input data
# Calculate RMSE
rmse = math.sqrt(mean_squared_error(ground_values[numeric_cols].values, reconstructed_data[numeric_cols].values))
print("\nRoot Mean Squared Error (RMSE):", rmse)


Original Data:
   feature_0  feature_1  feature_2  feature_3
0        5.1        3.5        1.4        0.2
1        4.9        3.0        1.4        0.2
2        4.7        3.2        1.3        0.2
3        4.6        3.1        1.5        0.2
4        5.0        3.6        1.4        0.2

Input Data with Missing Values:
   feature_0  feature_1  feature_2  feature_3
0        5.1        3.5        1.4        0.2
1        4.9        3.0        NaN        NaN
2        NaN        NaN        1.3        0.2
3        NaN        3.1        1.5        0.2
4        5.0        3.6        1.4        0.2

After Normalizing Input Data:
   feature_0  feature_1  feature_2  feature_3
0  -0.904167   0.994617  -1.326042  -1.253879
1  -1.140036  -0.197561        NaN        NaN
2        NaN        NaN  -1.382446  -1.253879
3        NaN   0.040875  -1.269638  -1.253879
4  -1.022101   1.233053  -1.326042  -1.253879

Filled Missing Values with Zero Placeholder:
   feature_0  feature_1  feature_2  feature_3
0

# Synthetic data (Random)

In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn.metrics import mean_squared_error
import math

# Create a random dataset (e.g., 500 samples, 10 features)
np.random.seed(0)
n_samples = 2500
n_features = 15
data = np.random.randn(n_samples, n_features)

# Convert to DataFrame
df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(n_features)])

# Print the original data before introducing missing values
print("Original Data:")
print(df.head())

# Store a copy of the original data for later comparison
ground_values = df.copy()

# Introduce artificial missing values
missing_fraction = 0.3  # Adjust the fraction of missing values
mask = np.random.rand(n_samples, n_features) < missing_fraction
df[mask] = np.nan

# Print the input dataset with missing values
print("\nInput Data with Missing Values:")
print(df.head())

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = df.columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print('\nAfter Normalizing Input Data:')
print(df.head())

# Fill missing values with zero placeholder
df[numeric_cols] = df[numeric_cols].fillna(0)

print("\nFilled Missing Values with Zero Placeholder:")
print(df.head())

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    n_d=32,  # Increase the number of decision steps
    n_a=32   # Increase the number of features shared
)

max_epochs = 50
pretrained_model.fit(
    df[numeric_cols].values,
    max_epochs=max_epochs
)
print("\nPretrained TabNet Model:")

def tabnet_recon(df, network, df_mean=0, df_std=1):
    df_train = df.copy()
    df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])

    # Convert input data to tensors for use in the TabNet network
    input_data = torch.tensor(df_train[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(input_data)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # Denormalize the reconstructed data
    df_na_tab = (results * df_std) + df_mean

    # Initialize a new DataFrame with the same structure as df
    df_rec_tab = df.copy()

    # Convert numeric_cols to a list
    numeric_cols_list = list(numeric_cols)

    # Only update non-missing values, leave missing values as zeros
    for col in numeric_cols_list:
        df_rec_tab[col] = df_na_tab[:, numeric_cols_list.index(col)]

    return df_rec_tab

# Extract true missing values before filling
true_missing_values = df[numeric_cols].values

print('\nTrue Missing Values:')
print(true_missing_values)

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(df, network=pretrained_model)

print('\nReconstructed Data:')
print(reconstructed_data.head())

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

print('\nImputed Values:')
print(imputed_values)

# Denormalize the reconstructed data
reconstructed_data[numeric_cols] = scaler.inverse_transform(reconstructed_data[numeric_cols])

print("\nDenormalized Reconstructed Data:")
print(reconstructed_data.head())

# Print the ground values again (before processing) at the end
print("\nGround Values (Before Processing, At the End):")
print(ground_values.head())

# Compare the denormalized reconstructed data with the original input data
# Calculate RMSE
rmse = math.sqrt(mean_squared_error(ground_values[numeric_cols].values, reconstructed_data[numeric_cols].values))
print("\nRoot Mean Squared Error (RMSE):", rmse)


Original Data:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   1.764052   0.400157   0.978738   2.240893   1.867558  -0.977278   
1   0.333674   1.494079  -0.205158   0.313068  -0.854096  -2.552990   
2   0.154947   0.378163  -0.887786  -1.980796  -0.347912   0.156349   
3  -0.438074  -1.252795   0.777490  -1.613898  -0.212740  -0.895467   
4  -0.672460  -0.359553  -0.813146  -1.726283   0.177426  -0.401781   

   feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  \
0   0.950088  -0.151357  -0.103219   0.410599    0.144044    1.454274   
1   0.653619   0.864436  -0.742165   2.269755   -1.454366    0.045759   
2   1.230291   1.202380  -0.387327  -0.302303   -1.048553   -1.420018   
3   0.386902  -0.510805  -1.180632  -0.028182    0.428332    0.066517   
4  -1.630198   0.462782  -0.907298   0.051945    0.729091    0.128983   

   feature_12  feature_13  feature_14  
0    0.761038    0.121675    0.443863  
1   -0.187184    1.532779    1.469359  