# Artifitially generated data (1500*15)

In [72]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

# larger example dataset with missing values
np.random.seed(42)
num_samples = 1500
num_features = 10
data = {
    f'feature{i}': np.random.normal(size=num_samples) for i in range(1, num_features + 1)
}
data['subtype'] = np.random.choice(['A', 'B', 'C'], size=num_samples)

# Print input dataset with sequentially increasing values
input_data = pd.DataFrame(data)
print("Input Data with random Values:")
print(input_data.head())


# Introduce random missing values
missing_mask = np.random.rand(num_samples, num_features) < 0.2
for col in data.keys():
    if col != 'subtype':
        data[col][missing_mask[:, int(col[-1]) - 1]] = np.nan

omicMiss = pd.DataFrame(data)

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = [f'feature{i}' for i in range(1, num_features + 1)]
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalization:')
print (omicMiss.head())

# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)

max_epochs = 15
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    max_epochs=max_epochs
)

# Define the tabnet_recon function

def tabnet_recon(omicMiss, network):
    omicMissTrain = omicMiss.copy()

    # Normalize the missing data
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])

    # Create a mask for missing values
    missing_mask = omicMissTrain[numeric_cols].isnull().values

    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(inputData)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # If there are no missing values, return the original DataFrame
    if np.sum(missing_mask) == 0:
        return omicMissTrain

    # Denormalize and reconstruct the missing values
    imputed_values = results[missing_mask] * scaler.scale_ + scaler.mean_

    # Replace missing values with imputed values
    omicMissTrain.loc[missing_mask, numeric_cols] = imputed_values

    return omicMissTrain



# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))

# Print original and reconstructed data (only printing a subset)
print("Original Data:")
print(omicMiss.head())
print("\nReconstructed Data:")
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)


Input Data with random Values:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0  0.496714  0.778361 -1.907808 -0.958778 -1.114081  1.725694  0.765402   
1 -0.138264 -0.551186 -0.860385 -1.352509 -0.630931  0.121844  1.073413   
2  0.647689 -0.818199 -0.413606 -1.583588 -0.942060  0.753417  0.498690   
3  1.523030 -0.003374  1.887688  0.412999 -0.547996  0.099826 -1.942498   
4 -0.234153 -0.170185  0.556553 -0.214068 -0.214150 -0.667333 -0.155422   

   feature8  feature9  feature10 subtype  
0  1.541321  1.174814   0.430846       A  
1  1.333998 -1.878981   0.236542       B  
2  0.777735 -0.327795   0.767063       C  
3  0.074686 -0.041660   0.537984       C  
4  0.022500  0.015909   0.717846       A  
After Normalization:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0  0.438938  0.800388       NaN -0.948301       NaN  1.671872  0.831004   
1       NaN -0.558943 -0.793387 -1.341602       NaN       NaN  1.153698   
2  0.593039 -0

# sequential increasing_subtype

In [57]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

# Create a sequentially increasing dataset
np.random.seed(42)
num_samples = 1500
num_features = 10

data = {
    f'feature{i}': np.linspace(i, i + num_samples - 1, num_samples) for i in range(1, num_features + 1)
}
data['subtype'] = np.random.choice(['A', 'B', 'C'], size=num_samples)


# Print input dataset with sequentially increasing values
input_data = pd.DataFrame(data)
print("Input Data with Sequentially Increasing Values:")
print(input_data.head())

# Create a DataFrame
omicMiss = pd.DataFrame(input_data)

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = [f'feature{i}' for i in range(1, num_features + 1)]
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalizing Input Data:')
print (omicMiss)

# Introduce random missing values
missing_mask = np.random.rand(num_samples, num_features) < 0.2
for col in data.keys():
    if col != 'subtype':
        omicMiss[col][missing_mask[:, int(col[-1]) - 1]] = np.nan

# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)
max_epochs = 15
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    max_epochs=max_epochs
)

# Define the tabnet_recon function
def tabnet_recon(omicMiss, network):
    omicMissTrain = omicMiss.copy()

    # Normalize the missing data
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])

    # Create a mask for missing values
    missing_mask = omicMissTrain[numeric_cols].isnull().values

    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(inputData)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # If there are no missing values, return the original DataFrame
    if np.sum(missing_mask) == 0:
        return omicMissTrain

    # Denormalize and reconstruct the missing values
    imputed_values = results[missing_mask] * scaler.scale_ + scaler.mean_

    # Replace missing values with imputed values
    omicMissTrain.loc[missing_mask, numeric_cols] = imputed_values

    return omicMissTrain

# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))



# Print original and reconstructed data (only printing a subset)
print("Original Data:")
print(omicMiss.head())
print("\nReconstructed Data:")
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)


Input Data with Sequentially Increasing Values:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0       1.0       2.0       3.0       4.0       5.0       6.0       7.0   
1       2.0       3.0       4.0       5.0       6.0       7.0       8.0   
2       3.0       4.0       5.0       6.0       7.0       8.0       9.0   
3       4.0       5.0       6.0       7.0       8.0       9.0      10.0   
4       5.0       6.0       7.0       8.0       9.0      10.0      11.0   

   feature8  feature9  feature10 subtype  
0       8.0       9.0       10.0       C  
1       9.0      10.0       11.0       A  
2      10.0      11.0       12.0       C  
3      11.0      12.0       13.0       C  
4      12.0      13.0       14.0       A  
After Normalizing Input Data:
      feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0    -1.730896 -1.730896 -1.730896 -1.730896 -1.730896 -1.730896 -1.730896   
1    -1.728587 -1.728587 -1.728587 -1.728587 -1.728587 -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  omicMiss[col][missing_mask[:, int(col[-1]) - 1]] = np.nan


epoch 0  | loss: 2.84049 |  0:00:00s
epoch 1  | loss: 1.52872 |  0:00:01s
epoch 2  | loss: 1.05392 |  0:00:03s
epoch 3  | loss: 0.93024 |  0:00:04s
epoch 4  | loss: 0.81558 |  0:00:06s
epoch 5  | loss: 0.70546 |  0:00:07s
epoch 6  | loss: 0.63075 |  0:00:09s
epoch 7  | loss: 0.57067 |  0:00:11s
epoch 8  | loss: 0.50782 |  0:00:12s
epoch 9  | loss: 0.44913 |  0:00:14s
epoch 10 | loss: 0.41588 |  0:00:16s
epoch 11 | loss: 0.41101 |  0:00:17s
epoch 12 | loss: 0.35871 |  0:00:18s
epoch 13 | loss: 0.35479 |  0:00:19s
epoch 14 | loss: 0.35125 |  0:00:21s
Original Data:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0  0.000000 -1.730896  0.000000 -1.730896 -1.730896  0.000000 -1.730896   
1 -1.728587 -1.728587 -1.728587 -1.728587 -1.728587 -1.728587 -1.728587   
2 -1.726278  0.000000 -1.726278 -1.726278  0.000000 -1.726278 -1.726278   
3 -1.723968 -1.723968 -1.723968 -1.723968 -1.723968  0.000000 -1.723968   
4 -1.721659 -1.721659 -1.721659  0.000000 -1.721659 -1.

# sequential increasing w/o subtype

In [58]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

# Create a sequentially increasing dataset
np.random.seed(42)
num_samples = 1500
num_features = 10

data = {
    f'feature{i}': np.linspace(i, i + num_samples - 1, num_samples) for i in range(1, num_features + 1)
}

# Print input dataset with sequentially increasing values
input_data = pd.DataFrame(data)
print("Input Data with Sequentially Increasing Values:")
print(input_data.head())

# Create a DataFrame
omicMiss = pd.DataFrame(input_data)

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = [f'feature{i}' for i in range(1, num_features + 1)]
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalizing Input Data:')
print (omicMiss.head())

# Introduce random missing values
missing_mask = np.random.rand(num_samples, num_features) < 0.2
for col in data.keys():
    omicMiss[col][missing_mask[:, int(col[-1]) - 1]] = np.nan

# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)
max_epochs = 15
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    max_epochs=max_epochs
)

# Define the tabnet_recon function
def tabnet_recon(omicMiss, network):
    omicMissTrain = omicMiss.copy()

    # Normalize the missing data
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])

    # Create a mask for missing values
    missing_mask = omicMissTrain[numeric_cols].isnull().values

    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(inputData)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # If there are no missing values, return the original DataFrame
    if np.sum(missing_mask) == 0:
        return omicMissTrain

    # Denormalize and reconstruct the missing values
    imputed_values = results[missing_mask] * scaler.scale_ + scaler.mean_

    # Replace missing values with imputed values
    omicMissTrain.loc[missing_mask, numeric_cols] = imputed_values

    return omicMissTrain

# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))

# Print original and reconstructed data (only printing a subset)
print("Original Data:")
print(omicMiss.head())
print("\nReconstructed Data:")
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)


Input Data with Sequentially Increasing Values:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0       1.0       2.0       3.0       4.0       5.0       6.0       7.0   
1       2.0       3.0       4.0       5.0       6.0       7.0       8.0   
2       3.0       4.0       5.0       6.0       7.0       8.0       9.0   
3       4.0       5.0       6.0       7.0       8.0       9.0      10.0   
4       5.0       6.0       7.0       8.0       9.0      10.0      11.0   

   feature8  feature9  feature10  
0       8.0       9.0       10.0  
1       9.0      10.0       11.0  
2      10.0      11.0       12.0  
3      11.0      12.0       13.0  
4      12.0      13.0       14.0  
After Normalizing Input Data:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0 -1.730896 -1.730896 -1.730896 -1.730896 -1.730896 -1.730896 -1.730896   
1 -1.728587 -1.728587 -1.728587 -1.728587 -1.728587 -1.728587 -1.728587   
2 -1.726278 -1.726278 -1.726278 -1.7

# sequential increasing rows by 0.1

In [68]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

# Create a sequentially increasing dataset with multiple rows
np.random.seed(42)
num_samples = 1000  # Number of rows
num_features = 10

# Generate data with sequential increasing values by 0.5
data = np.arange(1, (num_samples * num_features) + 1).reshape(num_samples, num_features) * 0.1

# Convert the data array into a DataFrame
input_data = pd.DataFrame(data, columns=[f'feature{i}' for i in range(1, num_features + 1)])

# Print input dataset with sequentially increasing values
print("Input Data with Sequentially Increasing Values:")
print(input_data.head())


# Create a DataFrame
omicMiss = pd.DataFrame(input_data)

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = [f'feature{i}' for i in range(1, num_features + 1)]
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalizing Input Data:')
print (omicMiss.head())

# Introduce random missing values
missing_mask = np.random.rand(num_samples, num_features) < 0.2
for col in omicMiss.columns:
    omicMiss[col][missing_mask[:, int(col[-1]) - 1]] = np.nan


# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)
max_epochs = 50
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    max_epochs=max_epochs
)

# Define the tabnet_recon function
def tabnet_recon(omicMiss, network):
    omicMissTrain = omicMiss.copy()

    # Normalize the missing data
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])

    # Create a mask for missing values
    missing_mask = omicMissTrain[numeric_cols].isnull().values

    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(inputData)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # If there are no missing values, return the original DataFrame
    if np.sum(missing_mask) == 0:
        return omicMissTrain

    # Denormalize and reconstruct the missing values
    imputed_values = results[missing_mask] * scaler.scale_ + scaler.mean_

    # Replace missing values with imputed values
    omicMissTrain.loc[missing_mask, numeric_cols] = imputed_values

    return omicMissTrain

# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))

# Print original and reconstructed data (only printing a subset)
print("Original Data:")
print(omicMiss.head())
print("\nReconstructed Data:")
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)


Input Data with Sequentially Increasing Values:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0       0.1       0.2       0.3       0.4       0.5       0.6       0.7   
1       1.1       1.2       1.3       1.4       1.5       1.6       1.7   
2       2.1       2.2       2.3       2.4       2.5       2.6       2.7   
3       3.1       3.2       3.3       3.4       3.5       3.6       3.7   
4       4.1       4.2       4.3       4.4       4.5       4.6       4.7   

   feature8  feature9  feature10  
0       0.8       0.9        1.0  
1       1.8       1.9        2.0  
2       2.8       2.9        3.0  
3       3.8       3.9        4.0  
4       4.8       4.9        5.0  
After Normalizing Input Data:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0 -1.730320 -1.730320 -1.730320 -1.730320 -1.730320 -1.730320 -1.730320   
1 -1.726856 -1.726856 -1.726856 -1.726856 -1.726856 -1.726856 -1.726856   
2 -1.723391 -1.723391 -1.723391 -1.7