# Sequential data set with random missing values, size of the dataset (1000*10)

In [22]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

# Create a sequentially increasing dataset with multiple rows
np.random.seed(42)
num_samples = 1000  # Number of rows
num_features = 10

# Generate data with sequential increasing values by 0.5
data = np.arange(1, (num_samples * num_features) + 1).reshape(num_samples, num_features) * 0.2

# Convert the data array into a DataFrame
input_data = pd.DataFrame(data, columns=[f'feature{i}' for i in range(1, num_features + 1)])

# Print input dataset with sequentially increasing values
print("Input Data with Sequentially Increasing Values:")
print(input_data.head())


# Create a DataFrame
omicMiss = pd.DataFrame(input_data)

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = [f'feature{i}' for i in range(1, num_features + 1)]
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalizing Input Data:')
print (omicMiss.head())

# Introduce random missing values
missing_mask = np.random.rand(num_samples, num_features) < 0.2
for col in omicMiss.columns:
    omicMiss[col][missing_mask[:, int(col[-1]) - 1]] = np.nan
    
#omicMiss = pd.DataFrame(data)
print("\nData with Missing Values:")
print(omicMiss.head())

# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

print("\nFilled Missing Values with Placeholder:")
print(omicMiss.head())
# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)
max_epochs = 50
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    pretraining_ratio=0.8,
    max_epochs=max_epochs
)
print("\nPretrained TabNet Model:")

# Define the tabnet_recon function
def tabnet_recon(omicMiss, network):
    omicMissTrain = omicMiss.copy()

    # Normalize the missing data
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])

    # Create a mask for missing values
    missing_mask = omicMissTrain[numeric_cols].isnull().values

    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(inputData)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # If there are no missing values, return the original DataFrame
    if np.sum(missing_mask) == 0:
        return omicMissTrain

    # Denormalize and reconstruct the missing values
    imputed_values = results[missing_mask] * scaler.scale_ + scaler.mean_

    # Replace missing values with imputed values
    omicMissTrain.loc[missing_mask, numeric_cols] = imputed_values

    return omicMissTrain

# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values
print ('\n True missing values: ')
print (true_missing_values)

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)
print ('\n Reconstructed data: ')
print ( reconstructed_data.head())

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values
print ('\n Imputed values: ')
print ( imputed_values)

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))

# Print original and reconstructed data (only printing a subset)
print("Original Data:")
print(omicMiss.head())
print("\nReconstructed Data:")
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)


Input Data with Sequentially Increasing Values:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0       0.2       0.4       0.6       0.8       1.0       1.2       1.4   
1       2.2       2.4       2.6       2.8       3.0       3.2       3.4   
2       4.2       4.4       4.6       4.8       5.0       5.2       5.4   
3       6.2       6.4       6.6       6.8       7.0       7.2       7.4   
4       8.2       8.4       8.6       8.8       9.0       9.2       9.4   

   feature8  feature9  feature10  
0       1.6       1.8        2.0  
1       3.6       3.8        4.0  
2       5.6       5.8        6.0  
3       7.6       7.8        8.0  
4       9.6       9.8       10.0  
After Normalizing Input Data:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0 -1.730320 -1.730320 -1.730320 -1.730320 -1.730320 -1.730320 -1.730320   
1 -1.726856 -1.726856 -1.726856 -1.726856 -1.726856 -1.726856 -1.726856   
2 -1.723391 -1.723391 -1.723391 -1.7

# Artifitially generated data with missing values, size of the dataset (1500*15)

In [21]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

#dataset with missing values
np.random.seed(42)
num_samples = 1500
num_features = 15
data = {
    f'feature{i}': np.random.normal(size=num_samples) for i in range(1, num_features + 1)
}
data['subtype'] = np.random.choice(['A', 'B', 'C'], size=num_samples)

# Introduce random missing values
missing_mask = np.random.rand(num_samples, num_features) < 0.2
for col in data.keys():
    if col != 'subtype':
        data[col][missing_mask[:, int(col[-1]) - 1]] = np.nan

omicMiss = pd.DataFrame(data)

print("Input Data with random missing Values:")
print(omicMiss.head())

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = [f'feature{i}' for i in range(1, num_features + 1)]
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalizing Input Data:')
print (omicMiss.head())

# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax'
)

max_epochs = 50
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    max_epochs=max_epochs
)

# Define the tabnet_recon function

def tabnet_recon(omicMiss, network):
    omicMissTrain = omicMiss.copy()

    # Normalize the missing data
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])

    # Create a mask for missing values
    missing_mask = omicMissTrain[numeric_cols].isnull().values

    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)

    # Pass the input data through the TabNet network
    results = network.predict(inputData)

    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple

    # If there are no missing values, return the original DataFrame
    if np.sum(missing_mask) == 0:
        return omicMissTrain

    # Denormalize and reconstruct the missing values
    imputed_values = results[missing_mask] * scaler.scale_ + scaler.mean_

    # Replace missing values with imputed values
    omicMissTrain.loc[missing_mask, numeric_cols] = imputed_values

    return omicMissTrain



# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values

# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))

# Print original and reconstructed data (only printing a subset)
print("Original Data:")
print(omicMiss.head())
print("\nReconstructed Data:")
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)

Input Data with random missing Values:
   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0  0.496714  0.778361 -1.907808 -0.958778 -1.114081       NaN  0.765402   
1 -0.138264 -0.551186 -0.860385       NaN -0.630931  0.121844  1.073413   
2       NaN -0.818199 -0.413606 -1.583588 -0.942060  0.753417       NaN   
3  1.523030 -0.003374  1.887688  0.412999 -0.547996  0.099826 -1.942498   
4 -0.234153 -0.170185       NaN -0.214068 -0.214150 -0.667333 -0.155422   

   feature8  feature9  feature10  feature11  feature12  feature13  feature14  \
0  1.541321       NaN   0.430846  -0.143423  -0.904591   0.321835   0.332621   
1  1.333998       NaN   0.236542  -0.032656   0.932829  -0.781358        NaN   
2       NaN -0.327795   0.767063        NaN  -0.695645   0.691356  -1.680780   
3  0.074686 -0.041660   0.537984   0.946861  -0.940936  -0.590362   0.747610   
4  0.022500  0.015909   0.717846  -0.747217   0.532116        NaN  -1.143671   

   feature15 subtype  
0   2.

# Data  from jiaojiao; size of the dataset (2700*50)

In [18]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_tabnet.pretraining import TabNetPretrainer

# dataset with missing values 
omicMiss = pd.read_csv('/Users/emondemoniac/Desktop/TabNet_PyTorch/jiaojiao data/48complete_proteomics.csv')
omicMiss = omicMiss.iloc[:, 1:]
omicMiss = pd.DataFrame(omicMiss)

# Print input dataset 
print("Input Data with missing Values:")
print(omicMiss.head())

# Normalize the dataset
scaler = StandardScaler()
numeric_cols = omicMiss.columns[0:]  # Assuming the first column is a non-numeric identifier
omicMiss[numeric_cols] = scaler.fit_transform(omicMiss[numeric_cols])

print ('After Normalizing Input Data:')
print (omicMiss.head())

# Handle missing values by filling NaNs with a specific value
missing_value_placeholder = 0
omicMiss.fillna(missing_value_placeholder, inplace=True)

print("\nFilled Missing Values with Placeholder:")
print(omicMiss.head())

# Pretrain the TabNet model
pretrained_model = TabNetPretrainer(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='entmax',
    n_d=32,  # Increase the number of decision steps
    n_a=32   # Increase the number of features shared
)

max_epochs = 50
pretrained_model.fit(
    omicMiss[numeric_cols].values,
    max_epochs=max_epochs
)
print("\nPretrained TabNet Model:")

# Define the tabnet_recon function
def tabnet_recon(omicMiss, network, omicMissMean=0, omicMissSd=1):
    omicMissTrain = omicMiss.copy()
    omicMissTrain[numeric_cols] = scaler.transform(omicMissTrain[numeric_cols])
    
    # Convert input data to tensors for use in the TabNet network
    inputData = torch.tensor(omicMissTrain[numeric_cols].values, dtype=torch.float32)
    
    # Pass the input data through the TabNet network
    results = network.predict(inputData)
    
    # Handle potential tuple output
    if isinstance(results, tuple):
        results = results[0]  # Use the first element of the tuple
    
    # Denormalize the reconstructed data
    omicNa_tab = (results * omicMissSd) + omicMissMean
    
    # Combine reconstructed data with original non-numeric identifier column
    omicNa_tab = pd.DataFrame(omicNa_tab, columns=numeric_cols)
    omicNa_tab[omicMiss.columns[1]] = omicMiss[omicMiss.columns[1]]
    
    # Patch the reconstructed data into the original data with missing values
    omicRec_tab = omicMiss.copy()
    omicRec_tab.update(omicNa_tab)
    
    return omicRec_tab

# Extract true missing values before filling
true_missing_values = omicMiss[numeric_cols].values

print ('\n True missing values: ')
print (true_missing_values)


# Reconstruct missing values using the pretrained model
reconstructed_data = tabnet_recon(omicMiss, network=pretrained_model)

print ('\n Reconstructed data: ')
print ( reconstructed_data.head())

# Extract imputed values
imputed_values = reconstructed_data[numeric_cols].values

print ('\n Imputed values: ')
print ( imputed_values)

# Calculate MAE, R-squared, and RMSE
mae = np.mean(np.abs(imputed_values - true_missing_values))
total_variation = np.sum((true_missing_values - np.mean(true_missing_values)) ** 2)
residual_variation = np.sum((true_missing_values - imputed_values) ** 2)
r_squared = 1 - (residual_variation / total_variation)
rmse = np.sqrt(np.mean((imputed_values - true_missing_values) ** 2))


# Print original and reconstructed data (only printing a subset)
print("Original Data:")
#omicMiss = omicMiss.iloc[:, 1:]
print(omicMiss.head())
print("\nReconstructed Data:")
#reconstructed_data = reconstructed_data.iloc[:, 1:]
print(reconstructed_data.head())

print("MAE:", mae)
print("R-squared:", r_squared)
print("RMSE:", rmse)

Input Data with missing Values:
   Balm_3_1_U_IO_DDA_30min_G6_1_5228  Balm_3_2_T_IO_DDA_30min_H6_1_5230  \
0                                NaN                           8.935561   
1                           8.937573                           9.367173   
2                           8.672674                           8.955487   
3                                NaN                           8.596817   
4                          11.804774                          11.778822   

   Balm_3_3_U_IO_DDA_30min_A7_1_5232  Balm_3_4_T_IO_DDA_30min_B7_1_5234  \
0                                NaN                                NaN   
1                                NaN                                NaN   
2                                NaN                                NaN   
3                                NaN                                NaN   
4                          11.864427                          12.075907   

   DOHH_2_1_U_IO_DDA_30_C4_1_5188  DOHH_2_2_T_IO_DDA_30_D4_1_5190 