In [1]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Input, Dense, Lambda, Concatenate
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split




In [2]:

# Define the Sampling layer
class Sampling(layers.Layer):
    """Layer that draws a latent vector from a mean / log-variance pair.

    ``call`` receives ``[z_mean, z_log_var]`` and returns
    ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is normal
    noise with the same (batch, dim) shape as ``z_mean``.
    """

    def call(self, inputs):
        mean, log_variance = inputs
        # Noise is shaped from the runtime tensor shape so any batch size works.
        mean_shape = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * log_variance)
        return mean + std_dev * noise

class VAE(Model):
    """Variational autoencoder built from a separate encoder and decoder model.

    The training objective is
    ``MSE(data, reconstruction) + KL + alpha * BCE(categorical columns)``,
    where the categorical columns are the last ``n_categorical`` features of
    each row.

    Args:
        encoder: model returning ``(z_mean, z_log_var, z)`` for a batch.
        decoder: model mapping ``z`` back to the input feature space.
        n_categorical: number of trailing one-hot columns that receive the
            extra binary cross-entropy term (default 3, the previous
            hard-coded value). Use 0 to disable the term.
        alpha: weight of the categorical loss term (default 0.5, the previous
            hard-coded value).
        **kwargs: forwarded to ``Model.__init__``.
    """

    def __init__(self, encoder, decoder, n_categorical=3, alpha=0.5, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.n_categorical = n_categorical
        self.alpha = alpha

    def call(self, inputs, training=None):
        """Encode ``inputs``, then decode the sampled latent vector."""
        z_mean, z_log_var, z = self.encoder(inputs)
        return self.decoder(z)

    def _compute_losses(self, data):
        """Return the dict of loss terms for one batch (shared by train/test)."""
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)

        reconstruction_loss = tf.reduce_mean(
            tf.keras.losses.mean_squared_error(data, reconstruction)
        )

        # KL divergence: summed over latent dimensions, averaged over the batch.
        kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))

        # Additional loss term for the trailing one-hot (categorical) columns.
        if self.n_categorical > 0:
            categorical_loss = tf.reduce_mean(
                tf.keras.losses.binary_crossentropy(
                    data[:, -self.n_categorical:],
                    reconstruction[:, -self.n_categorical:],
                )
            )
        else:
            # A slice of "-0:" would select every column, so skip the term.
            categorical_loss = tf.zeros(())

        total_loss = reconstruction_loss + kl_loss + self.alpha * categorical_loss
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
            "categorical_loss": categorical_loss,
        }

    def train_step(self, data):
        """Run one optimisation step and return the loss terms for the batch."""
        # fit(x, y) delivers a tuple; only the inputs are needed.
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            losses = self._compute_losses(data)

        grads = tape.gradient(losses["loss"], self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return losses

    def test_step(self, data):
        """Evaluate the same loss terms without updating weights.

        Without this override, ``validation_data`` passed to ``fit`` is
        evaluated by the base class, which knows nothing about the custom
        loss computed in ``train_step``.
        """
        if isinstance(data, tuple):
            data = data[0]
        return self._compute_losses(data)

In [3]:
# Define the encoder and decoder networks
# Fix the random seeds before any layer is created so that weight
# initialisation and the subsequent training run are repeatable.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

original_dim = 16  # including one-hot encoded columns
latent_dim = 2
intermediate_dim = 64

# Encoder network: input -> hidden -> (z_mean, z_log_var) -> sampled z
inputs = Input(shape=(original_dim,))
h = Dense(intermediate_dim, activation='relu')(inputs)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)
z = Sampling()([z_mean, z_log_var])
encoder = Model(inputs, [z_mean, z_log_var, z])

# Decoder network: latent vector -> hidden -> reconstruction.
# The sigmoid output keeps every reconstructed feature in [0, 1].
latent_inputs = Input(shape=(latent_dim,))
h_decoded = Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = Dense(original_dim, activation='sigmoid')(h_decoded)
decoder = Model(latent_inputs, outputs)

# Define the VAE model (the loss is computed inside VAE.train_step,
# so compile only needs an optimizer)
vae = VAE(encoder, decoder)
vae.compile(optimizer='adam')





In [4]:
# Load and preprocess data
# Shuffle with a fixed seed (same value as the train/test split) so the row
# order, and everything derived from it, is reproducible across runs.
Data = pd.read_excel('Dataset.xlsx')
Data = Data.sample(frac=1, random_state=42).reset_index(drop=True)

# Clean column names: strip spaces, newlines, "(%" and ")".
# NOTE: a header such as "Plant capacity (kg/hr)" keeps its opening "(" and
# becomes "Plantcapacity(kg/hr"; left as-is because the saved CSV and the
# printed tables use these names.
cleaned_columns = {
    col: col.replace(' ', '').replace('(%', '').replace(')','').replace('\n', '')
    for col in Data.columns
}
# Rebind instead of inplace=True so re-running the cell is side-effect free.
Data = Data.rename(columns=cleaned_columns)

#DataFrame has clean column names
print(Data.head())

           C          H         N          O         S         VM        Ash  \
0  49.270000   6.550000  1.560000  42.620000  0.000000  80.752916   6.044539   
1  45.019652   6.569343  2.189781  46.221224  0.000000  76.122673  18.948521   
2  45.739910  10.762332  7.036909  34.908589  1.552259  49.357766  42.020000   
3  56.085157   7.108220  4.861029  30.833826  1.111768  73.526474  20.879121   
4  45.896172   5.709845  1.622249  46.771734  0.000000  77.318919   1.783784   

          FC    Cel    Hem    Lig Location  Plantcapacity(kg/hr     MSP  
0  13.202545  36.89  20.42  17.38       UK                50000  1.0088  
1   4.928806  21.00  28.00  21.00       UK                25000  0.6864  
2   8.626970  35.00  25.00  26.50       US                25000  0.8500  
3   5.594406  16.60  48.50   1.60       UK                25000  0.7384  
4  20.897297  21.00  12.80  32.70       US                25000  0.7300  


In [5]:
# One-hot encode Location; the dummy columns are appended after the others.
# dtype=int pins 0/1 integers regardless of pandas version (the default
# dummy dtype is bool from pandas 2.0 onward).
data = pd.get_dummies(Data, columns=['Location'], dtype=int)
print(data.head())

           C          H         N          O         S         VM        Ash  \
0  49.270000   6.550000  1.560000  42.620000  0.000000  80.752916   6.044539   
1  45.019652   6.569343  2.189781  46.221224  0.000000  76.122673  18.948521   
2  45.739910  10.762332  7.036909  34.908589  1.552259  49.357766  42.020000   
3  56.085157   7.108220  4.861029  30.833826  1.111768  73.526474  20.879121   
4  45.896172   5.709845  1.622249  46.771734  0.000000  77.318919   1.783784   

          FC    Cel    Hem    Lig  Plantcapacity(kg/hr     MSP  \
0  13.202545  36.89  20.42  17.38                50000  1.0088   
1   4.928806  21.00  28.00  21.00                25000  0.6864   
2   8.626970  35.00  25.00  26.50                25000  0.8500   
3   5.594406  16.60  48.50   1.60                25000  0.7384   
4  20.897297  21.00  12.80  32.70                25000  0.7300   

   Location_China  Location_UK  Location_US  
0               0            1            0  
1               0            1

In [6]:
# Pick feature columns by name rather than by position: a later cell appends
# a string 'Country' column to `data`, after which positional slices such as
# iloc[:, -3:] would no longer select the one-hot Location columns.
location_columns = [col for col in data.columns if col.startswith('Location_')]
continuous_columns = [
    col for col in data.columns
    if col not in location_columns and col != 'Country'
]

continuous_data = data[continuous_columns].values
binary_data = data[location_columns].values

# Normalize the continuous features
scaler = MinMaxScaler()
continuous_data_normalized = scaler.fit_transform(continuous_data)

# Concatenate with the binary data to get the normalized data
# (one-hot columns stay last, which is where the VAE loss expects them)
normalized_data = np.concatenate([continuous_data_normalized, binary_data], axis=1)

# Fail early if the feature width does not match the network input size.
assert normalized_data.shape[1] == original_dim, (
    f"expected {original_dim} features, got {normalized_data.shape[1]}"
)

# Train/test split
x_train, x_test = train_test_split(normalized_data, test_size=0.2, random_state=42)

# Train the VAE
vae.fit(x_train, x_train, epochs=50, batch_size=32, validation_data=(x_test, x_test))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1cd70fcb7c0>

In [7]:
def generate_synthetic_data(vae, scaler, n_samples=1000, n_categorical=3, verbose=True):
    """Decode random latent vectors into synthetic rows in the original scale.

    Args:
        vae: object exposing ``decoder.predict`` and ``decoder.input_shape``.
        scaler: fitted scaler whose ``inverse_transform`` maps the continuous
            columns back to their original scale.
        n_samples: number of synthetic rows to generate.
        n_categorical: number of trailing decoder outputs that form the
            one-hot block (default 3, the previous hard-coded value).
        verbose: when True, print the raw categorical outputs before they are
            converted to one-hot (the previous behaviour).

    Returns:
        ``numpy.ndarray`` of shape ``(n_samples, n_features)``: inverse-scaled
        continuous columns followed by a one-hot block with a single 1 per row
        at the arg-max of the decoder's categorical outputs.
    """
    # Latent vectors are drawn from a standard normal. The latent size is read
    # from the decoder itself so the function does not rely on a global.
    decoder_latent_dim = vae.decoder.input_shape[-1]
    latent_samples = np.random.normal(size=(n_samples, decoder_latent_dim))

    # Predict the outputs (reconstructions)
    predictions = vae.decoder.predict(latent_samples)

    # Separate continuous and categorical outputs. An explicit split index
    # avoids the "-0:" slice pitfall when n_categorical is 0.
    split = predictions.shape[1] - n_categorical
    continuous_part = predictions[:, :split]
    categorical_scores = predictions[:, split:]

    # Check sigmoid outputs before rounding
    if verbose:
        print("Sigmoid outputs before rounding:", categorical_scores)

    # Inverse transform the continuous data to get it back to original scale
    continuous_data = scaler.inverse_transform(continuous_part)

    # Exactly one active category per row: the highest-scoring output.
    binary_data = np.zeros_like(categorical_scores)
    if n_categorical > 0:
        winners = np.argmax(categorical_scores, axis=1)
        binary_data[np.arange(len(binary_data)), winners] = 1

    # Concatenate both continuous and binary data
    return np.concatenate([continuous_data, binary_data], axis=1)

In [8]:
# Generate synthetic data
synthetic_data = generate_synthetic_data(vae, scaler, n_samples=5000)

# Convert synthetic data to DataFrame (same column layout as the encoded data)
synthetic_data_df = pd.DataFrame(synthetic_data, columns=data.columns)

# Save the synthetic data to a new CSV file; the path is kept in one place so
# the confirmation message always names the file that was actually written.
synthetic_csv_path = 'VAE_synthetic_data.csv'
synthetic_data_df.to_csv(synthetic_csv_path, index=False)

print(f"Synthetic data generated and saved to {synthetic_csv_path}")

Sigmoid outputs before rounding: [[0.34588557 0.31692964 0.30585152]
 [0.32509091 0.37613758 0.3923231 ]
 [0.28336015 0.28538272 0.3296258 ]
 ...
 [0.34670863 0.36164647 0.40023136]
 [0.3504729  0.3272499  0.36024934]
 [0.29974407 0.37253553 0.36913022]]
Synthetic data generated and saved to synthetic_data.csv


In [9]:
# Show the synthetic column dtypes, then the first rows of the original and
# synthetic frames, each as plain text.
for preview in (synthetic_data_df.dtypes, data.head(), synthetic_data_df.head()):
    print(preview)

C                      float32
H                      float32
N                      float32
O                      float32
S                      float32
VM                     float32
Ash                    float32
FC                     float32
Cel                    float32
Hem                    float32
Lig                    float32
Plantcapacity(kg/hr    float32
MSP                    float32
Location_China         float32
Location_UK            float32
Location_US            float32
dtype: object
           C          H         N          O         S         VM        Ash  \
0  49.270000   6.550000  1.560000  42.620000  0.000000  80.752916   6.044539   
1  45.019652   6.569343  2.189781  46.221224  0.000000  76.122673  18.948521   
2  45.739910  10.762332  7.036909  34.908589  1.552259  49.357766  42.020000   
3  56.085157   7.108220  4.861029  30.833826  1.111768  73.526474  20.879121   
4  45.896172   5.709845  1.622249  46.771734  0.000000  77.318919   1.783784   

         

In [10]:
LOCATION_COLS = ['Location_China', 'Location_UK', 'Location_US']


def _means_by_country(frame):
    """Label each row of ``frame`` with its country and return per-country means.

    Mutates ``frame`` in place: the one-hot location columns are rounded and
    cast to int, and a 'Country' column holding the name of the active one-hot
    column is added.
    """
    frame[LOCATION_COLS] = frame[LOCATION_COLS].round().astype(int)
    # Convert one-hot encoding back to categorical for the groupby operation
    frame['Country'] = (frame[LOCATION_COLS] == 1).idxmax(axis=1)
    return frame.groupby('Country').mean()


# Group by country and calculate mean for the original and synthetic data
means = _means_by_country(data)
synthetic_means = _means_by_country(synthetic_data_df)

print("Original Data Means by Country:")
print(means)

# For a clean separation in the output
print("\n" + "-"*50 + "\n")

print("Synthetic Data Means by Country:")
print(synthetic_means)

Original Data Means by Country:
                        C         H         N          O        S         VM  \
Country                                                                        
Location_China  49.506202  6.418846  1.793219  42.060283  0.22145  75.465442   
Location_UK     49.506202  6.418846  1.793219  42.060283  0.22145  75.465442   
Location_US     49.506202  6.418846  1.793219  42.060283  0.22145  75.465442   

                     Ash         FC        Cel        Hem        Lig  \
Country                                                                
Location_China  8.694476  15.840234  33.685306  26.992285  21.840139   
Location_UK     8.694476  15.840234  33.685306  26.992285  21.840139   
Location_US     8.694476  15.840234  33.685306  26.992285  21.840139   

                Plantcapacity(kg/hr       MSP  Location_China  Location_UK  \
Country                                                                      
Location_China              37500.0  0.570813     

In [11]:
# Both frames carry a string 'Country' column by this point, so restrict the
# mean to numeric columns explicitly instead of relying on pandas to drop it
# (deprecated behaviour that raises on pandas 2.x).
print(data.mean(numeric_only=True))
print(synthetic_data_df.mean(numeric_only=True))

C                         49.506202
H                          6.418846
N                          1.793219
O                         42.060283
S                          0.221450
VM                        75.465442
Ash                        8.694476
FC                        15.840234
Cel                       33.685306
Hem                       26.992285
Lig                       21.840139
Plantcapacity(kg/hr    37500.000000
MSP                        0.792811
Location_China             0.333333
Location_UK                0.333333
Location_US                0.333333
dtype: float64
C                         49.591564
H                          6.455460
N                          1.963361
O                         41.969433
S                          0.306616
VM                        75.393829
Ash                        9.196394
FC                        15.792426
Cel                       33.758919
Hem                       27.023275
Lig                       22.214952
Plantcapacity

  print(data.mean())
  print(synthetic_data_df.mean())
