In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Input, Lambda, BatchNormalization, LeakyReLU, Add, Dropout
from tensorflow.keras import models
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import regularizers
import tensorflow.keras.backend as K
from tensorflow.keras import initializers
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.callbacks import TensorBoard
from tensorflow import keras
import math
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.model_selection import train_test_split
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def print_title(title):
    """Print ``title`` framed by two rules made of 50 '=' characters."""
    rule = "=" * 50
    for line in (rule, title, rule):
        print(line)


# Root folder holding every parquet input read in this cell. Change this one
# constant when the data moves instead of editing each read below.
DATA_DIR = "C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data"


def _read_parquet(relative_path):
    """Read the parquet file at ``relative_path`` under DATA_DIR into a pandas DataFrame."""
    return pq.read_table(f"{DATA_DIR}\\{relative_path}", use_threads=True).to_pandas()


print_title('Loading Data')
# Cross-sectional table; the next cells slice its first two columns off as labels.
df = _read_parquet("20240710_crosssectionaldata\\X.parquet")
# X0_* / X1_* tables are fed to the model as the "present" / "future" inputs.
X0_train = _read_parquet("X0_train.parquet")
X0_test = _read_parquet("X0_test.parquet")
X1_train = _read_parquet("X1_train.parquet")
X1_test = _read_parquet("X1_test.parquet")
print_title('Data loading complete')

In [None]:
# Column 0 of the raw frame is kept as `y1` (used later as the cohort/hue
# variable); every remaining column goes into X.
y1 = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
# Column 1 of the raw frame is `y` (plotted as "Age" later on) and the feature
# matrix starts at column 2. Both are taken straight from `df` so re-running
# this cell gives the same result; slicing X from itself (`X = X.iloc[:, 1:]`)
# would drop one more column on every execution.
y = df.iloc[:, 1]
X = df.iloc[:, 2:]

In [None]:
# Convert every feature table to a float32 NumPy array.
# NOTE(review): this cell is not idempotent - after one run the names below are
# ndarrays (no `.values`) and X0_train / X1_train are already shortened by the
# splits further down. Re-run the loading cells before re-running this one.
X = X.values.astype(np.float32)
X0_train = X0_train.values.astype(np.float32)
X1_train = X1_train.values.astype(np.float32)
X0_test = X0_test.values.astype(np.float32)
X1_test = X1_test.values.astype(np.float32)

# 80/20 train/test split of the cross-sectional data; both label vectors are
# split together with X so rows stay aligned.
X_train, X_test, y_train, y_test, y1_train, y1_test = train_test_split(X, y, y1, test_size=0.2, random_state=0)

# Hold out 10% of the training rows for validation. y1_train is split here as
# well: leaving it out would keep it at the pre-split length and out of step
# with X_train / y_train. Passing an extra array does not change which rows are
# selected, because the split depends only on the row count and random_state.
X_train, X_val, y_train, y_val, y1_train, y1_val = train_test_split(
    X_train, y_train, y1_train, test_size=0.1, random_state=0
)

# X0 and X1 are split without shuffling and with the same test_size, so row i
# of X0_* still pairs with row i of X1_*. random_state has no effect when
# shuffle=False.
X0_train, X0_val = train_test_split(X0_train, test_size=0.2, random_state=0, shuffle=False)
X1_train, X1_val = train_test_split(X1_train, test_size=0.2, random_state=0, shuffle=False)


In [None]:
# Sanity check: shapes of the arrays produced by the split cell.
for split_array in (X_train, X0_val, X1_train):
    print(split_array.shape)

In [None]:
# A single scaler is fitted on the cross-sectional training split only and then
# reused for every other array, so all model inputs share one per-feature
# min-max mapping.
scaler = MinMaxScaler()

# Fit the scaler on the training data and transform the training set
X_train_scaled = scaler.fit_transform(X_train)

# Transform the validation and test sets using the same scaler
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Apply the already-fitted scaler (no refit) to the X0 train/val/test arrays.
# NOTE(review): assumes X0/X1 have the same feature columns, in the same order,
# as X - confirm.
X0_train_scaled = scaler.transform(X0_train)
X0_val_scaled = scaler.transform(X0_val)
X0_test_scaled = scaler.transform(X0_test)

# Apply the same fitted scaler (no refit) to the X1 train/val/test arrays.
X1_train_scaled = scaler.transform(X1_train)
X1_val_scaled = scaler.transform(X1_val)
X1_test_scaled = scaler.transform(X1_test)

In [None]:
# Turn on memory growth for every visible GPU; the loop is a no-op when the
# list is empty (no GPU found).
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)


In [None]:
with tf.device('/GPU:0'):
    class Sampling(layers.Layer):
        """Draw a latent sample from ``(z_mean, z_log_var)`` via the reparameterization trick."""

        def call(self, inputs):
            """Return ``z_mean + exp(0.5 * z_log_var) * eps`` with ``eps`` standard normal.

            ``inputs`` is the pair ``(z_mean, z_log_var)``; the noise tensor is
            shaped from the first two dimensions of ``z_mean``.
            """
            mean, log_var = inputs
            mean_shape = tf.shape(mean)
            noise = tf.random.normal(shape=(mean_shape[0], mean_shape[1]))
            std = tf.exp(0.5 * log_var)
            return mean + std * noise

In [None]:
# Hyperparameters shared by the model-building and loader cells below.
# NOTE(review): 332909 is hard-coded (the decoder cell repeats it); presumably
# it equals the number of feature columns in X - confirm.
input_shape = (332909,)  # shape of one encoder input row
latent_dim = 50  # latent vector size; also the side length of the Koopman matrix
batch_size = 54  # rows per time-series batch produced by `loader`

In [None]:
def residual_block(x, filters, kernel_size=3, stride=1, l1_reg=0.05, l2_reg=0.05, dropout_rate=0.4):
    """Fully connected residual block: Dense-BN-Dropout-Dense-BN plus a skip connection.

    Args:
        x: Input Keras tensor.
        filters: Width of both Dense layers (and of the block output).
        kernel_size: Unused; kept so existing call sites keep working.
        stride: Unused; kept so existing call sites keep working.
        l1_reg: L1 factor of the kernel regularizer on every Dense layer.
        l2_reg: L2 factor of the kernel regularizer on every Dense layer.
        dropout_rate: Dropout rate applied after the first Dense/BatchNorm pair.

    Returns:
        ReLU of (main path + skip path), with last dimension ``filters``.
    """
    def penalty():
        # A fresh regularizer instance per Dense layer.
        return regularizers.l1_l2(l1=l1_reg, l2=l2_reg)

    skip = x

    # Main path.
    main = layers.Dense(filters, activation="relu", kernel_regularizer=penalty())(x)
    main = layers.BatchNormalization()(main)
    main = layers.Dropout(dropout_rate)(main)
    main = layers.Dense(filters, activation=None, kernel_regularizer=penalty())(main)
    main = layers.BatchNormalization()(main)

    # Project the skip path with a linear Dense + BatchNorm when its width
    # differs from `filters`, so the two paths can be added.
    width_mismatch = skip.shape[-1] != filters
    if width_mismatch:
        skip = layers.Dense(filters, activation=None, kernel_regularizer=penalty())(skip)
        skip = layers.BatchNormalization()(skip)

    merged = layers.add([main, skip])
    return layers.ReLU()(merged)

In [None]:
with tf.device('/GPU:0'):
    def build_encoder(input_shape, latent_dim, l1_reg=0.05, l2_reg=0.05, dropout_rate=0.4):
        """Build the VAE encoder.

        Architecture: Dense(800)-BN-Dropout stem, then residual blocks of width
        800, 400, 200 and 100, then two linear heads (``z_mean``, ``z_log_var``)
        and a ``Sampling`` layer.

        Args:
            input_shape: Shape tuple of one input row.
            latent_dim: Size of the latent vector.
            l1_reg: L1 factor for the Dense kernel regularizers.
            l2_reg: L2 factor for the Dense kernel regularizers.
            dropout_rate: Dropout rate used in the stem and in every block.

        Returns:
            A Keras model named "encoder" with outputs ``[z_mean, z_log_var, z]``.
        """
        block_options = dict(l1_reg=l1_reg, l2_reg=l2_reg, dropout_rate=dropout_rate)

        encoder_inputs = layers.Input(shape=input_shape)

        # Stem.
        stem = layers.Dense(800, activation="relu", kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))
        hidden = stem(encoder_inputs)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)

        # Residual tower, narrowing towards the latent size.
        for width in (800, 400, 200, 100):
            hidden = residual_block(hidden, width, **block_options)

        # Latent heads and the sampled code.
        z_mean = layers.Dense(latent_dim, name="z_mean")(hidden)
        z_log_var = layers.Dense(latent_dim, name="z_log_var")(hidden)
        z = Sampling()([z_mean, z_log_var])
        return models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

    encoder = build_encoder(input_shape, latent_dim, l1_reg=0.05, l2_reg=0.05, dropout_rate=0.4)
    encoder.summary()


In [None]:
with tf.device('/GPU:0'):
    def build_decoder(latent_dim, output_shape, l1_reg=0.05, l2_reg=0.05, dropout_rate=0.4):
        """Build the VAE decoder.

        Architecture: Dense(100)-BN-Dropout stem, then residual blocks of width
        100, 200, 400 and 800, then a sigmoid Dense output layer.

        Args:
            latent_dim: Size of the latent input vector.
            output_shape: Number of units of the output layer.
            l1_reg: L1 factor for the Dense kernel regularizers.
            l2_reg: L2 factor for the Dense kernel regularizers.
            dropout_rate: Dropout rate used in the stem and in every block.

        Returns:
            A Keras model named "decoder" mapping a latent vector to the output.
        """
        block_options = dict(l1_reg=l1_reg, l2_reg=l2_reg, dropout_rate=dropout_rate)

        latent_inputs = layers.Input(shape=(latent_dim,))

        # Stem.
        stem = layers.Dense(100, activation="relu", kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg))
        hidden = stem(latent_inputs)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)

        # Residual tower, widening back towards the data dimension.
        for width in (100, 200, 400, 800):
            hidden = residual_block(hidden, width, **block_options)

        # Sigmoid keeps every output in (0, 1); the inputs are min-max scaled.
        decoder_outputs = layers.Dense(output_shape, activation="sigmoid")(hidden)
        return models.Model(latent_inputs, decoder_outputs, name="decoder")

    decoder = build_decoder(latent_dim, 332909, l1_reg=0.05, l2_reg=0.05, dropout_rate=0.4)
    decoder.summary()


In [None]:
class KoopmanOperator(tf.Module):
    """Trainable upper-triangular linear operator applied to latent vectors.

    The operator is stored as two flat variables: ``k_matrix_ut`` with
    ``latent_dim * (latent_dim - 1) / 2`` entries and ``k_matrix_diag`` with
    ``latent_dim`` entries. They are assembled into a dense
    ``(latent_dim, latent_dim)`` matrix on every call.
    """

    def __init__(self, latent_dim):
        """Create the two flat variables, initialised uniformly in [1e-5, 1).

        Args:
            latent_dim: Side length of the square operator matrix.
        """
        # tf.Module sets up its name and name scope in __init__; call it before
        # assigning attributes.
        super().__init__()
        self.latent_dim = latent_dim
        self.k_matrix_ut = tf.Variable(
            tf.random.uniform((latent_dim * (latent_dim - 1) // 2,), dtype=tf.float32, minval=1e-5, maxval=1.0)
        )
        self.k_matrix_diag = tf.Variable(
            tf.random.uniform((latent_dim,), dtype=tf.float32, minval=1e-5, maxval=1.0)
        )

    def koopman_operation(self, g):
        """Advance latent vectors by one step.

        Args:
            g: Tensor whose last dimension is ``latent_dim``.

        Returns:
            Tuple ``(g_next, scalar_output)`` where ``g_next = g @ K`` and
            ``scalar_output`` is the mean of ``g_next`` over its last axis.
        """
        k_matrix = self._build_koopman_matrix()
        g_next = tf.linalg.matmul(g, k_matrix)
        scalar_output = tf.reduce_mean(g_next, axis=-1)
        return g_next, scalar_output

    def get_koopman_matrix(self):
        """Return the dense ``(latent_dim, latent_dim)`` operator matrix."""
        return self._build_koopman_matrix()

    def _build_koopman_matrix(self):
        """Assemble the dense upper-triangular matrix from the flat variables.

        The result is numerically identical to the earlier implementation, so
        weights saved from it keep their meaning. The unused ``diag_indices``
        local and a ``band_part`` call on an already upper-triangular tensor
        were removed.

        NOTE(review): the flat vector ``[k_matrix_ut, k_matrix_diag]`` is
        scattered over the upper-triangle positions (diagonal included) in the
        order returned by ``tf.where``. The ``k_matrix_diag`` values therefore
        fill the last ``latent_dim`` of those positions rather than the
        diagonal, and the diagonal additionally receives ``k_matrix_diag``
        through ``set_diag``. If a plain "ut above, diag on the diagonal" layout
        was intended, this needs a deliberate change and retraining.
        """
        size = self.latent_dim

        # 1.0 on and above the diagonal, 0.0 below.
        upper_mask = tf.linalg.band_part(tf.ones((size, size)), 0, -1)

        packed_values = tf.concat([self.k_matrix_ut, self.k_matrix_diag], axis=0)
        scattered = tf.scatter_nd(tf.where(upper_mask), packed_values, [size, size])

        diagonal_part = tf.linalg.set_diag(
            tf.zeros((size, size), dtype=tf.float32), self.k_matrix_diag
        )
        return diagonal_part + scattered

    def koopman_sparsity_loss(self, l1_factor=0.1):
        """Return ``l1_factor * (mean|k_matrix_ut| + mean|k_matrix_diag|)``."""
        ut_sparsity_loss = tf.reduce_mean(tf.abs(self.k_matrix_ut))
        diag_sparsity_loss = tf.reduce_mean(tf.abs(self.k_matrix_diag))
        return l1_factor * (ut_sparsity_loss + diag_sparsity_loss)

In [None]:
with tf.device('/GPU:0'):
    class KoopmanModel(tf.keras.Model):
        """Keras wrapper that applies one shared Koopman operator to two latent batches."""

        def __init__(self, koopman_operator):
            super().__init__()
            self.koopman_operator = koopman_operator

        def call(self, input_present, input_future):
            """Apply the operator to both inputs.

            Returns:
                ``(g_next_present, daf_present, g_next_future, daf_future)``:
                for each input, the advanced latent vectors followed by their
                scalar summary, as produced by ``koopman_operation``.
            """
            advance = self.koopman_operator.koopman_operation
            present_outputs = advance(input_present)
            future_outputs = advance(input_future)
            return (*present_outputs, *future_outputs)

In [None]:
class loader(keras.utils.Sequence):
    """Batch generator pairing cross-sectional rows with time-series rows.

    Every batch contains ``batch_size`` rows of ``X0`` and ``X1`` (same row
    range) and ``batch_size * ratio`` rows of ``X``, where ``ratio`` is how many
    whole cross-sectional batches exist per time-series batch. Rows that do not
    fill a whole batch are never yielded.
    """

    def __init__(self, X, X0, X1, batch_size):
        """Store the arrays and derive the batch counts.

        Args:
            X: Cross-sectional samples.
            X0: Time-series samples at the "present" step.
            X1: Time-series samples at the "future" step, row-aligned with X0.
            batch_size: Number of X0/X1 rows per batch.
        """
        super().__init__()
        self.X0 = X0
        self.X = X
        self.X1 = X1
        self.batch_size = batch_size
        self.num_batches_ts = len(self.X0) // batch_size
        self.num_batches_cs = len(self.X) // batch_size
        if self.num_batches_ts > 0:
            # Never below 1: a ratio of 0 (fewer cross-sectional than
            # time-series batches) would make every cross-sectional slice empty.
            self.ratio = max(1, self.num_batches_cs // self.num_batches_ts)
        else:
            # No full time-series batch: avoid dividing by zero; __len__ is 0.
            self.ratio = 1

    def __len__(self):
        """Number of batches per epoch.

        Equals the time-series batch count whenever the cross-sectional set is
        at least as large; otherwise it is capped by the cross-sectional batch
        count so no batch is built from an empty slice.
        """
        return min(self.num_batches_ts, self.num_batches_cs)

    def __getitem__(self, index):
        """Return ``[cross_sectional_batch, present_batch, future_batch]`` for ``index``."""
        # Cross-sectional rows: `ratio` times as many as time-series rows.
        start_cs = index * self.batch_size * self.ratio
        end_cs = start_cs + (self.batch_size * self.ratio)
        batch_X_train = self.X[start_cs:end_cs]

        # Time-series rows: the same range from X0 and X1 keeps pairs aligned.
        start_ts = index * self.batch_size
        end_ts = start_ts + self.batch_size
        batch_X0_train = self.X0[start_ts:end_ts]
        batch_X1_train = self.X1[start_ts:end_ts]

        return [batch_X_train, batch_X0_train, batch_X1_train]


In [None]:
# Batch generators for training and validation (cross-sectional, present, future).
train_loader = loader(X_train_scaled, X0_train_scaled, X1_train_scaled, batch_size)
val_loader = loader(X_val_scaled, X0_val_scaled, X1_val_scaled, batch_size)
# NOTE(review): these two counts are based on the cross-sectional arrays, while
# the loaders size an epoch from the time-series arrays; the fit cell has them
# commented out, so they are currently informational only.
steps_per_epoch = len(X_train_scaled) // batch_size
validation_steps = len(X_val_scaled) // batch_size

In [None]:
with tf.device('/GPU:0'):
    class VAE(keras.Model):
        """Shared encoder/decoder applied to cross-sectional, present and future inputs."""

        def __init__(self, encoder, decoder, **kwargs):
            """Keep references to a pre-built encoder and decoder."""
            super().__init__(**kwargs)
            self.encoder = encoder
            self.decoder = decoder

        def call(self, inputs):
            """Encode the three inputs and reconstruct the cross-sectional one.

            Args:
                inputs: Indexable of three tensors:
                    ``[cross_sectional, present, future]``.

            Returns:
                ``(reconstruction, z_present, z_future, z_mean_cross,
                z_log_var_cross)``. ``reconstruction`` decodes the sampled
                cross-sectional code; ``z_present`` / ``z_future`` are the
                sampled codes of the other two inputs.
            """
            z_mean_cross, z_log_var_cross, z_cross = self.encoder(inputs[0])
            # Only the sampled code (third encoder output) is used for the
            # present/future inputs.
            _, _, z_present = self.encoder(inputs[1])
            _, _, z_future = self.encoder(inputs[2])
            reconstruction = self.decoder(z_cross)

            return reconstruction, z_present, z_future, z_mean_cross, z_log_var_cross

In [None]:
with tf.device('/GPU:0'):
    class MyModel(keras.Model):
        """VAE and Koopman operator trained jointly with a hand-written loss.

        The total loss is the unweighted sum of six terms: reconstruction, KL
        divergence, linear dynamics in latent space, DAF consistency, auxiliary
        (decoded next-step) reconstruction, and Koopman sparsity. Each term has
        its own ``Mean`` tracker, reported under the same name.

        NOTE(review): the ``kernel_regularizer`` penalties declared on the Dense
        layers are not part of ``total_loss`` (nothing here reads
        ``self.losses``), so they currently have no effect on training. Confirm
        whether that is intended before adding them; with the configured factors
        they could dominate the loss.
        """

        def __init__(self, vae, koopman, **kwargs):
            """Store the sub-models and create one Mean tracker per loss term."""
            super().__init__(**kwargs)
            self.vae = vae
            self.koopman = koopman
            self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
            self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
            self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")
            self.linear_dynamics_loss_tracker = keras.metrics.Mean(name="linear_dynamics_loss")
            self.daf_loss_tracker = keras.metrics.Mean(name="daf_loss")
            self.auxiliary_loss_tracker = keras.metrics.Mean(name="auxiliary_loss")
            self.koopman_sparsity_loss_tracker = keras.metrics.Mean(name="koopman_sparsity_loss")

        @property
        def metrics(self):
            """Trackers Keras resets at the start of every epoch / evaluation."""
            return [
                self.total_loss_tracker,
                self.reconstruction_loss_tracker,
                self.kl_loss_tracker,
                self.linear_dynamics_loss_tracker,
                self.daf_loss_tracker,
                self.auxiliary_loss_tracker,
                self.koopman_sparsity_loss_tracker
            ]

        def call(self, inputs):
            """Forward pass.

            Args:
                inputs: ``[cross_sectional, present, future]`` tensors.

            Returns:
                ``(g_next_present, daf_present, g_next_future, daf_future,
                reconstruction, decoded_future, z_present, z_future,
                z_mean_cross, z_log_var_cross)``. ``decoded_future`` is the
                decoder applied to the Koopman-advanced present code.
            """
            reconstruction, z_present, z_future, z_mean_cross, z_log_var_cross = self.vae(inputs)
            g_next_present, daf_present, g_next_future, daf_future = self.koopman(z_present, z_future)
            decoded_future = self.vae.decoder(g_next_present)
            return g_next_present, daf_present, g_next_future, daf_future, reconstruction, decoded_future, z_present, z_future, z_mean_cross, z_log_var_cross

        def _forward_losses(self, inputs, training):
            """Run the model on ``inputs`` and return every loss term as a dict.

            Shared by ``train_step`` and ``test_step`` so both compute exactly
            the same quantities; only the ``training`` flag differs.
            """
            input_data_cross, _, input_data_future = inputs
            (
                g_next_present, daf_present, _, daf_future,
                reconstruction, decoded_future, _, z_future,
                z_mean_cross, z_log_var_cross,
            ) = self(inputs, training=training)

            # Reconstruction loss: squared error summed over features, averaged
            # over the batch.
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(tf.square(input_data_cross - reconstruction), axis=-1))

            # KL divergence to a standard normal, averaged over batch and
            # latent dimensions.
            kl_loss = -0.5 * tf.reduce_mean(1 + z_log_var_cross - tf.square(z_mean_cross) - tf.exp(z_log_var_cross))

            # Linear dynamics loss: the advanced present code should match the
            # encoded future.
            linear_dynamics_loss = tf.reduce_mean(tf.reduce_sum(tf.square(z_future - g_next_present), axis=-1))

            # DAF loss.
            # NOTE(review): the daf_* tensors come from reduce_mean(axis=-1) in
            # the Koopman operator, so they are presumably 1-D (batch,); the
            # reduce_sum over the last axis then sums over the batch, unlike
            # the batch mean used by the other terms - confirm this is wanted.
            daf_loss = tf.reduce_mean(tf.reduce_sum(tf.square(daf_present - daf_future), axis=-1))

            # Auxiliary loss: decoding the advanced present code should
            # reproduce the future input.
            auxiliary_loss = tf.reduce_mean(tf.reduce_sum(tf.square(input_data_future - decoded_future), axis=-1))

            # Sparsity loss for the Koopman operator.
            koopman_sparsity_loss = self.koopman.koopman_operator.koopman_sparsity_loss()

            total_loss = (
                reconstruction_loss +
                kl_loss +
                linear_dynamics_loss +
                daf_loss +
                auxiliary_loss +
                koopman_sparsity_loss
            )

            return {
                "total_loss": total_loss,
                "reconstruction_loss": reconstruction_loss,
                "kl_loss": kl_loss,
                "linear_dynamics_loss": linear_dynamics_loss,
                "daf_loss": daf_loss,
                "auxiliary_loss": auxiliary_loss,
                "koopman_sparsity_loss": koopman_sparsity_loss,
            }

        def _log_losses(self, losses):
            """Feed ``losses`` into the trackers and return their running means."""
            tracked = (
                ("total_loss", self.total_loss_tracker),
                ("reconstruction_loss", self.reconstruction_loss_tracker),
                ("kl_loss", self.kl_loss_tracker),
                ("linear_dynamics_loss", self.linear_dynamics_loss_tracker),
                ("daf_loss", self.daf_loss_tracker),
                ("auxiliary_loss", self.auxiliary_loss_tracker),
                ("koopman_sparsity_loss", self.koopman_sparsity_loss_tracker),
            )
            for key, tracker in tracked:
                tracker.update_state(losses[key])
            return {key: tracker.result() for key, tracker in tracked}

        def train_step(self, data):
            """One optimisation step on a batch; returns the running loss means."""
            # `data[0]` is the [cross_sectional, present, future] triple.
            data_unpacked = data[0]

            with tf.GradientTape() as tape:
                losses = self._forward_losses(data_unpacked, training=True)

            trainable_vars = self.trainable_variables
            grads = tape.gradient(losses["total_loss"], trainable_vars)
            self.optimizer.apply_gradients(zip(grads, trainable_vars))

            return self._log_losses(losses)

        def test_step(self, data):
            """Evaluate one batch without updating weights.

            The model runs with ``training=False`` so Dropout is disabled and
            BatchNormalization uses its moving statistics; running validation
            in training mode would make the reported losses noisy and would
            update the batch-norm statistics from validation data.
            """
            data_unpacked = data[0]
            losses = self._forward_losses(data_unpacked, training=False)
            return self._log_losses(losses)

In [None]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    template = "{}:{:>02}:{:>05.2f}"
    return template.format(hours, minutes, seconds)

# Only show TensorFlow log messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')

In [None]:
# Assemble the full model from the encoder/decoder built above and compile it.
# `time` and `start` defined here are read again by the training cell to report
# the elapsed time, so that figure includes everything run in between.
if __name__ == '__main__':
    import time

    start = time.time()

    vae = VAE(encoder, decoder)
    koopman_operator = KoopmanOperator(latent_dim)
    koopman = KoopmanModel(koopman_operator)   
    model = MyModel(vae, koopman)
    # No `loss=` is given: MyModel computes its loss inside train_step/test_step.
    # NOTE(review): both clipvalue and clipnorm are set; some Keras versions
    # reject setting more than one clipping option - confirm against the
    # installed version.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.0001, clipvalue=1.0, clipnorm=1.0))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# NOTE(review): presumably the number of batches per epoch, i.e.
# len(train_loader) - confirm. If it is, 50*n_batches below means "every 50 epochs".
n_batches = 8

# Weight files are named after the epoch in which they are written.
filepath = "C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\Saved_models_20240711_withoutold\\saved-model-{epoch:02d}DAF.ckpt"
# An integer save_freq is counted in batches, not epochs. `monitor` is only
# consulted when save_best_only is enabled, which it is not here, so every
# scheduled save is written.
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = filepath, monitor='val_total_loss', verbose=1, save_weights_only=True, save_freq=50*n_batches)

In [None]:
# Stop when the *training* total loss has not improved for 5 epochs and roll
# back to the best weights.
# NOTE(review): the model.fit cell passes callbacks=[checkpoint], so
# `callbacks_list` (and therefore early stopping) is currently not used.
early = EarlyStopping(monitor="total_loss", mode="min", patience=5, restore_best_weights=True)
callbacks_list = [early, checkpoint]

In [None]:
def lr_scheduler(epoch, lr):
    """Return the learning rate for ``epoch``: ``lr`` cut tenfold every 250 epochs.

    Epoch 0 and every epoch that is not a multiple of 250 keep ``lr`` unchanged.
    """
    at_decay_boundary = epoch != 0 and epoch % 250 == 0
    return lr * 0.1 if at_decay_boundary else lr

# Create a learning rate scheduler callback
# NOTE(review): the model.fit cell passes callbacks=[checkpoint] only, so this
# callback is currently not applied during training.
lr_scheduler_callback = LearningRateScheduler(lr_scheduler)

In [None]:
# Save the untrained weights under the same naming scheme, as epoch 0.
model.save_weights(filepath.format(epoch=0))

In [None]:
with tf.device('/GPU:0'):
    # steps_per_epoch / validation_steps are not passed: both loaders are
    # Sequence objects, so the number of batches comes from their __len__.
    hist = model.fit(
        train_loader,
        epochs=1000,
        validation_data=val_loader,
        validation_freq=1,
        # Only the checkpoint callback is active (no early stopping, no LR schedule).
        callbacks=[checkpoint]
    )

    # `start` and `time` come from the model-construction cell.
    elapsed = time.time() - start
    print(f'Training time: {hms_string(elapsed)}')

In [None]:
# Restore the weights written at epoch 300 for the evaluation cells below.
# NOTE(review): the epoch is hard-coded; this file only exists if training
# reached that checkpoint - confirm it is the one intended.
with tf.device('/GPU:0'):
    model.load_weights("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\Saved_models_20240711_withoutold\\saved-model-300DAF.ckpt")

In [None]:
# Batch size used by the test loaders; rows beyond the last full batch of this
# size are not yielded by `loader`.
test_batch_size = 86

In [None]:
# Test generator: cross-sectional test rows plus the present/future test arrays.
test_loader = loader(X_test_scaled, X0_test_scaled, X1_test_scaled, test_batch_size)
# NOTE(review): not used by the evaluate/predict calls below.
test_steps = len(X_test_scaled) // test_batch_size

In [None]:
# Report the tracked loss terms on the test generator (runs MyModel.test_step).
model.evaluate(test_loader)

In [None]:
# Run the forward pass on the test generator; `pred` holds the outputs in the
# order returned by MyModel.call.
pred = model.predict(test_loader)

In [None]:
# Display the raw prediction outputs.
pred

In [None]:
# Second output of MyModel.call: `daf_present`, the scalar summary of the
# Koopman-advanced "present" latent vectors.
dai_P = pred[1]

In [None]:
# Fourth output of MyModel.call: `daf_future`, the same scalar summary for the
# "future" latent vectors.
dai_F = pred[3]

In [None]:
# Export the present-step scalar outputs as CSV.
np.savetxt("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\perftest\\dai_P.csv", dai_P, delimiter=',')

In [None]:
# Export the future-step scalar outputs as CSV.
np.savetxt("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\perftest\\dai_F.csv", dai_F, delimiter=',')

In [None]:
# Export the test-split labels `y_test` as CSV.
np.savetxt("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\perftest\\y_test.csv", y_test, delimiter=',')

In [None]:
# Scatter the present-step output against the future-step output, one point per
# predicted row. The arrays are plotted directly: wrapping each in a one-cell
# DataFrame and reading it back returned the same array, and storing the result
# in `y` overwrote the label vector `y` defined by the data-preparation cells.
plt.figure(figsize=(8, 6))
plt.scatter(dai_P, dai_F, marker='o', s=100, c='blue', label='Data Points')

# Adding labels and title
plt.xlabel('dai_P')
plt.ylabel('dai_F')
plt.title('Scatter Plot of dai_P vs dai_F')

# Displaying legend
plt.legend()

# Showing plot
plt.grid(False)
plt.show()

In [None]:
# Rebuild the test generator with the cross-sectional test rows in all three
# slots, so the "present" output below is computed for X_test_scaled itself.
# This overwrites `test_loader`, `pred` and `dai_P` from the earlier cells.
test_loader = loader(X_test_scaled, X_test_scaled, X_test_scaled, test_batch_size)
# NOTE(review): not used by the predict call below.
test_steps = len(X_test_scaled) // test_batch_size

pred = model.predict(test_loader)

# `daf_present` for the cross-sectional test rows; the cells below call it DAI.
dai_P = pred[1]

In [None]:
# Drop the shuffled original index so the labels line up positionally with the
# prediction rows when concatenated below.
y1_test = y1_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Print length to ensure they have the same length
print("Length of y1_test:", len(y1_test))
print("Length of y_test:", len(y_test))

# Convert to DataFrames
df_y1 = pd.DataFrame({'y1': y1_test})
df_y = pd.DataFrame({'y': y_test})
df_dai = pd.DataFrame(dai_P, columns=['DAI'])

# Combine dataframes
# NOTE(review): this reuses the name `df`, replacing the raw data frame loaded
# at the top of the notebook. `loader` only yields whole batches, so df_dai can
# be shorter than the labels; the unmatched tail rows then get NaN in 'DAI'.
df = pd.concat([df_y1, df_y, df_dai], axis=1)

# Ensure 'y1' is categorical if necessary
df['y1'] = df['y1'].astype('category')

# Check the resulting DataFrames
print("DataFrame df_y1:")
print(df_y1.head())
print("DataFrame df_y:")
print(df_y.head())
print("DataFrame df_dai:")
print(df_dai.head())

In [None]:
# One point per test row: 'y' (labelled Age) against DAI, coloured by 'y1'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='y', y='DAI', hue='y1', palette='Set1', s=30, ax=ax)

# Titles and axis labels
ax.set_title('Scatter Plot of Age vs DAI')
ax.set_xlabel('Age')
ax.set_ylabel('DAI')

# Render the figure
plt.show()

In [None]:
# Drop the shuffled original index so labels and predictions align by position.
y1_test = y1_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Print length to ensure they have the same length
print("Length of y1_test:", len(y1_test))
print("Length of y_test:", len(y_test))

# Convert to DataFrames
df_y1 = pd.DataFrame({'y1': y1_test})
df_y = pd.DataFrame({'y': y_test})
df_dai = pd.DataFrame(dai_P, columns=['DAI'])

# Combine dataframes
df = pd.concat([df_y1, df_y, df_dai], axis=1)

# Ensure 'y1' is categorical if necessary
df['y1'] = df['y1'].astype('category')

# Convert 'y' to numeric (non-numeric entries become NaN) and drop those rows.
# .copy() makes the result an independent frame before a column is added.
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df = df.dropna(subset=['y']).copy()

# Bin the ages into 10-year intervals [0, 10), [10, 20), ...
# The last edge is the first multiple of 10 strictly above the maximum age:
# with right=False the top edge is exclusive, so a maximum age that is itself a
# multiple of 10 would otherwise fall outside every bin and be dropped.
last_edge = (int(y_test.max()) // 10 + 1) * 10
bins = range(0, last_edge + 1, 10)
labels = [f"{i}-{i+9}" for i in bins[:-1]]
df['AgeGroup'] = pd.cut(df['y'], bins=bins, labels=labels, right=False)

# Mean and standard error of DAI for each age group (empty groups are kept).
grouped = df.groupby('AgeGroup', observed=False)['DAI'].agg(['mean', 'sem']).reset_index()

# Plot the data
plt.figure(figsize=(12, 6))
sns.pointplot(data=grouped, x='AgeGroup', y='mean', capsize=0.1)
plt.errorbar(x=np.arange(len(grouped)), y=grouped['mean'], yerr=grouped['sem'], fmt='o', color='red')

# Adding titles and labels
plt.title('Mean DAI with Standard Error by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Mean DAI')

# Rotate x-axis labels if necessary
plt.xticks(rotation=45)

# Show the plot
plt.show()

In [None]:
# Ensure 'y1' and 'y_test' have the same length and reset indices
y1_test = y1_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Print length to ensure they have the same length
print("Length of y1_test:", len(y1_test))
print("Length of y_test:", len(y_test))

# Convert to DataFrames
df_y1 = pd.DataFrame({'y1': y1_test})
df_y = pd.DataFrame({'y': y_test})
df_dai = pd.DataFrame(dai_P, columns=['DAI'])

# Combine dataframes
df = pd.concat([df_y1, df_y, df_dai], axis=1)

# Ensure 'y1' is categorical if necessary
df['y1'] = df['y1'].astype('category')

# Convert 'y' to numeric (non-numeric entries become NaN) and drop those rows.
# .copy() makes the result an independent frame before a column is added.
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df = df.dropna(subset=['y']).copy()

# Bin the ages into 10-year intervals [0, 10), [10, 20), ...
# The last edge is the first multiple of 10 strictly above the maximum age:
# with right=False the top edge is exclusive, so a maximum age that is itself a
# multiple of 10 would otherwise fall outside every bin and be dropped.
last_edge = (int(y_test.max()) // 10 + 1) * 10
bins = range(0, last_edge + 1, 10)
labels = [f"{i}-{i+9}" for i in bins[:-1]]
df['AgeGroup'] = pd.cut(df['y'], bins=bins, labels=labels, right=False)

# Mean and standard error of DAI for each (age group, cohort) pair.
grouped = df.groupby(['AgeGroup', 'y1'], observed=False)['DAI'].agg(['mean', 'sem']).reset_index()

# Draw one error bar per (age group, cohort); cohorts are offset by 0.2 on the
# x axis so they sit side by side inside each age group.
plt.figure(figsize=(12, 6))
palette = sns.color_palette('Set1')
labelled_cohorts = set()
for i, age_group in enumerate(labels):
    for j, cohort in enumerate(df['y1'].cat.categories):
        subset = grouped[(grouped['AgeGroup'] == age_group) & (grouped['y1'] == cohort)]
        if not subset.empty:
            # Label each cohort once so the legend has one entry per cohort.
            legend_label = str(cohort) if cohort not in labelled_cohorts else '_nolegend_'
            labelled_cohorts.add(cohort)
            plt.errorbar(
                x=i + j*0.2,
                y=subset['mean'].values[0],
                yerr=subset['sem'].values[0],
                fmt='o',
                # Wrap around instead of failing when there are more cohorts
                # than palette colours.
                color=palette[j % len(palette)],
                label=legend_label,
            )

# Adding titles and labels
plt.title('Mean DAI with Standard Error by Age Group and Cohort')
plt.xlabel('Age Group')
plt.ylabel('Mean DAI')
plt.legend(title='y1')

# Name the ticks after the age groups (positions 0..n-1 match the loop index).
plt.xticks(np.arange(len(labels)), labels, rotation=45)

# Show the plot
plt.show()

In [None]:
# Concatenate the label and DAI frames built by the earlier result cells.
df = pd.concat([df_y, df_dai], axis=1)

# Convert 'y' to numeric (non-numeric entries become NaN) and drop those rows.
# .copy() makes the result an independent frame before a column is added.
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df = df.dropna(subset=['y']).copy()

# 10-year bins [0, 10), [10, 20), ...
# The last edge is the first multiple of 10 strictly above the maximum age:
# with right=False the top edge is exclusive, so a maximum age that is itself a
# multiple of 10 would otherwise fall outside every bin and be dropped.
last_edge = (int(df['y'].max()) // 10 + 1) * 10
bins = range(0, last_edge + 1, 10)
labels = [f'{i}-{i+9}' for i in bins[:-1]]  # Labels for the age groups
df['age_group'] = pd.cut(df['y'], bins=bins, labels=labels, right=False)

# Per-group statistics (empty groups are kept).
grouped = df.groupby('age_group', observed=False)['DAI'].agg(['mean', 'std', 'count']).reset_index()

# Plotting: error bars show one standard deviation.
plt.errorbar(grouped['age_group'], grouped['mean'], yerr=grouped['std'], fmt='o', capsize=5)
plt.xlabel('Age Group')
plt.ylabel('Mean DAI')
plt.title('Mean DAI with Standard Deviation by Age Group')
plt.xticks(rotation=45)
plt.grid(False)
plt.tight_layout()
plt.show()

In [None]:
# Concatenate the label and DAI frames built by the earlier result cells.
df = pd.concat([df_y, df_dai], axis=1)

# Convert 'y' to numeric (non-numeric entries become NaN) and drop those rows.
# .copy() makes the result an independent frame before a column is added.
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df = df.dropna(subset=['y']).copy()

# 10-year bins [0, 10), [10, 20), ...
# The last edge is the first multiple of 10 strictly above the maximum age:
# with right=False the top edge is exclusive, so a maximum age that is itself a
# multiple of 10 would otherwise fall outside every bin and be dropped.
last_edge = (int(df['y'].max()) // 10 + 1) * 10
bins = range(0, last_edge + 1, 10)
labels = [f'{i}-{i+9}' for i in bins[:-1]]  # Labels for the age groups
df['age_group'] = pd.cut(df['y'], bins=bins, labels=labels, right=False)

# Per-group statistics (empty groups are kept).
grouped = df.groupby('age_group', observed=False)['DAI'].agg(['mean', 'std', 'count']).reset_index()

# Standard error of the mean: std / sqrt(n).
grouped['SE'] = grouped['std'] / np.sqrt(grouped['count'])

# Plotting: error bars show one standard error.
plt.errorbar(grouped['age_group'], grouped['mean'], yerr=grouped['SE'], fmt='o', capsize=5)
plt.xlabel('Age Group')
plt.ylabel('Mean DAI')
plt.title('Mean DAI with Standard Error by Age Group')
plt.xticks(rotation=45)
plt.grid(False)
plt.tight_layout()
plt.show()

In [None]:
from scipy.optimize import curve_fit

df = pd.concat([df_y, df_dai], axis=1)

# Check for NaN values and convert 'y' to numeric if necessary
df['y'] = pd.to_numeric(df['y'], errors='coerce')

# Handle NaN values if any
df = df.dropna(subset=['y', 'DAI'])

# Remove inf values
df = df[np.isfinite(df['y'])]
df = df[np.isfinite(df['DAI'])]

# Extract age and DAI values
age = df['y'].values
dai = df['DAI'].values

# Ensure there are no NaN or inf values
if np.any(np.isnan(age)) or np.any(np.isnan(dai)):
    raise ValueError("Data contains NaN values")
if np.any(np.isinf(age)) or np.any(np.isinf(dai)):
    raise ValueError("Data contains infinite values")

# Min-max normalise both variables to [0, 1] for a better-conditioned fit.
# A zero range would divide by zero and hand NaNs to curve_fit, so fail early
# with a clear message instead.
age_span = np.max(age) - np.min(age)
dai_span = np.max(dai) - np.min(dai)
if age_span == 0 or dai_span == 0:
    raise ValueError("Cannot normalise: age or DAI has zero range")
age_norm = (age - np.min(age)) / age_span
dai_norm = (dai - np.min(dai)) / dai_span

# Define the Gompertz-Makeham function
def gompertz_makeham(x, A, B, C):
    """Constant term plus exponential term: ``A + B * exp(C * x)``."""
    return A + B * np.exp(C * x)

# Fit the model; parameters are constrained to A, B in [0, 10] and C in [0, 1].
initial_guess = [0.1, 0.1, 0.01]
bounds = (0, [10., 10., 1.])

popt, pcov = curve_fit(gompertz_makeham, age_norm, dai_norm, p0=initial_guess, bounds=bounds)
A, B, C = popt

# Generate fitted values for plotting
fitted_dai_norm = gompertz_makeham(age_norm, A, B, C)

# Denormalize the fitted values
fitted_dai = fitted_dai_norm * dai_span + np.min(dai)

# Draw the fitted curve in order of increasing age. The rows are in shuffled
# test-split order, and plt.plot connects points in the order given, which
# would otherwise produce a zig-zag instead of a curve.
order = np.argsort(age, kind='stable')

# Plot the results
plt.scatter(age, dai, label='Observed DAI', color='blue')
plt.plot(age[order], fitted_dai[order], label=f'Gompertz-Makeham fit (A={A:.2f}, B={B:.2f}, C={C:.2f})', color='red')
plt.xlabel('Age')
plt.ylabel('DAI')
plt.title('DAI with Gompertz-Makeham Fit')
plt.legend()
plt.show()

In [None]:
# Fit the Gompertz-Makeham form to the mean DAI of each 10-year age group.
# Relies on `curve_fit` imported in the previous cell.

# Concatenate dataframes and prepare data
df = pd.concat([df_y, df_dai], axis=1)

# Convert 'y' to numeric and handle NaN values
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df = df.dropna(subset=['y', 'DAI'])
df = df[np.isfinite(df['y'])]
df = df[np.isfinite(df['DAI'])]

# Extract age and DAI values
age = df['y'].values
dai = df['DAI'].values

# Ensure there are no NaN or inf values
if np.any(np.isnan(age)) or np.any(np.isnan(dai)):
    raise ValueError("Data contains NaN values")
if np.any(np.isinf(age)) or np.any(np.isinf(dai)):
    raise ValueError("Data contains infinite values")

# Group age into 10-year intervals; each group is labelled by its lower bound
# (e.g. 47 -> 40).
df['age_group'] = (df['y'] // 10) * 10
grouped = df.groupby('age_group').agg({'DAI': 'mean'}).reset_index()

# Extract grouped data
age_grouped = grouped['age_group'].values
dai_grouped = grouped['DAI'].values

# Normalize data
# NOTE(review): age_norm / dai_norm are not used further down in this cell.
age_norm = (age - np.min(age)) / (np.max(age) - np.min(age))
dai_norm = (dai - np.min(dai)) / (np.max(dai) - np.min(dai))

# Normalize grouped data with the min/max of the ungrouped values. A group's
# lower bound can be below the smallest observed age, so the normalised group
# age may be slightly negative.
age_grouped_norm = (age_grouped - np.min(age)) / (np.max(age) - np.min(age))
dai_grouped_norm = (dai_grouped - np.min(dai)) / (np.max(dai) - np.min(dai))

# Define the Gompertz-Makeham function (same definition as in the previous cell)
def gompertz_makeham(x, A, B, C):
    return A + B * np.exp(C * x)

# Fit the model to the grouped data; parameters are constrained to
# A, B in [0, 10] and C in [0, 1].
initial_guess = [0.1, 0.1, 0.01]
bounds = (0, [10., 10., 1.])

popt, pcov = curve_fit(gompertz_makeham, age_grouped_norm, dai_grouped_norm, p0=initial_guess, bounds=bounds)
A, B, C = popt

# Generate fitted values for plotting
fitted_dai_grouped_norm = gompertz_makeham(age_grouped_norm, A, B, C)

# Denormalize the fitted values
fitted_dai_grouped = fitted_dai_grouped_norm * (np.max(dai) - np.min(dai)) + np.min(dai)

# Plot the results (groupby returns the groups sorted by age, so the line is
# drawn left to right)
plt.scatter(age_grouped, dai_grouped, label='Mean DAI per Age Group', color='blue')
plt.plot(age_grouped, fitted_dai_grouped, label=f'Gompertz-Makeham fit ', color='red')
plt.xlabel('Age')
plt.ylabel('DAI')
plt.title('Mean DAI by Age Group with Gompertz-Makeham Fit')
plt.legend()
plt.show()

In [None]:
# Fit a plain exponential a * exp(b * x) to the mean DAI of each 10-year age
# group. Relies on `curve_fit` imported in an earlier cell.

# Concatenate dataframes and prepare data
df = pd.concat([df_y, df_dai], axis=1)

# Convert 'y' to numeric and handle NaN values
df['y'] = pd.to_numeric(df['y'], errors='coerce')
df = df.dropna(subset=['y', 'DAI'])
df = df[np.isfinite(df['y'])]
df = df[np.isfinite(df['DAI'])]

# Extract age and DAI values
age = df['y'].values
dai = df['DAI'].values

# Ensure there are no NaN or inf values
if np.any(np.isnan(age)) or np.any(np.isnan(dai)):
    raise ValueError("Data contains NaN values")
if np.any(np.isinf(age)) or np.any(np.isinf(dai)):
    raise ValueError("Data contains infinite values")

# Group age into 10-year intervals; each group is labelled by its lower bound
# (e.g. 47 -> 40).
df['age_group'] = (df['y'] // 10) * 10
grouped = df.groupby('age_group').agg({'DAI': 'mean'}).reset_index()

# Extract grouped data
age_grouped = grouped['age_group'].values
dai_grouped = grouped['DAI'].values

# Normalize the grouped data with the min/max of the ungrouped values
age_min, age_max = np.min(age), np.max(age)
dai_min, dai_max = np.min(dai), np.max(dai)
age_grouped_norm = (age_grouped - age_min) / (age_max - age_min)
dai_grouped_norm = (dai_grouped - dai_min) / (dai_max - dai_min)

# Define the exponential function
def exponential(x, a, b):
    return a * np.exp(b * x)

# Fit exponential model (unbounded, starting from a=1, b=0.01)
popt_exp, _ = curve_fit(exponential, age_grouped_norm, dai_grouped_norm, p0=[1, 0.01])
a_exp, b_exp = popt_exp

# Generate fitted values for the normalized data
fitted_dai_grouped_norm = exponential(age_grouped_norm, *popt_exp)

# Denormalize the fitted values
fitted_dai_grouped = fitted_dai_grouped_norm * (dai_max - dai_min) + dai_min

# Plot the results; the a and b shown in the legend are in normalised units
plt.figure(figsize=(10, 6))
plt.scatter(age_grouped, dai_grouped, label='Mean DAI per Age Group', color='blue')
plt.plot(age_grouped, fitted_dai_grouped, label=f'Exponential fit (a={a_exp:.2f}, b={b_exp:.2f})', color='red')
plt.xlabel('Age')
plt.ylabel('DAI')
plt.title('Mean DAI by Age Group with Exponential Fit')
plt.legend()
plt.show()

In [None]:
def calculate_r_squared(actual, predicted):
    """
    Calculate R-squared (coefficient of determination) for actual and predicted values.

    R-squared is 1 - SS_res / SS_tot, where SS_tot is the sum of squared
    deviations of ``actual`` from its mean and SS_res is the sum of squared
    residuals ``actual - predicted``.

    :param actual: list or numpy array of actual values
    :param predicted: list or numpy array of predicted values
    :return: R-squared value. NaN for empty input. When ``actual`` has zero
        variance the ratio is undefined, so 1.0 is returned for a perfect
        prediction and 0.0 otherwise (same convention as scikit-learn's
        ``r2_score``) instead of NaN / -inf.
    """
    # Convert inputs to numpy arrays for easier computation
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Nothing to score: avoid numpy's "mean of empty slice" warning and 0/0.
    if actual.size == 0:
        return float('nan')

    # Total sum of squares around the mean of the actual values
    total_sum_squares = np.sum((actual - np.mean(actual)) ** 2)

    # Residual sum of squares
    residual_sum_squares = np.sum((actual - predicted) ** 2)

    # Constant target: the usual ratio would divide by zero.
    if total_sum_squares == 0:
        return 1.0 if residual_sum_squares == 0 else 0.0

    return 1 - (residual_sum_squares / total_sum_squares)


In [None]:
actual_data = X_test_scaled
predicted_data = pred[4]

In [None]:
min_length = min(len(actual_data), len(predicted_data))
actual_data = actual_data[:min_length]
predicted_data = predicted_data[:min_length]

In [None]:
r_squared_value = calculate_r_squared(actual_data, predicted_data)
print("R-squared:", r_squared_value)

In [None]:
def calculate_mse(actual, predicted):
    """
    Mean Squared Error between two equally shaped (or broadcastable) inputs.

    :param actual: list or numpy array of actual values
    :param predicted: list or numpy array of predicted values
    :return: mean of the element-wise squared differences
    """
    # Element-wise residuals after converting both inputs to numpy arrays
    residuals = np.array(actual) - np.array(predicted)

    # Average of the squared residuals over every element
    return np.mean(np.square(residuals))

In [None]:
mse_value = calculate_mse(actual_data, predicted_data)
print("Mean Squared Error (MSE):", mse_value)

In [None]:
def calculate_rmse(actual, predicted):
    """
    Root Mean Squared Error: the square root of ``calculate_mse``.

    :param actual: list or numpy array of actual values
    :param predicted: list or numpy array of predicted values
    :return: Root Mean Squared Error (RMSE) value
    """
    return np.sqrt(calculate_mse(actual, predicted))

In [None]:
rmse = calculate_rmse(actual_data, predicted_data)
print("RMSE:", rmse)

In [None]:
def extract_latent_representations_from_loader(loader):
    """
    Encode every batch of ``loader`` and return the stacked latent vectors.

    Each item yielded by ``loader`` must unpack into three inputs
    (cross, present, future). Every input is passed through the global
    ``model.vae.encoder``; of its three outputs only the third is kept.

    :param loader: iterable of (cross, present, future) batches
    :return: tuple of three numpy arrays (cross, present, future), each the
        concatenation along axis 0 of the per-batch latent vectors
    """
    cross_chunks, present_chunks, future_chunks = [], [], []

    for cross_batch, present_batch, future_batch in loader:
        # Pair each input with the list that collects its encodings.
        for batch, chunks in (
            (cross_batch, cross_chunks),
            (present_batch, present_chunks),
            (future_batch, future_chunks),
        ):
            _, _, z_latent = model.vae.encoder(batch)
            chunks.append(z_latent.numpy())

    return (
        np.concatenate(cross_chunks, axis=0),
        np.concatenate(present_chunks, axis=0),
        np.concatenate(future_chunks, axis=0),
    )


In [None]:
with tf.device('/CPU:0'):
    X_train_latent_cross, X_train_latent_present, X_train_latent_future = extract_latent_representations_from_loader(train_loader)

In [None]:
# TSNE is imported here because the notebook's own
# `from sklearn.manifold import TSNE` cell comes after this one, so on a
# fresh kernel (Restart & Run All) this cell would otherwise raise NameError.
from sklearn.manifold import TSNE

num_cross = X_train_latent_cross.shape[0]
num_present = X_train_latent_present.shape[0]
num_future = X_train_latent_future.shape[0]

# Concatenate latent representations of all three datasets
X_all_latent = np.concatenate((X_train_latent_cross, X_train_latent_present, X_train_latent_future), axis=0)

# Apply t-SNE to reduce dimensionality.
# scikit-learn requires perplexity < n_samples, so the requested value of 1000
# is capped for small inputs instead of failing.
tsne_perplexity = min(1000, X_all_latent.shape[0] - 1)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
X_tsne = tsne.fit_transform(X_all_latent)

# Row ranges of each dataset inside the concatenated embedding
cross_end = num_cross
present_end = num_cross + num_present

# Visualize the t-SNE embeddings with different colors for each dataset
plt.figure(figsize=(8, 6))
plt.scatter(X_tsne[:cross_end, 0], X_tsne[:cross_end, 1], color='blue', label='Cross Dataset', s=10)
plt.scatter(X_tsne[cross_end:present_end, 0], X_tsne[cross_end:present_end, 1], color='red', label='Present Dataset', s=10)
plt.scatter(X_tsne[present_end:, 0], X_tsne[present_end:, 1], color='green', label='Future Dataset', s=10)
plt.title('t-SNE Visualization of Latent Representations')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
with tf.device('/CPU:0'):
    Z_mean, Z_log_var, Z = vae.encoder(X_train_scaled)

In [None]:
mean = tf.reduce_mean(Z, axis=0)
stddev = tf.math.reduce_std(Z, axis=0)
standardized = (Z - mean) / stddev

In [None]:
print(standardized.shape)

In [None]:
from sklearn.manifold import TSNE

In [None]:
latent_tsne = TSNE(n_components=2, init='random', perplexity=100).fit_transform(standardized)

In [None]:
np.savetxt("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\tsne.csv", latent_tsne, delimiter=',')

In [None]:
tsne_df = pd.DataFrame(latent_tsne)

In [None]:
y_train = pd.DataFrame(y_train)

In [None]:
tsne = pd.concat([y_train.reset_index(drop=True), tsne_df.reset_index(drop=True)], axis=1)

In [None]:
tsne.columns = ['Age', 'tsne1', 'tsne2']

In [None]:
tsne = tsne.sort_values(by = 'Age')

In [None]:
tsne = tsne.dropna(subset=['Age'])

In [None]:
tsne_copy = tsne.copy()

In [None]:
#Grouping samples by every 10 years
tsne_copy['Age_group'] = pd.cut(tsne['Age'], bins=range(0,101,10), right=False)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [None]:
#plotting
plt.figure(figsize=(10, 6))
colors = cm.tab10(np.linspace(0, 1, len(tsne_copy['Age_group'].unique()))) #colors based on age

In [None]:
# Scatter the 2-D t-SNE embedding with one colour per age bin.
# groupby(..., observed=False) also yields bins that contain no samples, so the
# palette is sized from the groups actually iterated; sizing it from the
# observed unique values can leave it too short and raise IndexError.
age_groups = list(tsne_copy.groupby('Age_group', observed=False))
palette = cm.tab10(np.linspace(0, 1, max(len(age_groups), 1)))

# The figure is opened in this cell so the requested size applies to the
# figure that is actually drawn and shown here.
plt.figure(figsize=(10, 6))
for i, (key, group) in enumerate(age_groups):
    plt.scatter(group['tsne1'], group['tsne2'], label=key, color=palette[i], s=8)
plt.xlabel('t_SNE1')
plt.ylabel('t_SNE2')
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(False)
plt.show()

In [None]:
Z_mean_t, Z_log_var_t, Z_t = vae.encoder(X0_test_scaled)
Z_mean_t1, Z_log_var_t1, Z_t1 = vae.encoder(X1_test_scaled)

In [None]:
latent_tsne1 = TSNE(n_components=2, init='random', perplexity=50, random_state=10).fit_transform(Z_t)
latent_tsne2 = TSNE(n_components=2, init='random', perplexity=50, random_state=10).fit_transform(Z_t1)

In [None]:
# Plotting
plt.scatter(latent_tsne1[:, 0], latent_tsne1[:, 1], label='t')
plt.scatter(latent_tsne2[:, 0], latent_tsne2[:, 1], label='t+1')
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Embed the t and t+1 test sets jointly so both share one t-SNE space.
# Concatenate the data along with time labels (0 = t, 1 = t+1)
n_t = X0_test_scaled.shape[0]
X_concat = np.concatenate((X0_test_scaled, X1_test_scaled), axis=0)
time_labels = np.concatenate((np.zeros(n_t), np.ones(X1_test_scaled.shape[0])))

# Perform t-SNE on the concatenated data
latent_tsne_concat = TSNE(n_components=2, init='random', perplexity=50, random_state=10).fit_transform(X_concat)

# Split the t-SNE results back into t and t+1.
# The t rows come first, so the t+1 block starts at the number of t rows
# (not at the number of t+1 rows, which is only right when both sets have
# the same length).
latent_tsne_t = latent_tsne_concat[:n_t]
latent_tsne_t1 = latent_tsne_concat[n_t:]

# Plotting
plt.scatter(latent_tsne_t[:, 0], latent_tsne_t[:, 1], label='t')
plt.scatter(latent_tsne_t1[:, 0], latent_tsne_t1[:, 1], label='t+1')
plt.title('t-SNE Visualization of Concatenated Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:

# Embed the t and t+1 training sets jointly so both share one t-SNE space.
# Concatenate the data along with time labels (0 = t, 1 = t+1)
n_t = X0_train_scaled.shape[0]
X_concat = np.concatenate((X0_train_scaled, X1_train_scaled), axis=0)
time_labels = np.concatenate((np.zeros(n_t), np.ones(X1_train_scaled.shape[0])))

# Perform t-SNE on the concatenated data
latent_tsne_concat = TSNE(n_components=2, init='random', perplexity=500, random_state=10).fit_transform(X_concat)

# Split the t-SNE results back into t and t+1.
# The t+1 block starts right after the t rows, i.e. at the number of t rows.
latent_tsne_t = latent_tsne_concat[:n_t]
latent_tsne_t1 = latent_tsne_concat[n_t:]

# Plotting
plt.scatter(latent_tsne_t[:, 0], latent_tsne_t[:, 1], label='t')
plt.scatter(latent_tsne_t1[:, 0], latent_tsne_t1[:, 1], label='t+1')
plt.title('t-SNE Visualization of Concatenated Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
Z_mean_t, Z_log_var_t, Z_t = vae.encoder(X0_train_scaled)
Z_mean_t1, Z_log_var_t1, Z_t1 = vae.encoder(X1_train_scaled)

In [None]:
latent_tsne1 = TSNE(n_components=2, init='random', perplexity=400, random_state=0).fit_transform(Z_t)
latent_tsne2 = TSNE(n_components=2, init='random', perplexity=400, random_state=0).fit_transform(Z_t1)

In [None]:
# Plotting
plt.scatter(latent_tsne1[:, 0], latent_tsne1[:, 1], label='t')
plt.scatter(latent_tsne2[:, 0], latent_tsne2[:, 1], label='t+1')
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
Z_mean, Z_log_var, Z = vae.encoder(X_test_scaled)
Z_mean_t, Z_log_var_t, Z_t = vae.encoder(X0_test_scaled)
Z_mean_t1, Z_log_var_t1, Z_t1 = vae.encoder(X1_test_scaled)

In [None]:
latent_tsne = TSNE(n_components=2, init='random', perplexity=500, random_state=10).fit_transform(Z)
latent_tsne1 = TSNE(n_components=2, init='random', perplexity=50, random_state=10).fit_transform(Z_t)
latent_tsne2 = TSNE(n_components=2, init='random', perplexity=50, random_state=10).fit_transform(Z_t1)

In [None]:
# Plotting
plt.scatter(latent_tsne[:, 0], latent_tsne[:, 1], label='c')
plt.scatter(latent_tsne1[:, 0], latent_tsne1[:, 1], label='t')
plt.scatter(latent_tsne2[:, 0], latent_tsne2[:, 1], label='t+1')
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
latent_tsne1 = TSNE(n_components=2, init='random', perplexity=30, random_state=10).fit_transform(X0_test_scaled)
latent_tsne2 = TSNE(n_components=2, init='random', perplexity=30, random_state=10).fit_transform(X1_test_scaled)

In [None]:
# Plotting
plt.scatter(latent_tsne1[:, 0], latent_tsne1[:, 1], label='t')
plt.scatter(latent_tsne2[:, 0], latent_tsne2[:, 1], label='t+1')
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
latent_tsne1 = TSNE(n_components=3, init='random', perplexity=50, random_state=42).fit_transform(Z_t)
latent_tsne2 = TSNE(n_components=3, init='random', perplexity=50, random_state=42).fit_transform(Z_t1)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
# Plotting
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot for dataset 1
ax.scatter(latent_tsne1[:, 0], latent_tsne1[:, 1], latent_tsne1[:, 2], c='b', label='t')

# Scatter plot for dataset 2
ax.scatter(latent_tsne2[:, 0], latent_tsne2[:, 1], latent_tsne2[:, 2], c='r', label='t+1')

ax.set_title('t-SNE 3D Visualization')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_zlabel('t-SNE Component 3')
ax.legend()

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


# Perform PCA on each dataset
pca = PCA(n_components=2)

latent_pca_1 = pca.fit_transform(Z_t)
latent_pca_2 = pca.fit_transform(Z_t1)

# Plotting
plt.scatter(latent_pca_1[:, 0], latent_pca_1[:, 1], label='Dataset 1')
plt.scatter(latent_pca_2[:, 0], latent_pca_2[:, 1], label='Dataset 2')
plt.title('PCA Visualization')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # Importing 3D plotting tools
from sklearn.decomposition import PCA

# Perform PCA on each dataset
pca = PCA(n_components=3)

latent_pca_1 = pca.fit_transform(Z_t)
latent_pca_2 = pca.fit_transform(Z_t1)

# Plotting
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot for dataset 1
ax.scatter(latent_pca_1[:, 0], latent_pca_1[:, 1], latent_pca_1[:, 2], label='Dataset 1')

# Scatter plot for dataset 2
ax.scatter(latent_pca_2[:, 0], latent_pca_2[:, 1], latent_pca_2[:, 2], label='Dataset 2')

ax.set_title('PCA 3D Visualization')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_zlabel('PCA Component 3')
ax.legend()

plt.show()


In [None]:
combined = np.concatenate([X0_train_scaled, X1_train_scaled])

In [None]:
# Standardize the data
combined_standardized = (combined - combined.mean(axis=0)) / combined.std(axis=0)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # Importing 3D plotting tools
from sklearn.decomposition import PCA


# Project the stacked training inputs (t rows first, then t+1 rows) onto the
# first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(combined)

# `combined` is built from X0_train_scaled followed by X1_train_scaled, so the
# boundary between the two timepoints is the number of X0_train_scaled rows.
# (Z_t holds whichever dataset was encoded last and need not have that length.)
n_t = len(X0_train_scaled)

# Plot PCA results
plt.figure(figsize=(15, 7))
plt.subplot(1, 2, 1)
plt.scatter(pca_result[:n_t, 0], pca_result[:n_t, 1], c='b', label='Timepoint 1')
plt.scatter(pca_result[n_t:, 0], pca_result[n_t:, 1], c='r', label='Timepoint 2')
plt.title('PCA')
plt.legend()


In [None]:
tsne_result = TSNE(n_components=3, init='random', perplexity=300, random_state=0).fit_transform(combined)


In [None]:
# Scatter the first two t-SNE components of the stacked training inputs.
# Rows [0, n_t) belong to t and rows [n_t, end) to t+1; the same boundary is
# used for the x and the y coordinates so both always have equal length.
n_t = len(X0_train_scaled)

plt.figure(figsize=(15, 7))

plt.subplot(1, 2, 2)
plt.scatter(tsne_result[:n_t, 0], tsne_result[:n_t, 1], c='b', label='t')
plt.scatter(tsne_result[n_t:, 0], tsne_result[n_t:, 1], c='r', label='t+1')
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
Z_mean_t, Z_log_var_t, Z_t = vae.encoder(X0_train_scaled)
Z_mean_t1, Z_log_var_t1, Z_t1 = vae.encoder(X1_train_scaled)

In [None]:
combined = np.concatenate([Z_t, Z_t1])

In [None]:
# Standardize the data
combined_standardized = (combined - combined.mean(axis=0)) / combined.std(axis=0)

In [None]:
tsne_result = TSNE(n_components=2, init='random', perplexity=350, random_state=42).fit_transform(combined_standardized)


In [None]:
# Scatter the t-SNE embedding of the stacked latent codes (Z_t rows first,
# then Z_t1 rows). One boundary, len(Z_t), is used for both coordinates so the
# x and y slices cannot differ in length.
n_t = len(Z_t)

plt.figure(figsize=(15, 7))

plt.subplot(1, 2, 2)
plt.scatter(tsne_result[:n_t, 0], tsne_result[:n_t, 1], c='b', label='t')
plt.scatter(tsne_result[n_t:, 0], tsne_result[n_t:, 1], c='r', label='t+1')
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
#3D plot

In [None]:
threeD_latent_tsne = TSNE(n_components=3, init='random', perplexity=200).fit_transform(Z)

In [None]:
# Persist the 3-component embedding computed in the previous cell
# (threeD_latent_tsne), not the older 2-component `latent_tsne`.
np.savetxt("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\3Dtsne.csv", threeD_latent_tsne, delimiter=',')

In [None]:
tsne_df = pd.DataFrame(threeD_latent_tsne)
y_test = pd.DataFrame(y_test)

In [None]:
tsne = pd.concat([y_test.reset_index(drop=True), tsne_df.reset_index(drop=True)], axis=1)

In [None]:
tsne.columns = ['Age', 'tsne1', 'tsne2', 'tsne3']

In [None]:
# Order rows by age, drop rows without an age, and work on a copy
tsne = tsne.sort_values(by = 'Age')
tsne = tsne.dropna(subset=['Age'])
tsne_copy = tsne.copy()
# Group samples into 30-year bins: range(0,101,30) gives the edges 0, 30, 60, 90
# and right=False makes the bins [0,30), [30,60), [60,90).
# Ages of 90 and above fall outside every bin and get NaN as Age_group.
tsne_copy['Age_group'] = pd.cut(tsne['Age'], bins=range(0,101,30), right=False)

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

In [None]:
traces = []
for key, group in tsne_copy.groupby('Age_group', observed=False):
    trace = go.Scatter3d(
        x=group['tsne1'],
        y=group['tsne2'],
        z=group['tsne3'],  # Assuming you have a 'tsne3' column for the third dimension
        mode='markers',
        marker=dict(
            size=6,
            opacity=0.8
        ),
        name=str(key)
    )
    traces.append(trace)

In [None]:
layout = go.Layout(
    scene=dict(
        xaxis=dict(title='tsne1'),
        yaxis=dict(title='tsne2'),
        zaxis=dict(title='tsne3')
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255,255,255,0)',
        bordercolor='rgba(255,255,255,0)',
        itemwidth=50
    )
)

In [None]:
fig = go.Figure(data=traces, layout=layout)
fig.show()

In [None]:
pio.write_html(fig, 'C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\Koopman best r2_78\\Next 280 epochs\\3d_tsne_plot.html')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Calculate pairwise distances between latent vectors.
# Z[:, None] - Z broadcasts every row against every other row before the norm
# over the last axis collapses it to one distance per pair.
# NOTE(review): assumes Z is 2-D (samples, latent_dim) — then the intermediate
# difference array is (samples, samples, latent_dim) and can be very large.
pairwise_distances = np.linalg.norm(Z[:, None] - Z, axis=-1)

# Create heatmap of the square distance matrix
plt.figure(figsize=(8, 6))
sns.heatmap(pairwise_distances, cmap='viridis', square=True)
plt.title('Pairwise Distance Heatmap of Autoencoder Latent Space')
plt.xlabel('Data Points')
plt.ylabel('Data Points')
plt.show()

In [None]:
with tf.device('/GPU:0'):
    Z_mean, Z_log_var, Z = vae.encoder(test_loader)

In [None]:
print(Z.shape)

In [None]:
from scipy import stats
def correlation_analysis(array1, array2, threshold=0.001):
    """
    Pearson-correlate every column of ``array1`` with every column of ``array2``.

    :param array1: 2-D array-like; its columns index the rows of the result
    :param array2: 2-D array-like; its columns index the columns of the result
    :param threshold: p-value cut-off; pairs with p < threshold are reported
    :return: (correlations, p_values, significant_features) where the first two
        are arrays of shape (array1 columns, array2 columns) and the last is a
        list of (row, col) index pairs ordered by increasing p-value
    """
    n_rows, n_cols = array1.shape[1], array2.shape[1]
    correlations = np.zeros((n_rows, n_cols))
    p_values = np.zeros((n_rows, n_cols))

    # Fill both matrices one column pair at a time.
    for row in range(n_rows):
        for col in range(n_cols):
            coefficient, p = pearson_correlation(array1[:, row], array2[:, col])
            correlations[row, col] = coefficient
            p_values[row, col] = p

    # Flat argsort ranks all pairs by p-value; unravel back to (row, col).
    order = np.argsort(p_values, axis=None)
    ranked_rows, ranked_cols = np.unravel_index(order, p_values.shape)

    # Keep, in ranked order, only the pairs below the significance threshold.
    significant_features = []
    for row, col in zip(ranked_rows, ranked_cols):
        if p_values[row, col] < threshold:
            significant_features.append((row, col))

    return correlations, p_values, significant_features

def pearson_correlation(x, y):
    """
    Pearson correlation between two 1-D samples.

    :param x: first sample
    :param y: second sample, same length as ``x``
    :return: (correlation coefficient, two-sided p-value) from scipy.stats.pearsonr
    """
    correlation, p_value = stats.pearsonr(x, y)
    return correlation, p_value


In [None]:
correlations, p_values, significant_features = correlation_analysis(Z, X_test_scaled)

In [None]:
# Writing to CSV file
correlations_df = pd.DataFrame(correlations)
p_values_df = pd.DataFrame(p_values)
significant_features_df = pd.DataFrame(significant_features, columns=[f"Feature{i+1}" for i in range(len(significant_features[0]))])

correlations_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\correlations.csv", index=False)
p_values_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\p_values.csv", index=False)
significant_features_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\significant_features.csv", index=False)

In [None]:
# Creating a list of dictionaries where each dictionary represents a row in the DataFrame
rows = []
for significant_tuple in significant_features:
    row_dict = {}
    for i, feature_index in enumerate(significant_tuple):
        row_dict[f"Feature{i+1}"] = feature_index
    rows.append(row_dict)

# Creating the DataFrame
significant_features_df = pd.DataFrame(rows)
significant_features_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\significant_features.csv", index=False)

In [None]:
#correlation based on percentile and z-score method

In [None]:
from scipy import stats
def correlation_analysis(array1, array2, threshold_percentile=99):
    """
    Pearson-correlate every column of ``array1`` with every column of ``array2``
    and flag the pairs whose absolute correlation is in the top tail.

    :param array1: 2-D array-like; its columns index the rows of the result
    :param array2: 2-D array-like; its columns index the columns of the result
    :param threshold_percentile: percentile of |correlation| used as cut-off
        when every column passes the normality check
    :return: (correlations, p_values, significant_features); the first two have
        shape (array1 columns, array2 columns), the last is the
        (row_indices, col_indices) tuple returned by ``np.where``
    """
    correlations = np.zeros((array1.shape[1], array2.shape[1]))
    p_values = np.zeros((array1.shape[1], array2.shape[1]))

    for i in range(array1.shape[1]):
        for j in range(array2.shape[1]):
            correlation, p_value = pearson_correlation(array1[:, i], array2[:, j])
            correlations[i, j] = correlation
            p_values[i, j] = p_value

    # The selection below compares |r| with the cut-off, so the cut-off must be
    # a percentile of |r| as well (a percentile of the signed values is not
    # comparable with absolute values).
    abs_correlations = np.abs(correlations)

    # Check if data is normally distributed
    is_normally_distributed = check_normal_distribution(np.concatenate((array1, array2), axis=1))

    # Choose the percentile that defines the upper tail.
    if is_normally_distributed:
        cutoff_percentile = threshold_percentile
    else:
        # Fallback: the 1.96-sigma level, i.e. about the 97.5th percentile.
        # Using 100 minus this value would put the cut-off at the 2.5th
        # percentile and mark almost every pair as significant.
        cutoff_percentile = stats.norm.cdf(1.96) * 100
    threshold = np.percentile(abs_correlations, cutoff_percentile)

    # Index pairs whose absolute correlation exceeds the cut-off
    significant_features = np.where(abs_correlations > threshold)

    return correlations, p_values, significant_features

def pearson_correlation(x, y):
    """Return (Pearson r, two-sided p-value) for the two samples via scipy.stats.pearsonr."""
    coefficient, significance = stats.pearsonr(x, y)
    return (coefficient, significance)

def check_normal_distribution(data):
    """
    Run scipy.stats.normaltest on ``data`` and report whether every resulting
    p-value is above 0.01 (i.e. normality is not rejected anywhere).
    """
    p_value = stats.normaltest(data)[1]
    not_rejected = p_value > 0.01
    return not_rejected.all()

In [None]:
correlations, p_values, significant_features = correlation_analysis(Z, X_test_scaled)

In [None]:
# Wrap the analysis results in DataFrames for export
correlations_df = pd.DataFrame(correlations)
p_values_df = pd.DataFrame(p_values)
# significant_features is the (row_indices, col_indices) tuple from np.where.
# Build one row per significant pair (Feature1 = row index, Feature2 = column
# index of the correlation matrix) rather than a 2-row frame with one column
# per pair.
significant_features_df = pd.DataFrame({
    "Feature1": significant_features[0],
    "Feature2": significant_features[1],
})

In [None]:
print(significant_features_df.shape)

In [None]:
correlations_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\correlations_normality_checked.csv", index=False)
p_values_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\p_values_normality_checked.csv", index=False)
significant_features_df.to_csv("C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\significant_features_normality_checked.csv", index=False)

In [None]:
is_normally_distributed = check_normal_distribution(Z)
print("Is the data normally distributed?", is_normally_distributed)

In [None]:
# Calculate -log10(p-values); p_values keeps its 2-D (row, col) shape here
neg_log_p_values = -np.log10(p_values)

# Extract correlation coefficients as a flat 1-D array
correlation_coefficients = correlations.flatten()

# Plot volcano plot: correlation on x, significance on y
plt.figure(figsize=(8, 6))
plt.scatter(correlation_coefficients, neg_log_p_values, color='blue', alpha=0.5)
# NOTE(review): `threshold` is not assigned at notebook level in the cells
# visible here (inside correlation_analysis it is only a local) — confirm it is
# defined earlier, and that it is a p-value cut-off, since it is drawn on the
# -log10(p) axis.
plt.axhline(-np.log10(threshold), color='red', linestyle='--', label=f'Threshold = {threshold:.2f}')
plt.xlabel('Correlation Coefficient')
plt.ylabel('-log10(p-value)')
plt.title('Volcano Plot')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Get the Koopman matrix
koopman_matrix = model.koopman.koopman_operator.get_koopman_matrix()

# Compute the eigenvectors and eigenvalues
eigenvalues, eigenvectors = np.linalg.eig(koopman_matrix)

In [None]:
# Sort eigenvectors based on eigenvalues
sorted_indices = np.argsort(eigenvalues)[::-1]  # Sort in descending order
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

In [None]:
import matplotlib.pyplot as plt
# Plot the eigenvectors (one line per eigenvector, x = state index)
num_eigenvectors_to_plot = 50  # Change this according to the number of eigenvectors you want to plot
# Never index past the last column when the matrix has fewer eigenvectors than requested.
for i in range(min(num_eigenvectors_to_plot, eigenvectors.shape[1])):
    # np.linalg.eig can return complex vectors; plot the real part explicitly
    # instead of relying on matplotlib's implicit (warning-raising) cast.
    plt.plot(np.real(eigenvectors[:, i]), label=f'Eigenvector {i+1}')

plt.xlabel('Index')
plt.ylabel('Value')
plt.title('Eigenvectors of Koopman Operator')
#plt.legend()
plt.show()

In [None]:
# Bar chart of eigenvalue magnitudes. Eigenvalues can be complex, so the
# modulus is plotted to match the axis label; passing the raw values would
# silently drop the imaginary part.
plt.bar(np.arange(len(eigenvalues)), np.abs(eigenvalues))
plt.xlabel('Eigenvalue Index')
plt.ylabel('Eigenvalue Magnitude')
plt.title('Eigenvalues of Koopman Matrix')
plt.show()

In [None]:
# Plot eigenvalues on the complex plane
plt.figure(figsize=(8, 6))
plt.scatter(np.real(eigenvalues), np.imag(eigenvalues), color='blue', label='Eigenvalues')
plt.axhline(y=0, color='black', linestyle='--', linewidth=0.5)  # Horizontal line at y=0 (real axis)
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.5)  # Vertical line at x=0 (imaginary axis)
plt.xlabel('Real Part')
plt.ylabel('Imaginary Part')
plt.title('Eigenvalues on Complex Plane')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Interpretation
print("Eigenvalues:")
for eigenvalue in eigenvalues:
    print(f"Real Part: {np.real(eigenvalue):.2f}, Imaginary Part: {np.imag(eigenvalue):.2f}, Magnitude: {np.abs(eigenvalue):.2f}")

# Interpretation guidelines (replace with your analysis).
# NOTE(review): these assume the Koopman matrix advances the latent state by one
# step (t -> t+1), as the t / t+1 datasets in this notebook suggest. For such a
# one-step (discrete-time) operator, stability is decided by the eigenvalue
# magnitude relative to 1, not by the sign of the real part (that criterion
# applies to continuous-time generators).
print("\nInterpretation:")
print("- Eigenvalues with magnitude below 1 represent stable (decaying) modes.")
print("- Eigenvalues with magnitude above 1 represent unstable (growing) modes.")
print("- Eigenvalues with magnitude equal to 1 represent neutral modes.")
print("- Nonzero imaginary parts indicate oscillatory behavior.")

In [None]:
normalized_eigenvectors = eigenvectors / np.linalg.norm(eigenvectors, axis=0)

# Plotting the heatmap
plt.figure(figsize=(10, 6))
plt.imshow(normalized_eigenvectors[:, :num_eigenvectors_to_plot], cmap='viridis', aspect='auto')
plt.colorbar(label='Normalized Value')
plt.xlabel('Eigenvector Index')
plt.ylabel('State Index')
plt.title('Heatmap of Eigenvectors of Koopman Operator')
plt.show()

In [None]:
def plotEignValues(model):
    '''
    Plots the eigen values of the learned Koopman operator.

    Draws a scatter plot with log(|eigenvalue|) on the x axis and the imaginary
    part of the eigenvalue on the y axis. All open figures are closed first.

    Args:
        model: object exposing ``koopman.koopman_operator.get_koopman_matrix()``,
            whose result provides ``.numpy()``
    Returns:
        None. If the eigen-decomposition fails, a message is printed and
        nothing is plotted.
    '''
    # Get koopman operator from model
    kMatrix = model.koopman.koopman_operator.get_koopman_matrix().numpy()

    try:
        w, _ = np.linalg.eig(kMatrix)
    except np.linalg.LinAlgError:
        # Only numerical failures of the decomposition are reported here;
        # unrelated errors are allowed to surface instead of being hidden.
        print('issue computing eigs')
        return

    plt.close('all')
    plt.scatter(np.real(np.log(np.abs(w))), np.imag(w))

In [None]:
plotEignValues(model)

In [None]:
def plotEignVectors(model, n=3):
    '''
    Prepares a figure for the eigen vectors of the learned Koopman operator.

    Computes the eigen-decomposition of the Koopman matrix, reorders the
    eigenvalues and eigenvectors by descending np.argsort order, and creates a
    1 x n row of axes.
    NOTE(review): nothing is drawn on the axes in this body and `yPred` is not
    used afterwards — the function looks unfinished; confirm intent.

    Args:
        model: object exposing ``koopman.koopman_operator.get_koopman_matrix()``,
            whose result provides ``.numpy()``
        n (int): number of subplot columns to create
    '''
    # Get koopman operator from model
    kMatrix = model.koopman.koopman_operator.get_koopman_matrix().numpy()
    w, v = np.linalg.eig(kMatrix)
    # Descending order of np.argsort(w); keep eigenvector columns aligned with w
    idx = np.argsort(w)[::-1]
    w = w[idx]
    v = v[:,idx]

    # Reads the notebook-level `test_loader`
    # NOTE(review): the slice below presumes it is a 3-D array-like — confirm
    data = test_loader
    yPred = tf.constant(data[:,:,:-1], dtype=tf.float32)

    plt.close("all")
    fig, ax = plt.subplots(1, n, figsize=(4*n, 4))


In [None]:
plotEignVectors(model, n=3)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [None]:
combined_data = np.concatenate([X0_train_scaled, X1_train_scaled])

In [None]:
combined_latent = np.concatenate([Z_t, Z_t1])

In [None]:
print(combined_latent.shape)

In [None]:
from scipy import stats
def correlation_analysis(array1, array2, threshold_percentile=99):
    """
    Compute the Pearson correlation of each ``array1`` column with each
    ``array2`` column and select the pairs with the largest absolute correlation.

    :param array1: 2-D array-like; its columns index the rows of the result
    :param array2: 2-D array-like; its columns index the columns of the result
    :param threshold_percentile: percentile of |correlation| used as cut-off
        when every column passes the normality check
    :return: (correlations, p_values, significant_features); the first two have
        shape (array1 columns, array2 columns), the last is the
        (row_indices, col_indices) tuple returned by ``np.where``
    """
    n_rows, n_cols = array1.shape[1], array2.shape[1]
    correlations = np.zeros((n_rows, n_cols))
    p_values = np.zeros((n_rows, n_cols))

    for i in range(n_rows):
        for j in range(n_cols):
            correlations[i, j], p_values[i, j] = pearson_correlation(array1[:, i], array2[:, j])

    # Pairs are selected on |r|, so the cut-off is taken from the distribution
    # of |r| too; a percentile of the signed values would not be comparable.
    abs_correlations = np.abs(correlations)

    # Check if data is normally distributed
    is_normally_distributed = check_normal_distribution(np.concatenate((array1, array2), axis=1))

    if is_normally_distributed:
        cutoff_percentile = threshold_percentile
    else:
        # 1.96-sigma level, about the 97.5th percentile. Subtracting it from 100
        # would give the 2.5th percentile and select nearly every pair.
        cutoff_percentile = stats.norm.cdf(1.96) * 100
    threshold = np.percentile(abs_correlations, cutoff_percentile)

    # Index pairs whose absolute correlation exceeds the cut-off
    significant_features = np.where(abs_correlations > threshold)

    return correlations, p_values, significant_features

def pearson_correlation(x, y):
    """Thin wrapper over scipy.stats.pearsonr returning (coefficient, p-value)."""
    r_value, p = stats.pearsonr(x, y)
    return r_value, p

def check_normal_distribution(data):
    """
    Return whether scipy.stats.normaltest fails to reject normality
    (p-value > 0.01) for every p-value it produces for ``data``.
    """
    _statistic, p_value = stats.normaltest(data)
    passes = p_value > 0.01
    return passes.all()

In [None]:
correlations, p_values, significant_features = correlation_analysis(combined_data, combined_latent)

In [None]:
print(correlations.shape)

In [None]:
rf = RandomForestRegressor(n_estimators = 100)
rf.fit(combined_data, combined_latent)

In [None]:
feature_importances = rf.feature_importances_

In [None]:
feature_importances

In [None]:
# Bar chart of the random-forest feature importances, one bar per input feature
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importances)), feature_importances, color='skyblue')
plt.xlabel("Feature Index")
plt.ylabel("Feature Importance")
plt.title('Feature Importance for Predicting Latent Space')
# plt.show must be called; a bare `plt.show` only evaluates to the function object.
plt.show()

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, cmap='coolwarm', xticklabels=False)
plt.title('Correlation between Features and Latent Space')
plt.xlabel('Latent Space')
plt.ylabel('Features')
plt.show()

In [None]:
for key, indices in grouped_indices.items():
    print("Group", key, ":")
    for index in indices:
        print("Feature index:", index)

In [None]:
group_1_array = np.array(grouped_indices[1])
print("Shape of Group 1 array:", group_1_array.shape)

In [None]:
import os
import csv

# Define the directory where CSV files will be saved
output_directory = "C:\\Users\\G20187729\\Desktop\\Python_codes_3\\Data\\correlations"

# Assuming grouped_indices is a dictionary containing groups of feature indices

# Define a function to write a group to a CSV file
def write_group_to_csv(group_indices, group_number):
    """
    Write one group of feature indices to ``Group_<group_number>_indices.csv``
    inside the notebook-level ``output_directory``.

    The file has a single "Feature Index" header followed by one index per row.

    :param group_indices: iterable of feature indices belonging to the group
    :param group_number: identifier used in the file name
    """
    # Create the target directory on first use so open() does not fail.
    os.makedirs(output_directory, exist_ok=True)
    filename = os.path.join(output_directory, f"Group_{group_number}_indices.csv")
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Feature Index"])
        writer.writerows([index] for index in group_indices)

# Write each group to a separate CSV file
for group_number, indices in grouped_indices.items():
    write_group_to_csv(indices, group_number)

In [None]:
print(correlations.shape)

In [None]:
# significant_features is the (row_indices, col_indices) tuple from np.where.
# Build one row per significant pair (Feature1 = row index, Feature2 = column
# index of the correlation matrix) instead of a 2-row frame with one column
# per pair.
significant_features_df = pd.DataFrame({
    "Feature1": significant_features[0],
    "Feature2": significant_features[1],
})
print(significant_features_df.shape)

In [None]:
significant_features

In [None]:
# Directory to store CSV files (relative to the current working directory)
directory = "significant_feature_groups"
if not os.path.exists(directory):
    os.makedirs(directory)

# Create a DataFrame with significant_features.
# significant_features is the np.where result (row_indices, col_indices) of the
# correlation matrix, so the two columns below are deliberately swapped.
df = pd.DataFrame({
    'Feature1': significant_features[1],  # second np.where array: column indices of the correlation matrix
    'Feature2': significant_features[0]   # first np.where array: row indices of the correlation matrix
})

# Group by unique values of Feature1 (one group per correlation-matrix column)
groups = df.groupby('Feature1')

# Iterate over groups and create individual CSV files
for group_name, group_data in groups:
    filename = f"group_{group_name}.csv"
    filepath = os.path.join(directory, filename)
    group_data[['Feature2']].to_csv(filepath, index=False)  # Save only Feature2 column

In [None]:
import numpy as np
from joblib import Parallel, delayed
from scipy.stats import spearmanr

def correlation_worker(i, j, array1, array2):
    """
    Spearman rank correlation between column ``i`` of ``array1`` and column
    ``j`` of ``array2``.

    :return: (correlation, p_value) as produced by scipy.stats.spearmanr
    """
    left_column = array1[:, i]
    right_column = array2[:, j]
    rho, p = spearmanr(left_column, right_column)
    return (rho, p)

def correlation_analysis_parallel(array1, array2, threshold_percentile=99, n_jobs=-1):
    """Spearman-correlate every column of ``array1`` with every column of ``array2``.

    One ``correlation_worker`` task is dispatched per column pair via joblib.

    Parameters
    ----------
    array1, array2 : 2-D arrays (must support ``.shape`` and ``[:, k]`` indexing)
        Column ``i`` of ``array1`` is compared with column ``j`` of ``array2``.
    threshold_percentile : float, default 99
        Percentile of the absolute correlations used as the cut-off; pairs whose
        absolute correlation is strictly greater than it are reported.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``; -1 uses all available CPU cores.

    Returns
    -------
    correlations : ndarray of shape (array1.shape[1], array2.shape[1])
    p_values : ndarray of the same shape
    significant_features : tuple of two index arrays, as returned by ``np.where``
        First array indexes columns of ``array1``, second indexes columns of
        ``array2``.

    Notes
    -----
    NaN correlations are ignored when computing the percentile and are never
    reported as significant. Previously a single NaN made the threshold NaN,
    which silently emptied the result.
    """
    n_rows, n_cols = array1.shape[1], array2.shape[1]

    results = Parallel(n_jobs=n_jobs)(
        delayed(correlation_worker)(i, j, array1, array2)
        for i in range(n_rows) for j in range(n_cols)
    )

    # Tasks were generated in row-major (i, j) order, so the flat list of
    # (correlation, p_value) pairs reshapes directly into two matrices.
    stats = np.asarray(results, dtype=float).reshape(n_rows, n_cols, 2)
    correlations = stats[..., 0].copy()
    p_values = stats[..., 1].copy()

    # Cut-off is an empirical percentile of |correlation| over all pairs.
    abs_correlations = np.abs(correlations)
    if np.isfinite(abs_correlations).any():
        threshold = np.nanpercentile(abs_correlations, threshold_percentile)
        # Comparisons against NaN are False, so undefined pairs drop out here.
        with np.errstate(invalid="ignore"):
            significant_mask = abs_correlations > threshold
    else:
        # No pairs at all, or every correlation undefined: nothing is significant.
        significant_mask = np.zeros(abs_correlations.shape, dtype=bool)

    significant_features = np.where(significant_mask)

    return correlations, p_values, significant_features

# Run the pairwise analysis: columns of combined_data (array1) against columns of
# combined_latent (array2), using the default percentile threshold.
correlations, p_values, significant_features = correlation_analysis_parallel(combined_data, combined_latent)

In [None]:
# Sanity check: report the dimensions of the `correlations` array.
print(correlations.shape)

In [None]:
# Wrap significant_features in a DataFrame whose columns are labelled
# Feature1 ... FeatureN, where N is the length of its first element.
significant_features_df = pd.DataFrame(
    significant_features,
    columns=[f"Feature{position}" for position in range(1, len(significant_features[0]) + 1)],
)
print(significant_features_df.shape)

In [None]:
# Inspect the raw significant_features value (rendered as this cell's output).
significant_features

In [None]:
import os
# Directory to store CSV files (relative to the kernel's working directory)
# NOTE(review): this cell repeats the export logic of an earlier cell in this
# notebook; consider factoring it into a function and calling it once.
directory = "significant_feature_groups"
# exist_ok=True keeps the cell safely re-runnable and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(directory, exist_ok=True)

# Pair up the two index arrays held in significant_features.
# Note the swap: Feature1 comes from significant_features[1] (the SECOND array)
# and Feature2 from significant_features[0] (the FIRST array).
# NOTE(review): this rebinds the notebook-level name `df`, which the data-loading
# cell at the top of the notebook also assigns; re-run that cell before reusing `df`
# as the loaded dataset.
df = pd.DataFrame({
    'Feature1': significant_features[1],
    'Feature2': significant_features[0],
})

# Group by unique values of Feature1
groups = df.groupby('Feature1')

# Write one CSV per Feature1 value, holding only the Feature2 values of that group
for group_name, group_data in groups:
    filename = f"group_{group_name}.csv"
    filepath = os.path.join(directory, filename)
    group_data[['Feature2']].to_csv(filepath, index=False)