In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import importlib
import sys
import os
import gower
from kneed import KneeLocator
import torch
from torch import nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torchsummary
from torch.utils.data import Dataset, DataLoader

In [None]:
# Import the project's custom scripts.
# The ../scripts directory is appended to sys.path; the relative path is
# resolved against the current working directory.
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
# Each module is reloaded right after being imported, so edits made to the
# script files are picked up when this cell is re-run.
import utils
importlib.reload(utils)
import models
importlib.reload(models)
import preprocessing
importlib.reload(preprocessing)

# Use seaborn's 'Set2' colour palette for the plots in this notebook.
sns.set_palette('Set2')

In [None]:
# Column-index groups come from the shared utils module; alias them locally.
binary_indices, continuous_indices = utils.binary_indices, utils.continuous_indices

# Load the dataset with no scaler applied and keep a float32 tensor copy of
# the whole table for full-dataset forward passes.
df = preprocessing.load_dataset(scaler=None)
data_tensor = torch.tensor(df.values, dtype=torch.float32)

In [None]:
# Per-column standard deviation of the continuous features; its reciprocal was
# used as a candidate weighting while tuning the loss function.
deviations = (
    df.iloc[:, continuous_indices]
    .std()
    .to_numpy()
)
print(f"weigthing is : {1/(deviations)}")

In [None]:
# Dataset wrapper so a DataLoader can serve DataFrame rows as tensor batches.
class DataFrameDataset(Dataset):
    """Map-style dataset exposing the rows of a DataFrame as float32 tensors.

    The frame is converted to a single tensor once, at construction time;
    indexing afterwards only touches that tensor.
    """

    def __init__(self, dataframe):
        """Store ``dataframe``'s values as a float32 tensor in ``self.data``."""
        self.data = torch.tensor(dataframe.to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the row(s) selected by ``idx`` as a tensor."""
        return self.data[idx]
def create_dataloader(df, batch_size=1, shuffle=True):
    """Wrap ``df`` in a DataFrameDataset and return a DataLoader over its rows.

    Args:
        df: DataFrame whose rows become the samples.
        batch_size: number of rows per batch.
        shuffle: whether the loader reshuffles the rows.

    Returns:
        A ``DataLoader`` yielding float32 tensor batches.
    """
    return DataLoader(DataFrameDataset(df), batch_size=batch_size, shuffle=shuffle)


# Loader used for training: shuffled mini-batches of 32 rows.
dataloader = create_dataloader(df, shuffle=True, batch_size=32)

In [None]:
# Seed torch's global random number generator for reproducibility. This is
# done before the model is constructed.
torch.manual_seed(99)
# Build the autoencoder. The original note describes it as having a
# 3-dimensional latent space; the architecture itself lives in
# models.Autoencoder_Encoder — TODO confirm the latent size there.
model = models.Autoencoder_Encoder(binary_indices = binary_indices)
# Print a layer-by-layer summary; the trailing ';' suppresses the cell's
# return-value echo.
torchsummary.summary(model);

In [None]:
# training loop
epochs = 50
optimizer = optim.Adam(model.parameters(), lr=0.01)
# StepLR multiplies the learning rate by gamma every step_size epochs,
# i.e. the rate is halved every 10 epochs.
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

# NOTE: a loss variant that weights the loss by a factor of
# 1/standard deviation (continuous_stds=deviations) was tried as well;
# the loss constructed below does not receive those weights.
criterion = models.Autoencoder_Loss_Prob(binary_indices=binary_indices,
                                         continuous_indices=continuous_indices)

# Training mode only has to be enabled once; nothing inside the loop leaves it.
model.train()
for epoch in range(epochs):
    # Accumulate the loss over all mini-batches so the logged value describes
    # the whole epoch rather than only the last (possibly partial) batch.
    running_loss = 0.0
    n_batches = 0
    for data in dataloader:
        optimizer.zero_grad()
        x_reconstructed = model(data)
        loss = criterion(data, x_reconstructed)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    scheduler.step()
    if epoch % 10 == 0:
        # An empty loader yields NaN instead of raising on an undefined loss.
        mean_loss = running_loss / n_batches if n_batches else float('nan')
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {mean_loss:.4f}, LR: {scheduler.get_last_lr()[0]}')

print("Training complete")


In [None]:
model.eval()
# Run the forward pass ONCE for the whole dataset. Calling model(data_tensor)
# inside the comprehension would repeat a full-dataset forward pass for every
# row. Gradients are not needed for scoring, so autograd is switched off.
with torch.no_grad():
    reconstructed_all = model(data_tensor)
    # Per-observation reconstruction error: the criterion evaluated on a
    # single-row batch (original row vs. its reconstruction).
    distances = [criterion(data_tensor[i, :].unsqueeze(0),
                           reconstructed_all[i, :].unsqueeze(0)).item()
                 for i in range(len(df))]
sorted_distances = np.sort(distances)

# Use KneeLocator to find the knee point (3.8 if normalized)
knee = KneeLocator(range(len(sorted_distances)),
                   sorted_distances,
                   curve='convex',
                   direction='increasing',
                   # S = 3.5) for the normalized distances
                   S = 7.5)
if knee.knee is None:
    # Fail with a clear message instead of a cryptic error further down.
    raise ValueError("KneeLocator found no knee point in the sorted distances; "
                     "try a different sensitivity S")

# Plot the sorted distances and the kneepoint
plt.plot(sorted_distances, marker='o', linestyle='-', color='b', label='Sorted Distances')
plt.axvline(x=knee.knee, color='r', linestyle='--', label='Knee Point')
plt.axhline(y=sorted_distances[knee.knee], color='r', linestyle='--')

# Adding labels and title
plt.xlabel('Index')
plt.ylabel('Distance')
plt.title('Sorted Distances with Knee Point')
plt.legend()
# Show the plot
plt.show()
# An observation counts as an outlier when its error exceeds the knee value.
print(f"outliers: {int(np.sum(np.asarray(distances) > knee.knee_y))}")

In [None]:
# Row indices of the 3 observations with the largest reconstruction error.
# argsort is ascending, so the last entry is the worst one.
np.argsort(distances)[-3:]

In [None]:
# visualize the reconstruction of the data
model.eval()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    test_output = model(data_tensor)

# Row to inspect. Other rows inspected previously:
# 1, 1626, 1077 (only 0 in column 14), 6256, 4932.
index = 2750
original_data = data_tensor.numpy()[index]
# Copy the row out of the model output: .numpy() shares memory with the
# tensor, so thresholding in place would silently overwrite test_output.
reconstructed_data_raw = test_output.detach().numpy()[index].copy()
reconstructed_data = reconstructed_data_raw.copy()
# Round the binary columns to hard 0/1 values at a 0.5 threshold; the raw
# outputs stay available in reconstructed_data_raw.
reconstructed_data[binary_indices] = (reconstructed_data[binary_indices] > 0.5)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
axes[0].plot(original_data, 'b')
axes[0].set_title('Original Data')
axes[1].plot(reconstructed_data, 'r')
axes[1].set_title('Reconstructed Data')
plt.show()

In [None]:
# plot it as heatmap: original row on top, reconstructed row below
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 3),
                         gridspec_kw={'height_ratios': [1, 1]})

# Reshape the 1D arrays to 2D (1 x number of elements)
original_data_2d = original_data.reshape(1, -1)
reconstructed_data_2d = reconstructed_data.reshape(1, -1)

# Tick positions and labels. imshow centres pixel k at x = k (default extent
# runs from -0.5 to n - 0.5), so ticks go on the integers; a +0.5 offset would
# put every label on the boundary between two cells.
x_positions = np.arange(original_data_2d.shape[1])
x_labels = np.arange(original_data_2d.shape[1])

# Draw both heatmaps with identical settings so they share one colour scale.
heatmaps = []
for panel_ax, panel_data, panel_title in (
        (axes[0], original_data_2d, 'Original data'),
        (axes[1], reconstructed_data_2d, 'Reconstructed data')):
    heatmaps.append(panel_ax.imshow(panel_data, cmap='viridis', aspect='auto', vmin=0, vmax=1))
    panel_ax.set_title(panel_title, fontsize=16)
    panel_ax.set_yticks([])  # Remove y-axis labels
    panel_ax.set_xticks(x_positions)
    panel_ax.set_xticklabels(x_labels)
cax1, cax2 = heatmaps

# Adjust the layout; the padding keeps titles and tick labels from overlapping
plt.tight_layout(pad=2.5)

# Add a shared colorbar for both heatmaps
cbar = fig.colorbar(cax1, ax=axes,
                    orientation='vertical',
                    fraction=0.1,  # share of the axes space given to the colorbar
                    pad=0.02,
                    aspect=10,
                    shrink=1)
cbar.ax.tick_params(labelsize=12)  # Increase font size of colorbar labels
cbar.set_ticks([0, 1])  # Set colorbar to only show ticks at 0 and 1
cbar.set_ticklabels(['0', '1'])  # Explicitly set the tick labels to '0' and '1'

# Display the plot
plt.show()


In [None]:
# Flag each observation: -1 when its reconstruction error lies above the knee
# value, 0 otherwise.
knee_threshold = knee.knee_y
outlier_index = [-1 if dist > knee_threshold else 0 for dist in distances]
# The flags are -1/0, so the negated sum is the number of outliers.
print(f"percentage of outliers is: {-np.sum(outlier_index)/len(outlier_index) * 100: .2f}%")

In [None]:
# Highlight the outliers in the t-SNE plot. The Gower distance matrix of df is
# passed to the plotting helper as dist_matrix.
utils.plot_TSNE(df, outlier_index, dist_matrix= gower.gower_matrix(df))

In [None]:
# Study how often False (0) occurs in the binary columns, per observation,
# for outliers versus non-outliers.
is_outlier = np.array(outlier_index) == -1

# (value - 1) is -1 for every 0 entry and 0 for every 1 entry, so the absolute
# row sum is the number of Falses in that observation.
falses_outliers = (df[is_outlier].iloc[:, binary_indices] - 1).sum(axis=1).abs()
falses_non_outliers = (df[~is_outlier].iloc[:, binary_indices] - 1).sum(axis=1).abs()

pd.concat(
    [falses_outliers.describe(), falses_non_outliers.describe()],
    axis=1,
    keys=['falses/obs Outliers', 'Falses/obs non-out'],
)

In [None]:
# compare a random block of reconstructed results with original data
import random
model.eval()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    test_output = model(data_tensor)

all_originals = data_tensor.numpy()
# Work on a copy: .numpy() shares memory with the tensor, so thresholding in
# place would overwrite test_output itself.
all_reconstructions = test_output.detach().numpy().copy()
# Round the binary columns to hard 0/1 values at a 0.5 threshold.
all_reconstructions[:, binary_indices] = all_reconstructions[:, binary_indices] > 0.5

# Dedicated, seeded generator so the same rows are drawn on every run; the
# sample size is capped so small datasets do not raise.
row_sampler = random.Random(99)
sampled_rows = row_sampler.sample(range(0, len(df)), min(100, len(df)))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
# Loop-local names are used on purpose so the original_data /
# reconstructed_data variables of the single-row cells are not overwritten.
for row in sampled_rows:
    axes[0].plot(all_originals[row], 'b')
    axes[1].plot(all_reconstructions[row], 'r')
# Titles only need to be set once, not once per plotted row.
axes[0].set_title('Original Data')
axes[1].set_title('Reconstructed Data')
plt.show()

# Study losses


In [None]:
# Toy example: one binary column (index 0) and one continuous column (index 1).
binary_temp_ind = [0]
continuous_temp_ind = [1]

# x plays the role of the original data (true values), y of the reconstruction.
x = torch.tensor([1.0, 1.0]).unsqueeze(0)
y = torch.tensor([0.5, 0.5]).unsqueeze(0)

bce = nn.BCELoss()
# NOTE: despite its name, `l1` is a mean-squared-error loss; the name is kept
# because later cells refer to it.
l1 = nn.MSELoss()

# BCELoss takes (input, target): the prediction comes first and the ground
# truth second. Passing the true value 1.0 as the prediction would evaluate
# log(1 - 1.0), which PyTorch clamps to -100, giving an artificial loss of 50
# instead of -log(0.5).
binary_loss = bce(y[:, binary_temp_ind], x[:, binary_temp_ind])
# MSE is symmetric in its arguments; the same (prediction, target) order is
# used for consistency.
continuous_loss = l1(y[:, continuous_temp_ind], x[:, continuous_temp_ind])

# Unweighted average of the two parts.
combined_loss = (binary_loss + continuous_loss) / 2

print(f"binary_loss: {binary_loss.item()}, "
      f"continuous_loss: {continuous_loss.item()}, combined_loss: {combined_loss.item()}")



In [None]:
# Sweep a prediction p over (0, 1) against a target of 0 and record, per p,
# the tuple (BCE, 2*MSE, 3*MSE, 5*MSE, 2.5*MSE).
losses = []
zero_target = torch.tensor([0.]).unsqueeze(0)
for p in np.linspace(0.000001, 0.9999, 100):
    prediction = p * torch.tensor([1.0]).unsqueeze(0)
    bce_value = bce(prediction, zero_target)
    # `l1` is the MSE criterion defined in the previous cell.
    mse_value = l1(prediction, zero_target)
    scaled_mse = tuple((weight * mse_value).item() for weight in (2, 3, 5, 2.5))
    losses.append((bce_value.item(),) + scaled_mse)

In [None]:
import matplotlib.pyplot as plt
# Plot against the grid the losses were actually evaluated on
# (np.linspace(0.000001, 0.9999, 100)); a fresh linspace(0, 1) would place
# every point at a slightly wrong x.
x_grid = np.linspace(0.000001, 0.9999, 100)
set2_palette = sns.color_palette("Set2")

# Columns 1-4 of each losses tuple are the scaled continuous (MSE) curves;
# they share one colour and only the first carries the legend entry.
plt.plot(x_grid, [row[1] for row in losses], label='continuous', color=set2_palette[0])
for column in (2, 3, 4):
    plt.plot(x_grid, [row[column] for row in losses], color=set2_palette[0])

# Column 0 is the binary cross-entropy curve.
plt.plot(x_grid, [row[0] for row in losses], label='binary', color=set2_palette[1])
plt.legend()
plt.ylabel('Loss')
plt.xlabel('x')
plt.ylim(0, 5)
# Save before show(), while the figure is still current.
plt.savefig("weighted_loss.png")
plt.show()


In [None]:
# Run the full dataset through the model; the first three output columns are
# plotted as 3D coordinates in the next cell.
# NOTE(review): the same call is used above to obtain reconstructions —
# confirm what models.Autoencoder_Encoder actually returns here.
my3d_coord = model(data_tensor)

In [None]:
# 3D scatter of the first three columns of my3d_coord, coloured by outlier flag
from mpl_toolkits.mplot3d import Axes3D  # not referenced directly; presumably kept for the '3d' projection on older matplotlib
coords = my3d_coord.detach().numpy()
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(coords[:, 0], coords[:, 1], coords[:, 2], c=outlier_index)