

---

# Intro - Autoencoder

**Plan** - produce proof of concept autoencoder architecture.

**Purpose**: dimensionality reduction for input data.

**Hypothesis**: with automatic HP tuning an autoencoder can be used to reduce dimensionality of input data, whilst retaining adequate information to accurately reproduce input data.

**Methodology**: Test on multiple datasets - first the Iris dataset, then Pima Indians, then finally credit card fraud dataset. Evaluate and assess model architecture and visualize latent space using PCA/UMAP etc.


# First dataset - iris dataset
---
## Data sourcing and processing


In [1]:

#import packages :

import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

from google.colab import drive

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  # Check if drive is mounted by looking for the mount point in the file system.
  # This is a more robust approach than relying on potentially internal variables.
  import os
  if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

#basics
import os
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install tqdm
from tqdm import tqdm

#table one
!pip install tableone
from tableone import TableOne

#torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

#sklearn
!pip install scikit-optimize
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.datasets import load_breast_cancer
!pip install skopt
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from sklearn.preprocessing import MinMaxScaler



from imblearn.over_sampling import RandomOverSampler

Mounted at /content/drive
Collecting tableone
  Downloading tableone-0.9.1-py3-none-any.whl.metadata (8.5 kB)
Downloading tableone-0.9.1-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tableone
Successfully installed tableone-0.9.1
Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.9.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.9.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.9.0 scikit-optimize-0.10.2
[31mERROR: Could not find a version that satisfies the requirement skop

In [2]:
#import data and visualize

# Load the breast cancer dataset
data = load_breast_cancer()

# Convert to Pandas DataFrame
bc_df = pd.DataFrame(data.data, columns=data.feature_names)

# Add the target column (malignant or benign)
bc_df['target'] = data.target

bc_columns = bc_df.columns.tolist()

print(f"The dataset lenghth is {str(len(bc_df))}")
print(f"The number of columns is {str(len(bc_columns))}")
print(f"The column names are {str(bc_columns)} \n")

table1 = TableOne(bc_df, columns=bc_columns, groupby= 'target', pval=True)
print(table1)



The dataset lenghth is 569
The number of columns is 31
The column names are ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension', 'target'] 

                                    Grouped by target                                                      
                                              Missing        Overall               0              1 P-Value
n                                                                569             212            357        
mean radius,



---

# Autoencoder (Breast cancer dataset)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy as np
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical

# **Set device for GPU acceleration**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != 'cuda':
    print("WARNING: GPU is not available. The model will run on the CPU, which might be slower.")
else:
    print("Cuda setup successful")

# Load and preprocess the Breast Cancer dataset
breast_cancer = load_breast_cancer()
data = breast_cancer.data

# Scale data to the range [0, 1] for better convergence
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

# Convert to PyTorch Dataset
class BreastCancerDataset(Dataset):
    def __init__(self, data):
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

dataset = BreastCancerDataset(data)

#----------------- Split into training, validation, and test sets -----------------#

#70% train data
train_size = int(0.7 * len(dataset))
#20% validation set holdout (aka dev set)
val_size = int(0.2 * len(dataset))
#remaining data is test data
test_size = len(dataset) - train_size - val_size
#randomize split
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

# Flexible Autoencoder architecture
class FlexibleAutoencoder(nn.Module):
    def __init__(self, input_dim, encoder_layers, decoder_layers, activations, dropout_prob, use_batchnorm):
        super(FlexibleAutoencoder, self).__init__()
        # Build the encoder
        encoder = []
        in_dim = input_dim

        #flexible number of encoder layers (independent of decoder layers as that number is usually greater)
        for i in range(len(encoder_layers)):
            encoder.append(nn.Linear(in_dim, encoder_layers[i]))

            #Batchnorm will be a HP
            if use_batchnorm:
                encoder.append(nn.BatchNorm1d(encoder_layers[i]))
            encoder.append(getattr(nn, activations[i])())

            if dropout_prob > 0:  # Apply dropout only if > 0
                encoder.append(nn.Dropout(dropout_prob))

            in_dim = encoder_layers[i]

        self.encoder = nn.Sequential(*encoder)

        # Build the decoder
        decoder = []
        in_dim = encoder_layers[-1]

        #flexible number of decoder layers
        for i in range(len(decoder_layers)):
            out_dim = input_dim if i == len(decoder_layers) - 1 else decoder_layers[i]
            decoder.append(nn.Linear(in_dim, out_dim))

            #batch norm will be a HP
            if use_batchnorm and i != len(decoder_layers) - 1:
                decoder.append(nn.BatchNorm1d(out_dim))
            decoder.append(getattr(nn, activations[len(encoder_layers) + i])())

            #whether or not we use dropout is also a HP
            if dropout_prob > 0 and i != len(decoder_layers) - 1:
                decoder.append(nn.Dropout(dropout_prob))
            in_dim = decoder_layers[i]

        self.decoder = nn.Sequential(*decoder)

    def forward(self, x):
        #define forward pass through encoder to get latent space representation
        latent = self.encoder(x)
        #reconstruct data from latent space representation
        reconstructed = self.decoder(latent)
        return latent, reconstructed

# Custom loss function combining MSE and latent space regularization
def custom_loss_function(reconstructed, original, latent_representation, alpha=0.01):
    #the purpose of this is to balance latent space dimensionality with MSE loss
    reconstruction_loss = nn.MSELoss()(reconstructed, original)
    regularization_loss = alpha * torch.norm(latent_representation, p=1)  # L1 norm for compactness of latent space representation
    return reconstruction_loss + regularization_loss

    # Training and validation function
def train_validate_autoencoder(encoder_layers, decoder_layers, activations, lr, batch_size, dropout_prob, latent_dim):
    # Prepare dataloaders
    batch_size = int(batch_size)
    #load data in batches
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=True)

    # The following line is changed to set drop_last=False in the DataLoader.
    # This makes sure that all data is used in the validation
    # phase, even if the last batch is smaller than the specified batch size.
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, drop_last=False)  # Change drop_last to False

    # Instantiate the model
    model = FlexibleAutoencoder(
        input_dim=data.shape[1],
        encoder_layers=encoder_layers,
        decoder_layers=decoder_layers,
        activations=activations,
        dropout_prob=dropout_prob,
        use_batchnorm=True  # Batch normalization always enabled
    ).to(device)

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    num_epochs = 100
    train_losses, val_losses = [], []

    #define what happens in each epoch
    for epoch in range(num_epochs):
        #fit model
        model.train()
        train_loss = 0

        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            latent_representation, reconstructed = model(batch)
            loss = custom_loss_function(reconstructed, batch, latent_representation)
            #backprop based on the batch we pass in
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_losses.append(train_loss / len(train_loader))

        # Validation phase for this particular combo of HPs
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                latent_representation, reconstructed = model(batch)
                loss = custom_loss_function(reconstructed, batch, latent_representation)
                val_loss += loss.item()

        # Add a check to prevent division by zero if val_loader is empty.
        # This check calculates validation loss per batch to avoid division errors.
        val_losses.append(val_loss / len(val_loader) if len(val_loader) else 0) # Handle potential ZeroDivisionError

    return val_losses[-1], train_losses, val_losses

# Define hyperparameter space
space = [
    Integer(1, 5, name='num_encoder_layers'),
    Integer(1, 5, name='num_decoder_layers'),
    Integer(4, 128, name='num_neurons'),
    Categorical(['ReLU', 'Sigmoid', 'Tanh'], name='activation_fn'),
    Real(1e-4, 1e-2, prior='log-uniform', name='lr'),
    Integer(32, 128, name='batch_size'),
    Real(0.0, 0.5, name='dropout_prob'),  # Includes 0 for no dropout
    Real(2, 32, name='latent_dim')       # Optimize latent space dimension
]

# Objective function for optimization
def objective(params):
    num_encoder_layers, num_decoder_layers, num_neurons, activation_fn, lr, batch_size, dropout_prob, latent_dim = params
    encoder_layers = [num_neurons] * num_encoder_layers
    decoder_layers = [num_neurons] * num_decoder_layers
    activations = [activation_fn] * (num_encoder_layers + num_decoder_layers)

    val_loss, train_losses, val_losses = train_validate_autoencoder(
        encoder_layers, decoder_layers, activations, lr, batch_size, dropout_prob, latent_dim
    )
    return val_loss

# Perform Bayesian optimization
result = gp_minimize(objective, space, n_calls=30, random_state=42)

# Output the best hyperparameters
print("Best hyperparameters:")
print(f"Encoder layers: {result.x[0]}")
print(f"Decoder layers: {result.x[1]}")
print(f"Neurons per layer: {result.x[2]}")
print(f"Activation function: {result.x[3]}")
print(f"Learning rate: {result.x[4]}")
print(f"Batch size: {result.x[5]}")
print(f"Dropout probability: {result.x[6]}")
print(f"Latent dimension: {result.x[7]}")


# Get the best parameters and re-train the model
best_params = result.x
num_encoder_layers, num_decoder_layers, num_neurons, activation_fn, lr, batch_size, dropout_prob, latent_dim = best_params
encoder_layers = [num_neurons] * num_encoder_layers
decoder_layers = [num_neurons] * num_decoder_layers
activations = [activation_fn] * (num_encoder_layers + num_decoder_layers)

# Function to evaluate on the test set
def evaluate_on_test_set(encoder_layers, decoder_layers, activations, lr, batch_size, dropout_prob, latent_dim):
    # Prepare test dataloader
    batch_size = int(batch_size)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, drop_last=False)

    # Instantiate the model
    model = FlexibleAutoencoder(
        input_dim=data.shape[1],
        encoder_layers=encoder_layers,
        decoder_layers=decoder_layers,
        activations=activations,
        dropout_prob=dropout_prob,
        use_batchnorm=True
    ).to(device)

    # Define optimizer (not used for evaluation, but needed for model instantiation)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            latent_representation, reconstructed = model(batch)
            loss = custom_loss_function(reconstructed, batch, latent_representation)
            test_loss += loss.item()

    avg_test_loss = test_loss / len(test_loader)
    return avg_test_loss

# Evaluate on the test set with best parameters
final_test_loss = evaluate_on_test_set(encoder_layers, decoder_layers, activations, lr, batch_size, dropout_prob, latent_dim)

# Print final test loss
print("\nFinal Test Loss:", final_test_loss)


Cuda setup successful
Best hyperparameters:
Encoder layers: 2
Decoder layers: 4
Neurons per layer: 11
Activation function: Tanh
Learning rate: 0.007535384509295551
Batch size: 32
Dropout probability: 0.49610577964560887
Latent dimension: 20.5244452888315

Final Test Loss: 0.6310520172119141




---

# Auto push to github




In [None]:
import datetime
import os

def commit_to_github(commit_msg):
  """
  Funct to autopush to github
  """

  # Navigate to the repository directory
  %cd /content/drive/MyDrive/Colab_Notebooks/Deep_Learning_Practice

  !git add .

  with open('/content/drive/MyDrive/IAM/PAT.txt', 'r') as file:
        github_pat = file.read().strip()
  os.environ['GITHUB_PAT'] = github_pat

  !git remote add origin "https://github.com/archiegoodman2/machine_learning_practice"

  # Replace with your actual username and email
  USERNAME="archiegoodman2"
  EMAIL="archiegoodman2011@gmail.com"

  # Set global username and email configuration
  !git config --global user.name "$USERNAME"
  !git config --global user.email "$EMAIL"

  now = datetime.datetime.now()
  current_datetime = now.strftime("%Y-%m-%d %H:%M")

  # Set remote URL using the PAT from environment variable
  !git remote set-url origin https://{os.environ['GITHUB_PAT']}@github.com/archiegoodman2/machine_learning_practice.git

  # Replace with your desired commit message
  COMMIT_MESSAGE = str(current_datetime) + " " + str(commit_msg)

  # Commit the changes
  !git commit -m "$COMMIT_MESSAGE"

  # Push to origin (force push if necessary)
  !git push -f origin master

  return 1

commit_to_github("added test set validation - to do, improve kfold validation, improve loss functions")


/content/drive/MyDrive/Colab_Notebooks/Deep_Learning_Practice
