

---

# Intro - Autoencoder

**Plan** - produce proof of concept autoencoder architecture.

**Purpose**: dimensionality reduction for input data.

**Hypothesis**: with automatic HP tuning an autoencoder can be used to reduce dimensionality of input data, whilst retaining adequate information to accurately reproduce input data.

**Methodology**: Test on multiple datasets - first the Iris dataset, then Pima Indians, then finally credit card fraud dataset. Evaluate and assess model architecture and visualize latent space using PCA/UMAP etc.


# First dataset - iris dataset
---
## Data sourcing and processing


In [8]:

#import packages :

import os
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

from google.colab import drive

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  # Check if drive is mounted by looking for the mount point in the file system.
  # This is a more robust approach than relying on potentially internal variables.
  import os
  if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

#basics
import os
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install tqdm
from tqdm import tqdm

#table one
!pip install tableone
from tableone import TableOne

#torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

#sklearn
!pip install scikit-optimize
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.datasets import load_breast_cancer
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from sklearn.preprocessing import MinMaxScaler



from imblearn.over_sampling import RandomOverSampler



In [3]:
#import data and visualize

# Load the breast cancer dataset
data = load_breast_cancer()

# Convert to Pandas DataFrame
bc_df = pd.DataFrame(data.data, columns=data.feature_names)

# Add the target column (malignant or benign)
bc_df['target'] = data.target

bc_columns = bc_df.columns.tolist()

print(f"The dataset lenghth is {str(len(bc_df))}")
print(f"The number of columns is {str(len(bc_columns))}")
print(f"The column names are {str(bc_columns)} \n")

table1 = TableOne(bc_df, columns=bc_columns, groupby= 'target', pval=True)
print(table1)



The dataset lenghth is 569
The number of columns is 31
The column names are ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension', 'target'] 

                                    Grouped by target                                                      
                                              Missing        Overall               0              1 P-Value
n                                                                569             212            357        
mean radius,



---

# Autoencoder (Breast cancer dataset)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import numpy as np
from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical

# **Set device for GPU acceleration**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **Error warning if no GPU is detected**
if device.type != 'cuda':
    print("WARNING: GPU is not available. The model will run on the CPU, which might be slower.")
else:
    print("Cuda setup successful")

# **Load and preprocess the Breast Cancer dataset**
breast_cancer = load_breast_cancer()
data = breast_cancer.data

# **Define custom dataset class**
class BreastCancerDataset(Dataset):
    def __init__(self, data):
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# **Scale the data between 0 and 1**
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

# **Create PyTorch dataset and split into train/validation/test sets**
dataset = BreastCancerDataset(data)
train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])

# **Define flexible autoencoder class**
class FlexibleAutoencoder(nn.Module):
    def __init__(self, input_dim, encoder_layers, latent_dim, decoder_layers, activations, dropout_prob, use_batchnorm):
        """
        A flexible autoencoder implementation that supports customizable encoder and decoder architectures,
        latent space dimensionality, activation functions, dropout, and batch normalization.
        """
        super(FlexibleAutoencoder, self).__init__()

        # **Build the encoder**
        encoder = []
        in_dim = input_dim
        for i in range(len(encoder_layers)):
            encoder.append(nn.Linear(in_dim, encoder_layers[i]))
            if use_batchnorm:
                encoder.append(nn.BatchNorm1d(encoder_layers[i]))
            encoder.append(getattr(nn, activations[i])())
            if dropout_prob > 0:
                encoder.append(nn.Dropout(dropout_prob))
            in_dim = encoder_layers[i]

        # Add the latent space layer
        encoder.append(nn.Linear(in_dim, latent_dim))
        if use_batchnorm:
            encoder.append(nn.BatchNorm1d(latent_dim))
        encoder.append(getattr(nn, activations[len(encoder_layers)])())
        self.encoder = nn.Sequential(*encoder)

        # **Build the decoder**
        decoder = []
        in_dim = latent_dim
        for i in range(len(decoder_layers)):
            out_dim = input_dim if i == len(decoder_layers) - 1 else decoder_layers[i]
            decoder.append(nn.Linear(in_dim, out_dim))
            if use_batchnorm and i != len(decoder_layers) - 1:
                decoder.append(nn.BatchNorm1d(out_dim))
            if i != len(decoder_layers) - 1:
                decoder.append(getattr(nn, activations[len(encoder_layers) + 1 + i])())
            if dropout_prob > 0 and i != len(decoder_layers) - 1:
                decoder.append(nn.Dropout(dropout_prob))
            in_dim = out_dim
        self.decoder = nn.Sequential(*decoder)

    def forward(self, x):
        """Defines the forward pass of the autoencoder."""
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

# **Training and validation function**
def train_validate_autoencoder(encoder_layers, latent_dim, decoder_layers, activations, lr, batch_size, dropout_prob, use_batchnorm):
    """
    Train and validate the autoencoder using the given hyperparameters.
    """
    batch_size = int(batch_size)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, drop_last=True)

    # **Initialize the model**
    model = FlexibleAutoencoder(
        input_dim=data.shape[1],
        encoder_layers=encoder_layers,
        latent_dim=latent_dim,
        decoder_layers=decoder_layers,
        activations=activations,
        dropout_prob=dropout_prob,
        use_batchnorm=use_batchnorm
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    num_epochs = 100
    train_losses, val_losses = [], []
    for epoch in tqdm(range(num_epochs)):
        model.train()
        train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            outputs = model(batch)
            loss = criterion(outputs, batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                outputs = model(batch)
                loss = criterion(outputs, batch)
                val_loss += loss.item()
        train_losses.append(train_loss / len(train_loader))
        val_losses.append(val_loss / len(val_loader))

    return val_losses[-1]  # Return the final validation loss

# **Define the search space for hyperparameter optimization**
space = [
    Integer(1, 5, name='num_encoder_layers'),
    Integer(1, 5, name='num_decoder_layers'),
    Integer(4, 128, name='num_neurons'),
    Integer(2, 64, name='latent_dim'),  # Latent space dimensionality
    Categorical(['ReLU', 'Sigmoid', 'Tanh'], name='activation_fn'),
    Real(1e-4, 1e-2, prior='log-uniform', name='lr'),
    Integer(2, 64, name='batch_size'),
    Categorical([True, False], name='use_dropout'),
    Real(0.1, 0.5, name='dropout_prob'),
    Categorical([True, False], name='use_batchnorm')
]

# **Objective function for Bayesian optimization**
def objective(params):
    num_encoder_layers, num_decoder_layers, num_neurons, latent_dim, activation_fn, lr, batch_size, use_dropout, dropout_prob, use_batchnorm = params

    # Construct activation function list
    activations = [activation_fn] * (num_encoder_layers + num_decoder_layers + 1)

    # Define encoder and decoder layers
    encoder_layers = [num_neurons] * num_encoder_layers
    decoder_layers = [num_neurons] * num_decoder_layers

    # Train and validate the autoencoder
    final_val_loss = train_validate_autoencoder(encoder_layers, latent_dim, decoder_layers, activations, lr, batch_size, dropout_prob, use_batchnorm)
    return final_val_loss

# **Run Bayesian optimization**
result = gp_minimize(objective, space, n_calls=30, random_state=42)

# **Output the best parameters**
print("Best hyperparameters:")
print(f"Number of encoder layers: {result.x[0]}")
print(f"Number of decoder layers: {result.x[1]}")
print(f"Number of neurons per layer: {result.x[2]}")
print(f"Latent space dimensionality: {result.x[3]}")
print(f"Activation function: {result.x[4]}")
print(f"Learning rate: {result.x[5]}")
print(f"Batch size: {result.x[6]}")
print(f"Use dropout: {result.x[7]}")
print(f"Dropout probability: {result.x[8]}")
print(f"Use batch normalization: {result.x[9]}")
print(f"Final validation loss: {result.fun}")

Cuda setup successful


100%|██████████| 100/100 [00:04<00:00, 23.56it/s]
100%|██████████| 100/100 [00:02<00:00, 36.22it/s]
100%|██████████| 100/100 [00:03<00:00, 30.35it/s]
100%|██████████| 100/100 [00:36<00:00,  2.74it/s]
100%|██████████| 100/100 [00:05<00:00, 16.70it/s]
100%|██████████| 100/100 [00:03<00:00, 33.32it/s]
100%|██████████| 100/100 [00:07<00:00, 12.63it/s]
100%|██████████| 100/100 [00:02<00:00, 44.41it/s]
100%|██████████| 100/100 [00:04<00:00, 23.27it/s]
100%|██████████| 100/100 [00:04<00:00, 24.25it/s]
100%|██████████| 100/100 [00:03<00:00, 30.45it/s]
100%|██████████| 100/100 [00:04<00:00, 23.29it/s]
100%|██████████| 100/100 [00:02<00:00, 38.87it/s]
100%|██████████| 100/100 [00:02<00:00, 44.25it/s]
100%|██████████| 100/100 [00:03<00:00, 25.91it/s]
100%|██████████| 100/100 [01:36<00:00,  1.03it/s]
100%|██████████| 100/100 [00:01<00:00, 51.12it/s]
100%|██████████| 100/100 [00:04<00:00, 21.92it/s]
100%|██████████| 100/100 [00:08<00:00, 11.69it/s]
100%|██████████| 100/100 [00:13<00:00,  7.35it/s]


Best hyperparameters:
Number of encoder layers: 1
Number of decoder layers: 1
Number of neurons per layer: 85
Latent space dimensionality: 63
Activation function: ReLU
Learning rate: 0.0004582550799359247
Batch size: 14
Use dropout: False
Dropout probability: 0.23045489658159404
Use batch normalization: False
Final validation loss: 0.0013084623569739051




---

# Auto push to github




In [None]:
import datetime
import os

def commit_to_github(commit_msg):
  """
  Funct to autopush to github
  """

  # Navigate to the repository directory
  %cd /content/drive/MyDrive/Colab_Notebooks/Deep_Learning_Practice

  !git add .

  with open('/content/drive/MyDrive/IAM/PAT.txt', 'r') as file:
        github_pat = file.read().strip()
  os.environ['GITHUB_PAT'] = github_pat

  !git remote add origin "https://github.com/archiegoodman2/machine_learning_practice"

  # Replace with your actual username and email
  USERNAME="archiegoodman2"
  EMAIL="archiegoodman2011@gmail.com"

  # Set global username and email configuration
  !git config --global user.name "$USERNAME"
  !git config --global user.email "$EMAIL"

  now = datetime.datetime.now()
  current_datetime = now.strftime("%Y-%m-%d %H:%M")

  # Set remote URL using the PAT from environment variable
  !git remote set-url origin https://{os.environ['GITHUB_PAT']}@github.com/archiegoodman2/machine_learning_practice.git

  # Replace with your desired commit message
  COMMIT_MESSAGE = str(current_datetime) + " " + str(commit_msg)

  # Commit the changes
  !git commit -m "$COMMIT_MESSAGE"

  # Push to origin (force push if necessary)
  !git push -f origin master

  return 1

commit_to_github("added latent space dimension as a HP to be tuned")


/content/drive/MyDrive/Colab_Notebooks/Deep_Learning_Practice
