In [61]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall, AUC
import matplotlib.pyplot as plt
from time import time

import plotly.express as px

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the iris measurements, normalise the headers (lower-case, "." -> "_")
# and store the label column as a pandas categorical.
iris = (
    pd.read_csv("/content/drive/MyDrive/iris.csv")
    .rename(columns=lambda col: col.lower().replace(".", "_"))
    .assign(species=lambda frame: frame.species.astype('category'))
)

iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Print the iris frame's column dtypes, non-null counts and memory usage.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal_length  150 non-null    float64 
 1   sepal_width   150 non-null    float64 
 2   petal_length  150 non-null    float64 
 3   petal_width   150 non-null    float64 
 4   species       150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [4]:
# Load the root-cause-analysis table, normalise the headers (lower-case,
# "." -> "_") and store the label column as a pandas categorical.
rca = (
    pd.read_csv('/content/drive/MyDrive/root_cause_analysis.csv')
    .rename(columns=lambda col: col.lower().replace(".", "_"))
    .assign(root_cause=lambda frame: frame['root_cause'].astype('category'))
)

rca.head()

Unnamed: 0,id,cpu_load,memory_leak_load,delay,error_1000,error_1001,error_1002,error_1003,root_cause
0,1,0,0,0,0,1,0,1,MEMORY_LEAK
1,2,0,0,0,0,0,0,1,MEMORY_LEAK
2,3,0,1,1,0,0,1,1,MEMORY_LEAK
3,4,0,1,0,1,1,0,1,MEMORY_LEAK
4,5,1,1,0,1,0,1,0,NETWORK_DELAY


In [5]:
# Print the rca frame's column dtypes, non-null counts and memory usage.
rca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   id                1000 non-null   int64   
 1   cpu_load          1000 non-null   int64   
 2   memory_leak_load  1000 non-null   int64   
 3   delay             1000 non-null   int64   
 4   error_1000        1000 non-null   int64   
 5   error_1001        1000 non-null   int64   
 6   error_1002        1000 non-null   int64   
 7   error_1003        1000 non-null   int64   
 8   root_cause        1000 non-null   category
dtypes: category(1), int64(8)
memory usage: 63.7 KB


In [6]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
from dataclasses import dataclass, field
from typing import Optional, List, Any

@dataclass
class Hidden:
  """Hidden-layer settings: one width per layer plus a shared activation."""
  # one entry per hidden layer; None is swapped for [32, 64] after init
  nodes: Optional[List] = None
  # activation name applied to the hidden layers
  activation: Optional[str] = 'relu'

  def __post_init__(self):
    # the default list is created here, per instance, so it is never shared
    self.nodes = [32, 64] if self.nodes is None else self.nodes

@dataclass
class Input:
  """Input-layer settings."""
  # number of input nodes; left as None by default
  nodes: Optional[int] = None

@dataclass
class Output:
  """Output-layer settings: number of units and their activation."""
  # width of the output layer
  nodes: Optional[int] = 3
  # activation name applied to the output layer
  activation: Optional[str] = 'softmax'

@dataclass
class Initial:
  """Names of the initializers used for layer weights and biases."""
  # initializer name for the kernel (weights)
  weights: Optional[str] = 'random_normal'
  # initializer name for the bias vector
  bias: Optional[str] = 'zeros'

@dataclass
class ModelConfig:
  """Hyper-parameters, architecture and data-split settings for one experiment.

  The nested settings objects and the preprocessor are created through
  `default_factory`, so every config owns its own instances. A plain
  `= Hidden()` default is evaluated once and shared by every config, and
  Python 3.11+ rejects such unhashable dataclass defaults with a ValueError.

  Splits: give all of `train_split`, `validation_split` and `test_split`, or
  none of them (0.6 / 0.2 / 0.2 is then used). Anything in between raises an
  AssertionError.
  """
  input: Optional[Input] = field(default_factory=Input)
  output: Optional[Output] = field(default_factory=Output)
  hidden: Optional[Hidden] = field(default_factory=Hidden)
  initial: Optional[Initial] = field(default_factory=Initial)
  normalization: Optional[str] = 'none'
  optimizer: Optional[str] = 'rmsprop'
  learning_rate: Optional[float] = 0.001
  regularizer: Optional[Any] = None
  dropout: Optional[float] = 0.0
  epochs: Optional[int] = 10
  batch_size: Optional[int] = 16
  train_split: Optional[float] = None
  validation_split: Optional[float] = None
  test_split: Optional[float] = None
  preprocessor: Optional[Any] = field(default_factory=StandardScaler)
  verbose: Optional[bool] = 1
  loss_function: Optional[str] = 'categorical_crossentropy'
  metrics: Optional[List] = None

  def __post_init__(self):
    """Fill in the default metrics and splits and validate the split triple."""
    if self.metrics is None:
      self.metrics = ['accuracy']

    given = (self.train_split, self.validation_split, self.test_split)
    n_given = sum(split is not None for split in given)

    if n_given == 0:
      self.train_split = 0.6
      self.validation_split = 0.2
      self.test_split = 0.2
    else:
      errormsg = f"`train_split`: {self.train_split}\n"
      errormsg += f"`validation_split`: {self.validation_split}\n"
      errormsg += f"`test_split`: {self.test_split}\n"
      errormsg += "either specify all of (train, val, test) "
      errormsg += "or specify none of (train, val, test)"

      assert n_given == 3, errormsg

    # recorded after the defaults are applied so it reflects the splits in use
    self.tvt = (self.train_split, self.validation_split, self.test_split)

In [65]:
@dataclass
class Model:
  """Prepare data, then build, compile and fit a Keras `Sequential` classifier.

  The network and training settings come from `config` (a `ModelConfig`).
  Usual call order: `preprocess_data()` -> `build_model()` ->
  `compile_model()` -> `fit_model()`. The object returned by Keras' `fit`
  is kept on `self.fitted`.
  """
  # default_factory gives each Model its own config / placeholder arrays;
  # a plain default would be one object shared by every instance (and numpy
  # arrays / dataclass instances are rejected as defaults on Python 3.11+)
  config : Optional[Any] = field(default_factory=ModelConfig)
  X : Optional[Any] = field(default_factory=lambda: np.array([[1,2,3],[4,5,6],[7,8,9]]))
  y : Optional[Any] = field(default_factory=lambda: np.array([1,1,1]))

  name : Optional[str] = None
  model : Optional[Any] = None
  optimizer : Optional[Any] = None
  compiled : Optional[bool] = False
  accuracy_measures : Optional[List] = None
  fit : Optional[Any] = None

  def __post_init__(self):
    """Create the empty Sequential model and initialise derived attributes."""
    # populated by preprocess_data()
    self.X_train, self.y_train = None, None
    self.X_val, self.y_val = None, None
    self.X_test, self.y_test = None, None
    self.scaler = None
    # populated by fit_model()
    self.fitted = None

    if self.model is None:
      self.model = tf.keras.models.Sequential(name=self.name)

    if self.accuracy_measures is None:
      self.accuracy_measures = ['accuracy']

  def preprocess_data(self, random_state=None):
    """One-hot encode `y`, split into train/val/test and scale the features.

    `config.test_split` is carved off first; `config.validation_split` is
    then taken from the remainder, and what is left is the training set.
    Both splits are stratified on the encoded target.

    Args:
      random_state: forwarded to both `train_test_split` calls; pass an int
        for reproducible splits. The default (None) keeps them random.
    """
    import copy

    # dummy vars for levels of the target
    self.y = pd.get_dummies(self.y)

    # split test set
    x1, self.X_test, y1, self.y_test = train_test_split(
        self.X,
        self.y,
        stratify=self.y,
        test_size=self.config.test_split,
        random_state=random_state)

    # split train/val from the remaining data; the validation share is
    # rescaled because the test rows are already gone
    self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
        x1,
        y1,
        stratify=y1,
        test_size=self.config.validation_split / (1 - self.config.test_split),
        random_state=random_state)

    # use the configured preprocessor (a private copy, so models never share
    # fitted state); fall back to StandardScaler when none is configured
    preprocessor = self.config.preprocessor
    self.scaler = StandardScaler() if preprocessor is None else copy.deepcopy(preprocessor)

    # fit on the training rows only, then apply to train, val, test
    self.scaler.fit(self.X_train)
    self.X_train = self.scaler.transform(self.X_train)
    self.X_val = self.scaler.transform(self.X_val)
    self.X_test = self.scaler.transform(self.X_test)

  def _add_layer(self, layer : int):
    """Append hidden Dense layer number `layer` (index into `config.hidden.nodes`)."""
    layer_kwargs = dict(
        name=f"Dense-Layer-{layer}",
        kernel_initializer=self.config.initial.weights,
        bias_initializer=self.config.initial.bias,
        kernel_regularizer=self.config.regularizer,
        activation=self.config.hidden.activation)

    # the feature count describes the network input, so only the first
    # layer declares it; later layers take their input size from the
    # layer before them
    if layer == 0:
      layer_kwargs['input_shape'] = (self.X.shape[1],)

    self.model.add(
        keras.layers.Dense(self.config.hidden.nodes[layer], **layer_kwargs))

  def _add_batch_normalization(self):
    """Append a BatchNormalization layer when `config.normalization == 'batch'`."""
    if(self.config.normalization == 'batch'):
          self.model.add(keras.layers.BatchNormalization())

  def _add_dropout(self):
    """Append a Dropout layer when `config.dropout` is greater than zero."""
    if(self.config.dropout > 0.0):
          self.model.add(keras.layers.Dropout(self.config.dropout))

  def _get_optimizer(self):
    """Instantiate the optimizer named by `config.optimizer`.

    Raises:
      KeyError: if the name is not one of 'sgd', 'rmsprop', 'adam', 'adagrad'.
    """
    # map names to classes so only the requested optimizer is constructed
    opts = {
        'sgd': keras.optimizers.SGD,
        'rmsprop': keras.optimizers.RMSprop,
        'adam': keras.optimizers.Adam,
        'adagrad': keras.optimizers.Adagrad,
    }
    if self.config.optimizer not in opts:
      raise KeyError(
          f"unknown optimizer {self.config.optimizer!r}; "
          f"expected one of {sorted(opts)}")
    self.optimizer = opts[self.config.optimizer](
        learning_rate=self.config.learning_rate)

  def build_model(self):
    """Add the hidden layers and the output layer, then pick the optimizer.

    Batch normalization / dropout (when enabled in the config) are inserted
    before every hidden layer except the first.
    """
    for layer in range(len(self.config.hidden.nodes)):
      # nothing to normalize/dropout ahead of the first hidden layer
      if layer > 0:
        self._add_batch_normalization()
        self._add_dropout()
      self._add_layer(layer)

    # output layer
    self.model.add(keras.layers.Dense(
        self.config.output.nodes,
        name='Output-Layer',
        activation=self.config.output.activation
        )
    )

    # optimizer (a caller-supplied one is kept)
    if self.optimizer is None:
      self._get_optimizer()

  def compile_model(self):
    """Compile the network, building it first if that has not happened yet."""
    # also build when an optimizer was supplied up front but no layer has
    # been added, so an empty network is never compiled
    if self.optimizer is None or not self.model.layers:
      self.build_model()
    self.model.compile(
        loss=self.config.loss_function,
        optimizer=self.optimizer,
        metrics=self.config.metrics
    )
    self.compiled = True

  def summary(self):
    """Print the Keras summary, or a hint when the model is not compiled."""
    if self.compiled:
      self.model.summary()
    else:
      print('No model has been compiled. Run .compile_model() to compile.')

  def fit_model(self, verbose=None):
    """Fit on the training split, validating on the validation split.

    Args:
      verbose: Keras verbosity; None (default) uses `config.verbose`.

    Raises:
      AssertionError: if the model is not compiled or the data was never
        prepared with `preprocess_data()`.
    """
    assert self.compiled, "Model has not been compiled yet."
    assert self.X_train is not None, \
        "No training data. Run .preprocess_data() first."
    self.fitted = self.model.fit(
        self.X_train,
        self.y_train,
        batch_size=self.config.batch_size,
        epochs=self.config.epochs,
        verbose=verbose if verbose is not None else self.config.verbose,
        validation_data=(self.X_val, self.y_val))
    
# function to plot training/validation loss and 
# training/validation accuracy
# def plot_loss_acc(models, title=None):
#   fig, ((tl, ta), (vl, va)) = plt.subplots(2, 2, figsize=(20, 10))

#   for k, v in zip(models.keys(), models.values()):
    
#     df = pd.DataFrame(v.fitted.history)
#     tl.plot(df['loss'], label=k, alpha=0.7)
#     tl.set_title('Training Loss')
#     tl.legend()

#     ta.plot(df['accuracy'], label=k, alpha=0.7)
#     ta.set_title('Training Accuracy')
#     ta.legend()

#     vl.plot(df['val_loss'], label=k, alpha=0.7)
#     vl.set_title('Validation Loss')
#     vl.legend()

#     va.plot(df['val_accuracy'], label=k, alpha=0.7)
#     va.set_title('Validation Accuracy')
#     va.legend()

#   if title is not None:
#     plt.title(title)

#   plt.show()


In [86]:
import plotly.subplots as sp
import plotly.graph_objs as go
import plotly.express as px

def plot_loss_acc(models, title="Loss and Accuracy", width=1400, height=800):
    """Draw a 2x2 plotly grid of training/validation loss and accuracy.

    Args:
        models: mapping of display name -> object whose `.fitted.history`
            holds 'loss', 'accuracy', 'val_loss' and 'val_accuracy'.
        title: figure title, centred; pass None for no title.
        width, height: figure size in pixels.
    """
    # (history column, hover label, hover number format, subplot row, subplot col)
    panels = (
        ('loss', 'Loss', ':.3f', 1, 1),
        ('accuracy', 'Accuracy', ':.1%', 1, 2),
        ('val_loss', 'Validation Loss', ':.3f', 2, 1),
        ('val_accuracy', 'Validation Accuracy', ':.1%', 2, 2),
    )

    fig = sp.make_subplots(
        rows=2,
        cols=2,
        subplot_titles=("Training Loss", "Training Accuracy", "Validation Loss", "Validation Accuracy"),
    )

    # one colour per model, cycling through the qualitative palette
    palette = px.colors.qualitative.Plotly

    for position, (label, run) in enumerate(models.items()):
        history = pd.DataFrame(run.fitted.history)
        colour = palette[position % len(palette)]

        for column, hover_name, number_format, row, col in panels:
            trace = go.Scatter(
                y=history[column],
                name=label,
                legendgroup=label,
                # a single legend entry per model: only the first panel shows it
                showlegend=(row, col) == (1, 1),
                mode='lines',
                line=dict(color=colour, width=1.5),
                hovertemplate=f'Model: {label}<br>{hover_name}: %{{y{number_format}}}',
            )
            fig.add_trace(trace, row=row, col=col)

    layout = dict(width=width, height=height)
    if title is not None:
        layout.update(title_text=title, title_x=0.5)
    fig.update_layout(**layout)

    fig.show()


In [9]:
X_iris = iris.drop(columns='species')

# Per-feature mean and standard deviation, shown as the cell output.
# (The previous loop stored the mean in `m`, immediately overwrote it with
# the std, and never used or displayed either value.)
X_iris.agg(['mean', 'std'])

In [84]:
batch_models = {}

# test batch sizes from 4 -> 28 in increments of 4
for batch_size in range(4, 32, 4):
  name = f"Batch-Size-{batch_size}"

  # every model gets an explicit config of its own, so the settings of one
  # run can never leak into another
  batch_models[name] = Model(
      config=ModelConfig(
          batch_size=batch_size,
          epochs=25,
          hidden=Hidden(nodes=[32 for _ in range(3)])
      ),
      X=iris.drop(columns='species'),
      y=iris.species,
      name=name)

  # process data/compile model
  batch_models[name].preprocess_data()
  batch_models[name].build_model()
  batch_models[name].compile_model()

  # model summary
  batch_models[name].summary()

  # fit silently; the history is stored on `.fitted` for plot_loss_acc
  batch_models[name].fit_model(verbose=0)

Model: "Batch-Size-4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Dense-Layer-0 (Dense)       (None, 32)                160       
                                                                 
 Dense-Layer-1 (Dense)       (None, 32)                1056      
                                                                 
 Dense-Layer-2 (Dense)       (None, 32)                1056      
                                                                 
 Output-Layer (Dense)        (None, 3)                 99        
                                                                 
Total params: 2,371
Trainable params: 2,371
Non-trainable params: 0
_________________________________________________________________
Model: "Batch-Size-8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Dense-Layer-0 (Dense)       (

In [87]:
# Compare the loss/accuracy curves of every batch-size run.
plot_loss_acc(batch_models, title='Batch Size Model Testing')

## Here, accuracy stabilizes around epoch 30 and is best for both the training and validation sets with batch size 12
  - with any batch size smaller than 12, the loss starts to get pretty unstable

In [12]:
# Default configuration with the batch size and epoch count chosen above.
config1 = ModelConfig(batch_size=12, epochs=30)

In [13]:
# Display the config's repr to check the overridden fields.
config1

ModelConfig(input=Input(nodes=None), output=Output(nodes=3, activation='softmax'), hidden=Hidden(nodes=[32, 64], activation='relu'), initial=Initial(weights='random_normal', bias='zeros'), normalization='none', optimizer='rmsprop', learning_rate=0.001, regularizer=None, dropout=0.0, epochs=30, batch_size=12, train_split=0.6, validation_split=0.2, test_split=0.2, preprocessor=StandardScaler(), verbose=1, loss_function='categorical_crossentropy', metrics=['accuracy'])

### experiment with the number of hidden layers
- test 1-5 hidden layers
  - usually expect 2 to be enough
- each has 32 nodes

In [14]:
# Same settings passed straight to the constructor; the result is only
# displayed, not stored.
ModelConfig(
    batch_size=12,
    epochs=30
)

ModelConfig(input=Input(nodes=None), output=Output(nodes=3, activation='softmax'), hidden=Hidden(nodes=[32, 64], activation='relu'), initial=Initial(weights='random_normal', bias='zeros'), normalization='none', optimizer='rmsprop', learning_rate=0.001, regularizer=None, dropout=0.0, epochs=30, batch_size=12, train_split=0.6, validation_split=0.2, test_split=0.2, preprocessor=StandardScaler(), verbose=1, loss_function='categorical_crossentropy', metrics=['accuracy'])

In [43]:
hidden_layer_models = {}
hlm_times = {}

# sweep the network depth: 1 to 5 hidden layers of 32 nodes each
for n_layers in range(1, 6):
  start_time = time()
  name = f"Hidden-Layers-{n_layers}"

  # fresh model for this depth
  model = Model(
      config=ModelConfig(
          verbose=0,
          batch_size=12,
          epochs=30,
          hidden=Hidden(nodes=[32] * n_layers),
      ),
      X=iris.drop(columns='species'),
      y=iris.species,
      name=name,
  )
  hidden_layer_models[name] = model

  print(f"batch_size: {model.config.batch_size}")
  print(f"epochs: {model.config.epochs}")
  print(f"hidden layers: {model.config.hidden.nodes}")

  # prepare data, build, compile, summarise, fit
  model.preprocess_data()
  model.build_model()
  model.compile_model()
  model.summary()
  model.fit_model()

  # wall-clock time of the whole iteration (construction through fit)
  end_time = time()
  hlm_times[name] = {
      'start': start_time,
      'end': end_time,
      'time': end_time - start_time,
  }

  print("========== Time to fit model =================")
  print(f"========= {hlm_times[name]['time']} =================")
  print("===============================================")
  print('\n\n')

batch_size: 12
epochs: 30
hidden layers: [32]
Model: "Hidden-Layers-1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Dense-Layer-0 (Dense)       (None, 32)                160       
                                                                 
 Output-Layer (Dense)        (None, 3)                 99        
                                                                 
Total params: 259
Trainable params: 259
Non-trainable params: 0
_________________________________________________________________



batch_size: 12
epochs: 30
hidden layers: [32, 32]
Model: "Hidden-Layers-2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Dense-Layer-0 (Dense)       (None, 32)                160       
                                                                 
 Dense-Layer-1 (Dense)       (None, 32)                1056  

In [88]:
# Compare the loss/accuracy curves of every hidden-layer-count run.
plot_loss_acc(hidden_layer_models, "Hidden Layer Testing")

$\implies$ picking 3 hidden layers

## Number of nodes in a layer

In [98]:
def build_model(name,
                X=iris.drop(columns='species'),
                y=iris.species,
                verbose=0,
                batch_size=12,
                epochs=30,
                n_hidden_layers=3,
                n_nodes = None):
  """Create an unbuilt `Model` with `n_hidden_layers` equally wide hidden layers.

  Args:
    name: name given to the model.
    X, y: features and target; default to the iris data as it was when this
      function was defined.
    verbose, batch_size, epochs: forwarded to `ModelConfig`.
    n_hidden_layers: number of hidden layers.
    n_nodes: width of every hidden layer; None means 32.

  Returns:
    The new `Model`; no data is processed and no layer is added yet.
  """
  width = 32 if n_nodes is None else n_nodes

  layout = Hidden(nodes=[width] * n_hidden_layers)
  settings = ModelConfig(
      verbose=verbose,
      batch_size=batch_size,
      epochs=epochs,
      hidden=layout,
  )
  return Model(config=settings, X=X, y=y, name=name)

def run_model(name,
              X=iris.drop(columns='species'),
              y=iris.species,
              verbose=0,
              batch_size=12,
              epochs=30,
              n_hidden_layers=3,
              n_nodes = None,
              test_times=True):
  """Build, compile and fit a model in one call.

  Takes the same arguments as `build_model`, plus `test_times`.

  Returns:
    `(model, seconds)`, where `seconds` is the wall-clock duration of the fit
    call alone, or None when `test_times` is falsy.
  """
  model = build_model(
      name=name,
      X=X,
      y=y,
      verbose=verbose,
      batch_size=batch_size,
      epochs=epochs,
      n_hidden_layers=n_hidden_layers,
      n_nodes=n_nodes,
  )
  model.preprocess_data()
  model.build_model()
  model.compile_model()

  # untimed path
  if not test_times:
    model.fit_model()
    return model, None

  # timed path: only the fit itself is measured
  started = time()
  model.fit_model()
  finished = time()
  return model, finished - started


In [95]:
nodes_models = {}
nodes_times = {}

# sweep the hidden-layer width from 64 to 152 in steps of 8 (3 hidden layers)
for n_nodes in range(64, 160, 8):
  start_time = time()
  name = f"Nodes-{n_nodes}"

  # fresh model for this width
  model = Model(
      config=ModelConfig(
          verbose=0,
          batch_size=12,
          epochs=30,
          hidden=Hidden(nodes=[n_nodes] * 3),
      ),
      X=iris.drop(columns='species'),
      y=iris.species,
      name=name,
  )
  nodes_models[name] = model

  # prepare data, build, compile, fit
  model.preprocess_data()
  model.build_model()
  model.compile_model()
  model.fit_model()

  # wall-clock time of the whole iteration (construction through fit)
  end_time = time()
  nodes_times[name] = {
      'start': start_time,
      'end': end_time,
      'time': end_time - start_time,
  }

# one row per model: its name ('index') and elapsed seconds ('time')
timedf = pd.DataFrame(nodes_times).T['time'].reset_index()

batch_size: 12
epochs: 30
hidden layers: [64, 64, 64]
Model: "Nodes-64"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Dense-Layer-0 (Dense)       (None, 64)                320       
                                                                 
 Dense-Layer-1 (Dense)       (None, 64)                4160      
                                                                 
 Dense-Layer-2 (Dense)       (None, 64)                4160      
                                                                 
 Output-Layer (Dense)        (None, 3)                 195       
                                                                 
Total params: 8,835
Trainable params: 8,835
Non-trainable params: 0
_________________________________________________________________



batch_size: 12
epochs: 30
hidden layers: [72, 72, 72]
Model: "Nodes-72"
___________________________________________________________

In [96]:
# Show the elapsed seconds recorded for each model.
timedf

Unnamed: 0,index,time
0,Nodes-64,4.216374
1,Nodes-72,3.219681
2,Nodes-80,3.191387
3,Nodes-88,3.21415
4,Nodes-96,4.228308
5,Nodes-104,5.760912
6,Nodes-112,5.861814
7,Nodes-120,3.404229
8,Nodes-128,3.216999
9,Nodes-136,3.291655


In [97]:
# Compare the loss/accuracy curves of every layer-width run.
plot_loss_acc(nodes_models, "Number of Nodes Testing")

## first run:
- testing multiples of 8 from 8-80
  - 72 and 80 look best, so retry with bigger numbers

In [99]:
# Repeat the width sweep with only 15 epochs.

nodes_models = {}
nodes_times = {}

# sweep the hidden-layer width from 80 to 160 in steps of 8 (3 hidden layers)
for n_nodes in range(80, 168, 8):
  start_time = time()
  name = f"Nodes-{n_nodes}"

  print(f"Fitting {name}")

  # fresh model for this width
  model = Model(
      config=ModelConfig(
          verbose=0,
          batch_size=12,
          epochs=15,
          hidden=Hidden(nodes=[n_nodes] * 3),
      ),
      X=iris.drop(columns='species'),
      y=iris.species,
      name=name,
  )
  nodes_models[name] = model

  # prepare data, build, compile, fit
  model.preprocess_data()
  model.build_model()
  model.compile_model()
  model.fit_model()

  # wall-clock time of the whole iteration (construction through fit)
  end_time = time()
  nodes_times[name] = {
      'start': start_time,
      'end': end_time,
      'time': end_time - start_time,
  }

# one row per model: its name ('index') and elapsed seconds ('time')
timedf = pd.DataFrame(nodes_times).T['time'].reset_index()

In [100]:
# Compare the loss/accuracy curves of the 15-epoch layer-width runs.
plot_loss_acc(nodes_models, "Number of Nodes Testing")