# Data science experiment management with the Weights & Biases platform  
### In this tutorial we will show you how to effectively log experiments with the help of WandB.  

## 1. Dashboards

In [None]:
# Notebook shell escape: install TensorFlow, which the later cells import as `tf`.
!pip install tensorflow

In [None]:
# Notebook shell escape: install the wandb client library (pip's -q flag reduces output).
!pip install wandb -q #install library
import wandb

In [None]:
# Log in to W&B from the notebook via the wandb command-line tool.
# NOTE(review): presumably this prompts for an API key - if the prompt does not
# work inside the notebook, run the same command in a terminal instead.
!wandb login #or try to open terminal and run it there

In [None]:
# Identifiers passed to wandb.init below as project, group and run name.
project_name, group_name, experiment_name = 'first_steps', 'cnn', '2_conv'

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import Callback
from wandb.keras import WandbCallback
import numpy as np

### First parameter config

In [None]:
# Start the first tracked run and record its hyperparameters with it.
wandb.init(
    name=experiment_name,
    group=group_name,
    project=project_name,
    config=dict(
        conv_1=16,
        activation_1="relu",
        kernel_size=(3, 3),
        pool_size=(2, 2),
        dropout=0.7,
        conv_2=32,
        activation_out="softmax",
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metric="accuracy",
        epoch=6,
        batch_size=32,
    ),
)
# The model-building and training cells read the hyperparameters through this alias.
config = wandb.config

In [None]:
# Download MNIST through Keras.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert the images to float32, divide by 255 and append a trailing axis.
x_train = np.expand_dims(x_train.astype("float32") / 255, -1)
x_test = np.expand_dims(x_test.astype("float32") / 255, -1)

# Keep only every fifth training sample (images and labels stay aligned).
x_train = x_train[::5]
y_train = y_train[::5]

In [None]:
# Label strings '0'..'9', handed to WandbCallback as `labels`.
class_names = [str(digit) for digit in range(10)]

In [None]:
def cnn_mnist(config, num_classes = 10, input_shape = (28, 28, 1)):
    """Build and compile a small Keras CNN with two convolution stages.

    Args:
        config: Object exposing the hyperparameters as attributes: conv_1,
            conv_2, kernel_size, activation_1, pool_size, dropout,
            activation_out, loss, optimizer and metric.
        num_classes: Number of units in the final Dense layer.
        input_shape: Shape passed to ``tf.keras.Input``.

    Returns:
        The compiled ``tf.keras`` model.
    """
    layers = tf.keras.layers

    inputs = tf.keras.Input(shape=input_shape)
    features = inputs
    # Two Conv2D -> MaxPooling2D stages; both share kernel_size, activation_1
    # and pool_size and differ only in the number of filters.
    for filters in (config.conv_1, config.conv_2):
        features = layers.Conv2D(
            filters,
            kernel_size=config.kernel_size,
            activation=config.activation_1,
        )(features)
        features = layers.MaxPooling2D(pool_size=config.pool_size)(features)

    features = layers.Flatten()(features)
    features = layers.Dropout(config.dropout)(features)
    outputs = layers.Dense(num_classes, activation=config.activation_out)(features)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss=config.loss,
        optimizer=config.optimizer,
        metrics=[config.metric],
    )
    return model

In [None]:
# cnn_mnist has a required `config` parameter; calling it with no arguments
# raises TypeError. Pass the active run's config so the model is built from
# the hyperparameters recorded by wandb.init.
our_model = cnn_mnist(config)

In [None]:
# Train with the run's epoch count and batch size, validating on the test
# split; the W&B Keras callback is configured for image data with the digit labels.
our_model.fit(
    x_train,
    y_train,
    epochs=config.epoch,
    batch_size=config.batch_size,
    validation_data=(x_test, y_test),
    callbacks=[
        wandb.keras.WandbCallback(data_type="image", labels=class_names),
    ],
)

# Finish the current run before another wandb.init is issued.
wandb.finish()

### Second parameter config

In [None]:
# Second experiment: same project and group, logged under a new run name.
project_name = 'first_steps'
group_name = 'cnn'
experiment_name = '2_conv_changed_channels'

# NOTE(review): the run name says the channels changed, but conv_1 / conv_2
# below are identical to the first run's values - adjust them so the two runs
# actually differ when compared.
wandb.init(
    project=project_name,
    group=group_name,
    name=experiment_name,
    config={
        "conv_1": 16,
        "activation_1": "relu",
        "kernel_size": (3, 3),
        "pool_size": (2, 2),
        "dropout": 0.7,
        "conv_2": 32,
        "activation_out": "softmax",
        "optimizer": "adam",
        "loss": "sparse_categorical_crossentropy",
        "metric": "accuracy",
        "epoch": 6,
        "batch_size": 32,
    })
config = wandb.config

# cnn_mnist has a required `config` parameter; calling it with no arguments
# raises TypeError, so pass the config of the run that was just started.
our_model = cnn_mnist(config)

In [None]:
# Train the second model with the same fit arguments as the first run.
our_model.fit(
    x_train,
    y_train,
    epochs=config.epoch,
    batch_size=config.batch_size,
    validation_data=(x_test, y_test),
    callbacks=[
        wandb.keras.WandbCallback(data_type="image", labels=class_names),
    ],
)

# Finish the second run.
wandb.finish()

## 2. Sweeps

In [None]:
# Sweep definition: the search strategy ('random'; 'grid' is the alternative
# noted by the author), the metric to optimize, and the candidate values for
# each hyperparameter being searched.
sweep_config = dict(
    method='random',
    metric=dict(name='accuracy', goal='maximize'),
    parameters={
        hyperparameter: {'values': candidates}
        for hyperparameter, candidates in (
            ('epoch', [5, 10]),
            ('dropout', [0.3, 0.4, 0.5]),
            ('conv_1', [16, 32, 64]),
            ('conv_2', [16, 32, 64]),
            ('optimizer', ['adam', 'nadam', 'sgd', 'rmsprop']),
            ('activation_1', ['relu', 'elu', 'selu', 'sigmoid']),
            ('kernel_size', [(3, 3), (5, 5), (7, 7)]),
        )
    },
)

In [None]:
# Placeholder: replace with your own W&B user (entity) name before running.
user_name = 'your_wandb_login'
# Register the sweep in the same project the earlier cells log to; reuse
# `project_name` instead of repeating the literal so the cells cannot drift apart.
sweep_id = wandb.sweep(sweep_config, entity=user_name, project=project_name)

In [None]:
def train():
    """Run one training trial; this is the function handed to the sweep agent.

    Starts a wandb run seeded with default hyperparameters, builds the CNN
    from the run's config and fits it with the W&B Keras callback attached.
    """
    # Defaults for the hyperparameters the sweep searches over, plus the
    # fixed settings the model builder also reads.
    fallback_hyperparameters = dict(
        conv_1=32,
        activation_1="relu",
        kernel_size=(3, 3),
        pool_size=(2, 2),
        dropout=0.1,
        conv_2=64,
        activation_out="softmax",
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metric="accuracy",
        epoch=6,
        batch_size=32,
    )

    # Open a new run for this trial.
    wandb.init(config=fallback_hyperparameters)

    # Hyperparameters as seen by the run.
    hyperparameters = wandb.config

    network = cnn_mnist(config=hyperparameters)
    network.fit(
        x_train,
        y_train,
        epochs=hyperparameters.epoch,
        batch_size=hyperparameters.batch_size,
        validation_data=(x_test, y_test),
        callbacks=[wandb.keras.WandbCallback()],
    )

In [None]:
# The sweep uses the 'random' search method, which keeps producing new trials
# until it is stopped; cap the number of trials so this cell terminates and
# the rest of the notebook can run. Raise `count` for a wider search.
wandb.agent(sweep_id, train, count=10)

## 3. Artifacts

In [None]:
from collections import namedtuple
# Lightweight (x, y) pair: inputs in `x`, labels in `y`.
Dataset = namedtuple("Dataset", "x y")

def load_data_split(train_size=50_000):
    """Load MNIST and split the Keras training set into train and validation.

    Args:
        train_size: Number of leading samples kept for training; the
            remaining training samples become the validation split.

    Returns:
        A list ``[training, validation, test]`` of ``Dataset`` tuples.
    """
    (train_x, train_y), (test_x, test_y) = tf.keras.datasets.mnist.load_data()

    return [
        Dataset(train_x[:train_size], train_y[:train_size]),
        Dataset(train_x[train_size:], train_y[train_size:]),
        Dataset(test_x, test_y),
    ]

In [None]:
def load_and_log():
    """Load the MNIST splits and log them to W&B as the "mnist-raw" artifact.

    Opens a run with job type "load-data", writes each split into the
    artifact as ``<split>.npz`` (arrays ``x`` and ``y``) and logs the
    artifact on that run. The artifact metadata records the data source and
    the number of samples in each split.
    """
    with wandb.init(project=project_name, job_type="load-data") as run:
        datasets = load_data_split()
        training, validation, test = datasets
        names = ["training", "validation", "test"]

        # Artifact that will hold the three raw splits.
        raw_data = wandb.Artifact(
            "mnist-raw", type="dataset",
            description="Raw MNIST dataset, splitted",
            metadata={"source": "keras.datasets.mnist",
                      "train_data": len(training.x),
                      "valid_data": len(validation.x),
                      # Key was misspelled "test_daata"; corrected so it
                      # matches the other "<split>_data" keys.
                      "test_data": len(test.x)})

        for name, data in zip(names, datasets):
            # Write each split as its own .npz file inside the artifact.
            with raw_data.new_file(name + ".npz", mode="wb") as file:
                np.savez(file, x=data.x, y=data.y)

        # Log the artifact on this run.
        run.log_artifact(raw_data)

# Execute the load step now so the "mnist-raw" artifact is logged.
load_and_log()

In [None]:
def preprocess_dataset(dataset, normalize=True, expand_dims=True, to_categorical=True, num_classes=10):
    """Apply the selected preprocessing steps to one dataset split.

    Args:
        dataset: Object with ``x`` (inputs) and ``y`` (labels) attributes.
        normalize: If true, cast ``x`` to float32 and divide by 255.
        expand_dims: If true, append a trailing axis to ``x``.
        to_categorical: If true, convert ``y`` with
            ``tf.keras.utils.to_categorical``.
        num_classes: Number of classes passed to ``to_categorical``. This
            was previously read from an undefined name, which raised
            NameError whenever ``to_categorical`` was true.

    Returns:
        A new ``Dataset`` holding the processed ``x`` and ``y``.
    """
    x, y = dataset.x, dataset.y

    if normalize:
        x = x.astype("float32") / 255

    if expand_dims:
        x = np.expand_dims(x, -1)

    if to_categorical:
        y = tf.keras.utils.to_categorical(y, num_classes)

    return Dataset(x, y)

In [None]:
import os
def preprocess_and_log(preprocess_steps):
    """Preprocess the logged raw MNIST splits and log the result as an artifact.

    Args:
        preprocess_steps: Dict of keyword arguments forwarded to
            ``preprocess_dataset``; it is also stored as the metadata of the
            new artifact.
    """
    with wandb.init(project=project_name, job_type="data_preprocessing", name="preprocess_simple") as run:
        # Output artifact; its metadata records which steps were applied.
        processed_data = wandb.Artifact(
            "mnist-preprocessed",
            type="dataset",
            description="Preprocessed MNIST dataset",
            metadata=preprocess_steps,
        )

        # Declare the raw artifact as this run's input, then download it.
        raw_data_artifact = run.use_artifact('mnist-raw:latest')
        raw_dataset = raw_data_artifact.download()

        for split in ("training", "validation", "test"):
            npz_name = split + ".npz"
            arrays = np.load(os.path.join(raw_dataset, npz_name))
            cleaned = preprocess_dataset(
                Dataset(x=arrays["x"], y=arrays["y"]),
                **preprocess_steps,
            )

            # Store the processed split under the same file name.
            with processed_data.new_file(npz_name, mode="wb") as out:
                np.savez(out, x=cleaned.x, y=cleaned.y)

        run.log_artifact(processed_data)


# Preprocessing switches; the keys match preprocess_dataset's keyword arguments.
steps = dict(normalize=True, expand_dims=True, to_categorical=True)

# Run the preprocessing step with the switches defined in `steps`.
preprocess_and_log(steps)