In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

import ray
from ray import tune
from ray.util.sgd.tf.tf_trainer import TFTrainer, TFTrainable

# Number of synthetic samples generated for the training split; also used to
# derive steps_per_epoch in create_config and the shuffle buffer size.
NUM_TRAIN_SAMPLES = 1000
# Number of synthetic samples generated for the test split; also used to
# derive the number of evaluation steps in create_config.
NUM_TEST_SAMPLES = 400

def create_config(batch_size):
    """Build the trainer config for the synthetic linear-regression example.

    Args:
        batch_size: Number of samples per batch; also the divisor used to
            turn the sample counts into whole step counts.

    Returns:
        A dict with "batch_size", a "fit_config" holding "steps_per_epoch"
        and an "evaluate_config" holding "steps". The two nested dicts are
        presumably forwarded to Keras fit/evaluate by TFTrainer -- confirm
        against the Ray SGD documentation.
    """
    # todo: batch size needs to scale with # of workers
    train_steps = NUM_TRAIN_SAMPLES // batch_size
    eval_steps = NUM_TEST_SAMPLES // batch_size

    return dict(
        batch_size=batch_size,
        fit_config={"steps_per_epoch": train_steps},
        evaluate_config={"steps": eval_steps},
    )


def linear_dataset(a=2, size=1000):
    """Generate a noiseless one-feature regression dataset with ``y = x / a``.

    Args:
        a: Divisor applied to the features to obtain the targets. The
            default of 2 reproduces the relationship that used to be
            hard-coded.
        size: Number of samples to generate.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays, each of shape ``(size, 1)``.
        ``x`` is drawn from ``np.random.rand``, so results depend on NumPy's
        global random state.
    """
    x = np.random.rand(size)
    # Honour the ``a`` argument; it was previously accepted but ignored
    # because the divisor was the literal 2.
    y = x / a

    # Column vectors: one feature / one target per row.
    return x.reshape((-1, 1)), y.reshape((-1, 1))

def simple_dataset(config):
    """Create the train and test ``tf.data`` pipelines for the linear example.

    Args:
        config: Mapping that must contain "batch_size".

    Returns:
        A tuple ``(train_dataset, test_dataset)``. Both repeat indefinitely
        and are batched with the configured batch size; only the training
        pipeline is shuffled (buffer of NUM_TRAIN_SAMPLES elements).
    """
    batch_size = config["batch_size"]

    # Generate train first, then test, so the draws from NumPy's global
    # random state happen in a fixed order.
    train_xy = linear_dataset(size=NUM_TRAIN_SAMPLES)
    test_xy = linear_dataset(size=NUM_TEST_SAMPLES)

    train_dataset = (
        tf.data.Dataset.from_tensor_slices(train_xy)
        .shuffle(NUM_TRAIN_SAMPLES)
        .repeat()
        .batch(batch_size)
    )
    test_dataset = (
        tf.data.Dataset.from_tensor_slices(test_xy)
        .repeat()
        .batch(batch_size)
    )

    return train_dataset, test_dataset


def simple_model(config):
    """Build and compile the small dense network for the linear example.

    Args:
        config: Trainer config; accepted to satisfy the creator signature
            but not read here.

    Returns:
        A compiled Keras ``Sequential`` model: a 10-unit dense layer taking
        one input feature, followed by a single-unit dense output, trained
        with SGD on mean squared error.
    """
    hidden_layer = Dense(10, input_shape=(1, ))
    output_layer = Dense(1)
    model = Sequential([hidden_layer, output_layer])

    compile_options = {
        "optimizer": "sgd",
        "loss": "mean_squared_error",
        "metrics": ["mean_squared_error"],
    }
    model.compile(**compile_options)

    return model


def train_example(num_replicas=1, batch_size=128, use_gpu=False):
    """Train the linear example with TFTrainer and print a sanity check.

    Validates once for a baseline, calls ``trainer.train()`` twice,
    validates again, and prints whether the validation loss and MSE failed
    to increase.

    Args:
        num_replicas: Passed to TFTrainer as ``num_replicas``.
        batch_size: Batch size used to build the trainer config.
        use_gpu: Passed to TFTrainer as ``use_gpu``.
    """
    trainer = TFTrainer(
        model_creator=simple_model,
        data_creator=simple_dataset,
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=True,
        config=create_config(batch_size))

    # Baseline metrics of the untrained model.
    stats_before = trainer.validate()
    print(stats_before)

    # Two calls to train(), i.e. two epochs.
    for _ in range(2):
        trainer.train()

    # Metrics after training (expected to be lower than the baseline).
    stats_after = trainer.validate()
    print(stats_after)

    # Negative deltas mean the metric improved.
    loss_key = "validation_loss"
    mse_key = "validation_mean_squared_error"
    dloss = stats_after[loss_key] - stats_before[loss_key]
    dmse = stats_after[mse_key] - stats_before[mse_key]
    print(f"dLoss: {dloss}, dMSE: {dmse}")

    got_worse = dloss > 0 or dmse > 0
    print("training sanity check failed. loss increased!"
          if got_worse else "success!")

In [None]:
# Start (or attach to) the local Ray runtime. ignore_reinit_error makes this
# cell safe to re-run in the same kernel instead of raising because Ray is
# already initialised.
ray.init(ignore_reinit_error=True)

In [None]:
# Run the synthetic linear-regression example with its defaults
# (1 replica, batch size 128, no GPU).
train_example()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def diamonds_dataset(config):
    """Load the diamonds CSV and build repeating, batched train/test datasets.

    Reads ``data/diamonds.csv``, drops its first column, one-hot encodes the
    remaining categorical columns, and splits off 40,000 rows for training
    with ``price`` as the regression target.

    Args:
        config: Mapping that must contain "batch_size". An optional
            "random_state" entry is forwarded to ``train_test_split`` to
            make the split reproducible; when absent the split is unseeded,
            as before.

    Returns:
        A tuple ``(train_dataset, test_dataset)`` of ``tf.data`` datasets.
        Both repeat indefinitely and are batched with the configured batch
        size; only the training dataset is shuffled.
    """
    batch_size = config["batch_size"]
    df = pd.read_csv('data/diamonds.csv')
    # Drop the first column without mutating in place.
    # NOTE(review): presumably an exported row index -- confirm against the CSV.
    df = df.drop(columns=df.columns[0])
    df = pd.get_dummies(df, prefix=['cut_', 'color_', 'clarity_'])
    y = df.price.to_numpy()
    # Force a float matrix: newer pandas emits boolean dummy columns, which
    # would otherwise turn the mixed frame into an object array that
    # TensorFlow cannot convert to a tensor.
    X = df.drop(columns=['price']).to_numpy(dtype="float64")
    # Keep in sync with the 40000 used for steps_per_epoch in
    # create_diamonds_config.
    train_size = 40_000
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size,
        random_state=config.get("random_state"))

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    # Shuffle buffer covers the whole training set.
    train_dataset = train_dataset.shuffle(len(X_train)).repeat().batch(
        batch_size)
    test_dataset = test_dataset.repeat().batch(batch_size)

    return train_dataset, test_dataset

In [None]:
def diamonds_simple_model(config):
    """Build and compile the dense regression network for the diamonds data.

    Args:
        config: Trainer config; accepted to satisfy the creator signature
            but not read here.

    Returns:
        A compiled Keras ``Sequential`` model: a 30-unit ReLU layer over 26
        input features followed by a single-unit output, trained with Adam
        on mean squared error.
    """
    # 26 input features -- presumably the column count after one-hot
    # encoding in diamonds_dataset; verify if the preprocessing changes.
    hidden_layer = Dense(30, input_shape=(26, ), activation='relu')
    output_layer = Dense(1)
    model = Sequential([hidden_layer, output_layer])

    compile_options = {
        "optimizer": "adam",
        "loss": "mean_squared_error",
        "metrics": ["mean_squared_error"],
    }
    model.compile(**compile_options)

    return model

In [None]:
def create_diamonds_config(batch_size):
    """Build the trainer config for the diamonds regression example.

    Args:
        batch_size: Number of samples per batch; also the divisor used to
            turn the hard-coded row counts into whole step counts.

    Returns:
        A dict with "batch_size", a "fit_config" holding "steps_per_epoch"
        (40000 // batch_size) and an "evaluate_config" holding "steps"
        (13940 // batch_size).
    """
    # 40000 matches train_size in diamonds_dataset; 13940 is presumably the
    # number of rows left for the test split -- confirm against the CSV.
    train_rows = 40000
    eval_rows = 13940

    config = {"batch_size": batch_size}
    config["fit_config"] = {"steps_per_epoch": train_rows // batch_size}
    config["evaluate_config"] = {"steps": eval_rows // batch_size}
    return config

In [None]:
def train_diamonds(num_replicas=1, batch_size=128, use_gpu=False):
    """Train the diamonds model with TFTrainer and print a sanity check.

    Validates once for a baseline, calls ``trainer.train()`` 32 times,
    validates again, and prints whether the validation loss and MSE failed
    to increase.

    Args:
        num_replicas: Passed to TFTrainer as ``num_replicas``.
        batch_size: Batch size used to build the trainer config.
        use_gpu: Passed to TFTrainer as ``use_gpu``.
    """
    trainer = TFTrainer(
        model_creator=diamonds_simple_model,
        data_creator=diamonds_dataset,
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=False,
        config=create_diamonds_config(batch_size))

    # Baseline metrics of the untrained model.
    stats_before = trainer.validate()
    print(stats_before)

    # One train() call per epoch.
    for _ in range(32):
        trainer.train()

    # Metrics after training (expected to be lower than the baseline).
    stats_after = trainer.validate()
    print(stats_after)

    # Negative deltas mean the metric improved.
    loss_key = "validation_loss"
    mse_key = "validation_mean_squared_error"
    dloss = stats_after[loss_key] - stats_before[loss_key]
    dmse = stats_after[mse_key] - stats_before[mse_key]
    print(f"dLoss: {dloss}, dMSE: {dmse}")

    got_worse = dloss > 0 or dmse > 0
    print("training sanity check failed. loss increased!"
          if got_worse else "success!")

train_diamonds()

In [None]:
import mlflow

# Register a dedicated MLflow experiment for the diamonds runs.
# NOTE(review): this presumably fails if an experiment with this name already
# exists, which would make the cell unsafe to re-run -- confirm against the
# MLflow documentation.
mlflow.create_experiment("Diamonds RaySGD")

In [None]:
from  mlflow.tracking import MlflowClient
# Tracking client used by the cells below to create runs and log
# parameters/metrics.
client = MlflowClient()
experiments = client.list_experiments() # returns a list of mlflow.entities.Experiment
# Bare expression so the notebook displays the experiment list.
experiments

In [None]:
# Smoke test of the tracking client: open a run, log one parameter, close it.
# NOTE(review): experiments[0] is simply the first entry returned by
# list_experiments(); it may not be the "Diamonds RaySGD" experiment -- confirm.
run = client.create_run(experiments[0].experiment_id) # returns mlflow.entities.Run
client.log_param(run.info.run_id, "hello", "world")
client.set_terminated(run.info.run_id)

In [None]:
def train_diamonds_mlflow(num_replicas=1, batch_size=128, use_gpu=False):
    """Train the diamonds model and log per-epoch losses to MLflow.

    Validates once for a baseline, then for each of 32 epochs trains,
    validates, and logs "training_loss" and "validation_loss" to a new
    MLflow run. Finally validates again and prints whether the validation
    loss and MSE failed to increase.

    Relies on the notebook-level ``client`` (MlflowClient) and
    ``experiments`` list defined in earlier cells.

    Args:
        num_replicas: Passed to TFTrainer as ``num_replicas``.
        batch_size: Batch size used to build the trainer config.
        use_gpu: Passed to TFTrainer as ``use_gpu``.
    """
    trainer = TFTrainer(
        model_creator=diamonds_simple_model,
        data_creator=diamonds_dataset,
        num_replicas=num_replicas,
        use_gpu=use_gpu,
        verbose=False,
        config=create_diamonds_config(batch_size))

    # model baseline performance
    start_stats = trainer.validate()
    print(start_stats)

    # NOTE(review): experiments[0] is the first experiment returned by
    # list_experiments(), which may not be "Diamonds RaySGD" -- confirm.
    ml_run = client.create_run(experiments[0].experiment_id)
    run_id = ml_run.info.run_id

    # Assume failure until every epoch has been logged, so an exception
    # during training still closes the run, with an accurate status.
    run_status = "FAILED"
    try:
        for epoch in range(32):
            train_stats = trainer.train()
            val_stats = trainer.validate()
            # Pass the epoch as the step so the metrics form a curve rather
            # than 32 values recorded at the same step.
            client.log_metric(run_id, "validation_loss",
                              val_stats["validation_loss"], step=epoch)
            client.log_metric(run_id, "training_loss",
                              train_stats["train_loss"], step=epoch)
        run_status = "FINISHED"
    finally:
        client.set_terminated(run_id, status=run_status)

    # model performance after training (should improve)
    end_stats = trainer.validate()
    print(end_stats)

    # sanity check that training worked
    dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
    dmse = (end_stats["validation_mean_squared_error"] -
            start_stats["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")

    if dloss > 0 or dmse > 0:
        print("training sanity check failed. loss increased!")
    else:
        print("success!")

train_diamonds_mlflow()