In [None]:
import argparse
import glob
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import simplejson

import seaborn as sns
import tensorflow as tf
# Only show TensorFlow log messages at ERROR level. `tf.logging` is not present in
# the TensorFlow 2.x top-level namespace, so fall back to the `tf.compat.v1` alias
# when it is missing; on 1.x installs the original `tf.logging` is still used.
_tf_logging = getattr(tf, "logging", None) or tf.compat.v1.logging
_tf_logging.set_verbosity(_tf_logging.ERROR)

In [None]:
# Put the current working directory on the import path so the `data` module can
# be imported below (assumes the notebook is started from the directory that
# contains that module -- TODO confirm).
sys.path.append(os.getcwd())
from data import process_image_file

# Read and Clean Ray Tune Results

In [None]:
# All trial output is looked up under "<current working directory>/trials".
working_dir = os.getcwd()
trial_dir = os.path.join(working_dir, "trials")

In [None]:
# Choose which scheduler's trials to parse: leave exactly one assignment
# uncommented. The value is used verbatim in directory and file names below.
# scheduler = "ahb"
# scheduler = "hpb"
scheduler = "pbt"
# scheduler = "None"

In [None]:
# Directory holding this scheduler's trials: "<trial_dir>/<scheduler>_Train_COVID_Net".
scheduler_dir = os.path.join(trial_dir, f"{scheduler}_Train_COVID_Net")

In [None]:
# Per-trial summaries keyed by a running integer index; filled by the next cell.
# Re-run this cell before re-running the parsing loop to avoid stale entries.
stored_data = {}

In [None]:
# Summarise every trial found under `scheduler_dir`.
#
# Each "<scheduler_dir>/<trial>/result.json" is read as line-delimited JSON (one
# record per line). The file is opened and decoded exactly once per trial and is
# closed deterministically by the `with` block; previously every field
# re-opened and re-parsed the whole file without ever closing it.
i = 0
for trial in sorted(glob.glob(os.path.join(scheduler_dir, "*", "result.json"))):
    with open(trial) as result_file:
        # Blank lines (e.g. a trailing newline) carry no record and are skipped.
        records = [simplejson.loads(line) for line in result_file if line.strip()]

    if not records:
        # A trial without any reported result has nothing to summarise; skip it
        # without consuming an index so the keys of `stored_data` stay contiguous.
        continue

    # Full per-record histories.
    losses = [record["Losses"] for record in records]
    accuracies = [record["Accuracies"] for record in records]
    learning_rates = [record["config"]["learning_rate"] for record in records]

    # Values taken from the last record only.
    final_record = records[-1]
    final_config = final_record["config"]
    class_weights = [
        final_config["class_weight_1"],
        final_config["class_weight_2"],
        final_config["class_weight_3"],
    ]
    covid_percent = final_config["covid_percent"]

    # If the learning rate never changed, store it as a single scalar; otherwise
    # keep the full history (it can vary within a trial, e.g. PBT perturbations).
    if len(set(learning_rates)) < 2:
        learning_rates = learning_rates[-1]

    # Total wall time is reported in seconds; store it in minutes.
    time_total_m = final_record["time_total_s"] / 60
    num_iterations = final_record["training_iteration"]

    stored_data[i] = {
        "losses": losses,
        "accuracies": accuracies,
        "learning_rates": learning_rates,
        "class_weights": class_weights,
        "covid_percent": covid_percent,
        "time_total_m": time_total_m,
        "num_iterations": num_iterations,
    }
    i += 1

In [None]:
# Write the per-trial summaries to "<scheduler>_results.json" in the current
# working directory. `allow_nan=True` permits NaN/Infinity values in the output.
# Note the integer trial keys come back as strings when the file is reloaded.
with open(f"{scheduler}_results.json", "w") as f:
    simplejson.dump(stored_data, f, allow_nan=True)

# Best parameters

In [None]:
# Accumulator for the selected trials of every scheduler. The cells below are
# run once per scheduler and each run adds that scheduler's rows; re-running
# this cell resets the accumulator.
out_df = pd.DataFrame()

In [None]:
# Choose which scheduler's "<scheduler>_results.json" to load: leave exactly one
# assignment uncommented.
# scheduler = "ahb"
# scheduler = "hpb"
scheduler = "pbt"
# scheduler = "None"

In [None]:
# Load the per-trial summaries for the selected scheduler and reduce each trial
# to one row of `df`, indexed by the trial key from the JSON file.
with open(f"{scheduler}_results.json", "r") as f:
    stored_data = simplejson.load(f)

# Column order matches the keys written for each trial in the results file.
# Listing them explicitly (instead of reading them from trial "0") keeps this
# cell working when the results file contains no trials.
result_columns = [
    "losses",
    "accuracies",
    "learning_rates",
    "class_weights",
    "covid_percent",
    "time_total_m",
    "num_iterations",
]

# Build all rows first and construct the frame once, rather than growing it
# cell by cell. Scalar columns therefore get inferred numeric dtypes.
rows = []
for trial in stored_data:
    summary = stored_data[trial]
    rows.append(
        {
            # Only the final loss and accuracy values are kept.
            "losses": pd.Series(summary["losses"]).to_list()[-1],
            "accuracies": pd.Series(summary["accuracies"]).to_list()[-1],
            # Distinct learning rates seen during the trial; there can be more
            # than one in the case of PBT perturbations.
            "learning_rates": pd.Series(summary["learning_rates"]).unique(),
            "class_weights": summary["class_weights"],
            "covid_percent": summary["covid_percent"],
            "time_total_m": summary["time_total_m"],
            "num_iterations": summary["num_iterations"],
        }
    )

df = pd.DataFrame(rows, index=list(stored_data), columns=result_columns)

In [None]:
# Keep a trial when either
#   * its final accuracy is above 0.5 and it ran for exactly 24 iterations, or
#   * it has the lowest final loss or the highest final accuracy of all trials.
# `.copy()` makes the result an independent frame, so adding columns to it later
# does not write to a slice of the unfiltered data (SettingWithCopyWarning).
df = df[
    ((df.accuracies > 0.5) & (df.num_iterations == 24))
    | ((df.losses == df.losses.min()) | (df.accuracies == df.accuracies.max()))
].copy()

In [None]:
# Tag every selected trial with the scheduler it came from. `assign` returns a
# new frame, so this never writes into a filtered slice of another frame.
df = df.assign(scheduler=scheduler)

In [None]:
# Display the selected trials for the current scheduler as the cell output.
df

In [None]:
# Add this scheduler's selected trials to the accumulator with a fresh 0..n-1
# index. `pd.concat` replaces `DataFrame.append`, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
out_df = pd.concat([out_df, df], ignore_index=True)

In [None]:
# Write the accumulated selection to "training_joblist.csv" in the current
# working directory (the row index is written as the first, unnamed column).
out_df.to_csv("training_joblist.csv")

# Trial Durations

In [None]:
# Choose which scheduler's trial durations to average: leave exactly one
# assignment uncommented.
# scheduler = "ahb"
# scheduler = "hpb"
# scheduler = "pbt"
scheduler = "None"

In [None]:
# Reload the selected scheduler's summaries and gather each trial's total
# duration in minutes, in file order.
with open(f"{scheduler}_results.json", "r") as f:
    stored_data = simplejson.load(f)

times = []
for trial, summary in stored_data.items():
    times.append(summary["time_total_m"])

In [None]:
# Mean trial duration in minutes for the selected scheduler (cell output).
np.mean(times)

# Prediction Performance

In [None]:
# Empty frame that fixes the leading column order for the combined predictions
# of all schedulers; re-running this cell resets it.
predictions = pd.DataFrame(columns=["pred","time","true", "true_mapped", "scheduler"])

In [None]:
# `trial` and `scheduler` are parallel lists: trial[i] is the file-name prefix of
# the predictions CSV and scheduler[i] the display name used for it in the plots.
trial = ["trial_fifo", "trial_hpb", "trial_ahb", "trial_pbt"]
scheduler = ["FIFO", "HyperBand", "Asynchronous HyperBand", "PBT"]
# Tick labels for the heatmap axes (presumably in the order of the encoded class
# values in the "pred"/"true_mapped" columns -- TODO confirm).
classes = ["Healthy", "Pneumonia", "Covid-19"]

In [None]:
# Load each scheduler's "results/<trial>_predictions.csv", tag its rows with the
# scheduler's display name, and stack everything onto `predictions`.
# The frames are collected first and concatenated once: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
# The per-file row index is kept as-is (no `ignore_index`), as before.
trial_frames = []
for trial_name, scheduler_name in zip(trial, scheduler):
    trial_pred = pd.read_csv(f"results/{trial_name}_predictions.csv")
    trial_pred["scheduler"] = scheduler_name
    trial_frames.append(trial_pred)

predictions = pd.concat([predictions] + trial_frames)

In [None]:
def create_crosstab(predictions, scheduler):
    """Cross-tabulate true vs. predicted labels for one scheduler.

    Args:
        predictions: DataFrame with "scheduler", "true_mapped" and "pred" columns.
        scheduler: Value of the "scheduler" column whose rows are tabulated.

    Returns:
        DataFrame of counts with one row per "true_mapped" value and one column
        per "pred" value among the selected rows.
    """
    selected = predictions[predictions.scheduler == scheduler]
    true_labels = selected["true_mapped"]
    predicted_labels = selected["pred"]
    return pd.crosstab(true_labels, predicted_labels)

In [None]:
# 2x2 grid of confusion-matrix heatmaps, one panel per scheduler.
# Rows are the true labels, columns the predicted labels; every row is scaled to
# percentages of that row's total so all panels share the fixed 0-100 colour range.
# The four panels differ only in scheduler name and panel letter, so they are
# drawn in one loop. `ax.flat` walks the grid row by row: (a) (b) / (c) (d).
heatmap_panels = [
    ("FIFO", "(a)"),
    ("HyperBand", "(b)"),
    ("Asynchronous HyperBand", "(c)"),
    ("PBT", "(d)"),
]

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for panel_ax, (scheduler_name, panel_label) in zip(ax.flat, heatmap_panels):
    row_percentages = create_crosstab(predictions, scheduler_name).apply(
        lambda x: 100 * x / x.sum(), axis=1
    )
    sns.heatmap(
        row_percentages,
        cmap="Blues",
        annot=True,
        annot_kws={"fontsize": 18},
        ax=panel_ax,
        fmt=".3f",
        vmin=0,
        vmax=100,
        cbar=False,
    )
    panel_ax.set_title(scheduler_name, fontsize=20)
    panel_ax.set_xlabel(panel_label, size=20)
    # Clear the axis label that the crosstab's index name would otherwise supply.
    panel_ax.set_ylabel("", size=20)
    panel_ax.set_xticklabels(classes, size=18)
    panel_ax.set_yticklabels(classes, size=18, va="center")

plt.tight_layout()
# plt.show()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/heatmaps_all.png")

# Run Speeds

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# One row per run size 1, 2, 4, ..., 32: "num" is the run size and "runtime" is
# the first value of the `time` column of "training_run_size_<num>.csv".
# The frame is built in a single step instead of being enlarged row by row, so
# both columns get numeric dtypes rather than `object`.
run_sizes = [2 ** exponent for exponent in range(6)]
runtimes = pd.DataFrame(
    {
        "num": run_sizes,
        "runtime": [
            pd.read_csv("training_run_size_" + str(size) + ".csv").time.values[0]
            for size in run_sizes
        ],
    },
    columns=["num", "runtime"],
)

In [None]:
def _to_whole_minutes(duration):
    """Divide `duration` by 60 and truncate the result to an int."""
    return int(duration / 60)


# Runtime expressed in whole minutes (presumably `runtime` is in seconds -- TODO confirm).
runtimes["runtime_m"] = runtimes.runtime.apply(_to_whole_minutes)

In [None]:
# Display the runtime table as the cell output.
runtimes

In [None]:
# Runtime vs. number of GPU nodes, drawn twice: (a) linear x axis, (b) log2 x axis.
# Both panels share every setting except the x scale, so they are drawn in a loop.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
xtick_positions = list(runtimes.num)
for panel_ax, panel_title, use_log2_x in zip(ax, ("(a)", "(b)"), (False, True)):
    panel_ax.plot(runtimes.num, runtimes.runtime_m)
    panel_ax.set_xticks(xtick_positions)
    if use_log2_x:
        # NOTE(review): the scale is set after the explicit ticks, as in the
        # original cell; confirm the tick positions survive the scale change.
        try:
            # `base` is the keyword from Matplotlib 3.3 on; `basex` was
            # deprecated in 3.3 and removed in 3.5.
            panel_ax.set_xscale("log", base=2)
        except (TypeError, ValueError):
            # Older Matplotlib releases only know the axis-specific keyword.
            panel_ax.set_xscale("log", basex=2)
    panel_ax.tick_params(axis='both', labelsize=16)
    panel_ax.set_xlabel("Number of GPU Nodes", size=20)
    panel_ax.set_ylabel("Runtime (in minutes)", size=20)
    panel_ax.set_title(panel_title, size=20)
    panel_ax.grid()
plt.tight_layout()
plt.show()
# plt.savefig("figures/GPU_speedup_both.png")

In [None]:
# Single-panel version of the runtime plot: runtime in minutes against the
# number of GPU nodes on a linear x axis, with one tick per measured run size.
fig, ax = plt.subplots(figsize=(15, 5))
node_counts = runtimes.num
ax.plot(node_counts, runtimes.runtime_m)
ax.set_xticks(list(node_counts))
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("Number of GPU Nodes", size=20)
ax.set_ylabel("Runtime (in minutes)", size=20)
ax.grid()
plt.show()
# plt.tight_layout()
# plt.savefig("figures/GPU_speedup.png")