# Imports

In [None]:
import os
import pandas as pd
import numpy as np
import altair as alt

from utils import download_tfrecord
from tensorflow import make_ndarray
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from tensorboard.backend.event_processing.tag_types import TENSORS

# Tensorboard

Launch a TensorBoard Session in VS Code or...

Run the two cells below (you may have to run the second cell twice).

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir=log

or use

```
!tensorboard --logdir log
```
if the above doesn't work

# Datasets

Run the cell below to download the CIFAR-10 datasets (or the CIFAR-100 ones).

In [None]:
# Fetch the two dataset files named below via the project helper in utils.
# NOTE(review): "test10" / "train10" presumably denote the CIFAR-10 test and
# train splits — confirm the accepted names against utils.download_tfrecord.
download_tfrecord("test10", "train10")

# Training (and evaluation)

Use ```main.py``` to train on the imbalanced version of CIFAR-10. These runs produce the results reported in Table 1.

Call it with the following flags/parameters:

* ```dataset```: ```cifar10-lt``` (for CIFAR 10)
* ```mode```: ```baseline```, ```posthoc``` or ```loss```
    * whether to use baseline (vanilla) ERM
    * or posthoc modification of logits
    * or adjusted loss function

The results are written to the directory passed via ```--tb_log_dir``` (see the commands below) and can be inspected in TensorBoard.

## Experiments and Results

Vanilla ERM

In [None]:
!python main.py --dataset=cifar10-lt --mode=baseline --tb_log_dir=log/ERM

Posthoc update

In [None]:
!python main.py --dataset=cifar10-lt --mode=posthoc --tb_log_dir=log/Additive_update

Logit adjusted loss

In [None]:
!python main.py --dataset=cifar10-lt --mode=loss --tb_log_dir=log/LA

### Read the results

* For Figure 1 and Table 1

#### Figure 1

In [2]:
# Get the class probabilities

# Text file holding one class probability per line
file_path = "./data/cifar10-lt_base_probs.txt"

# Parse every line (surrounding whitespace stripped) into a float
with open(file_path, "r") as prob_file:
    class_prob = [float(raw_line.strip()) for raw_line in prob_file]

In [3]:
# N: the number of samples in the training data
N = 10_000

# βs: the possible values for β that are compared against each other
βs = [1, 0.999, 0.99, 0]

In [4]:
# calculate effective class frequency with formula in terms of β and empirical class frequency

# Take the class count from the loaded probabilities instead of hard-coding 10,
# so the cell keeps working when a different probability file is loaded.
num_classes = len(class_prob)

df = pd.DataFrame(
    {
        # class ranks 1..num_classes, repeated once per β
        "class_num": list(range(1, num_classes + 1)) * len(βs),
        # empirical class probabilities, repeated once per β
        "class_prob": class_prob * len(βs),
        # every β repeated num_classes times so it lines up with the two columns above
        "β": [β for β in βs for _ in range(num_classes)],
    }
)

# (1 - β^n) / (1 - β) with n = N * class_prob.
# For β = 1 both numerator and denominator are 0, so those rows come out as NaN.
df["effective_class_frequency"] = (1 - df["β"] ** (N * df["class_prob"])) / (
    1 - df["β"]
)

In [5]:
# normalise effective class frequency to get effective class probability

# Sum of the effective frequencies within each β group, broadcast back to every row
per_β_total = df.groupby("β")["effective_class_frequency"].transform("sum")
normalised = df["effective_class_frequency"] / per_β_total

# when β = 1, use empirical class probability
# (rows that are NaN after the normalisation fall back to class_prob)
df["effective_class_frequency"] = normalised.fillna(df["class_prob"])

In [6]:
# plot effective class probability for each class number for each β

# Encodings are built up front so the chart call itself stays short.
# The x axis carries β with its ticks and labels switched off; β is also the colour.
x_encoding = alt.X("β:N", title=None, axis=alt.Axis(ticks=False, labels=False), sort=βs)
y_encoding = alt.Y("effective_class_frequency", title="Smoothed class probability")
colour_encoding = alt.Color("β:N", sort=βs)
facet_encoding = alt.Column("class_num:O", title="Classes (ranked by frequency)")

bar_chart = alt.Chart(data=df).mark_bar()

# Last expression of the cell, so the chart is rendered as the cell output
bar_chart.encode(
    x=x_encoding,
    y=y_encoding,
    color=colour_encoding,
    column=facet_encoding,
)

#### Table 1

We get the final test accuracy for each of the approaches (baseline ERM, posthoc update and logit adjusted loss)

In [7]:
def load_tensorboard_data(log_dir: str, exp_name: str) -> pd.DataFrame:
    """
    Load every tensor summary found under a Tensorboard log directory into a
    long-format pandas DataFrame.

    Args:
        log_dir (str): the path to the Tensorboard logs
        exp_name (str): name of the experiment; copied into the ``experiment``
            column of every returned row

    Returns:
        pd.DataFrame: one row per logged tensor event with the columns
        ``experiment``, ``metric`` (the tensor tag), ``step`` and ``value``
        (the logged tensor converted to a Python scalar). The four columns are
        present even when no tensor events were found.
    """
    columns = ["experiment", "metric", "step", "value"]

    # A size guidance of 0 for tensors asks the accumulator to keep all events
    event_acc = EventAccumulator(log_dir, size_guidance={TENSORS: 0})
    event_acc.Reload()

    records = [
        {
            "experiment": exp_name,
            "metric": tag,
            "step": event.step,
            # .item() unwraps the array into a plain Python scalar
            "value": make_ndarray(event.tensor_proto).item(),
        }
        for tag in event_acc.Tags()["tensors"]
        for event in event_acc.Tensors(tag)
    ]

    # Explicit columns keep the schema stable when the log holds no events, so
    # callers can still select on e.g. the "step" column of an empty result.
    return pd.DataFrame(records, columns=columns)

In [8]:
log_root = "./log"

# the folder names; stray non-directory entries (e.g. .DS_Store) are skipped
# because only experiment folders hold Tensorboard logs
_exp_names = sorted(
    entry
    for entry in os.listdir(log_root)
    if os.path.isdir(os.path.join(log_root, entry))
)
exp_names = [
    _exp_name.replace("_", " ") for _exp_name in _exp_names
]  # remove underscore for space

# Load the "test" logs of every experiment first and concatenate once at the
# end; growing a DataFrame with pd.concat inside the loop copies all earlier
# rows on every iteration.
experiment_frames = [
    load_tensorboard_data(
        log_dir=os.path.join(log_root, _exp_name, "test"), exp_name=exp_name
    )
    for _exp_name, exp_name in zip(_exp_names, exp_names)
]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# experiment folder exists
df = pd.concat(experiment_frames) if experiment_frames else pd.DataFrame()

In [9]:
# Step whose rows are reported as the final result.
# NOTE(review): presumably the last step logged by main.py — confirm.
FINAL_STEP = 19200

# Keep only the rows at that step, renumber them from 0 and drop the now-constant step column
at_final_step = df["step"] == FINAL_STEP
final_accuracy_df = df.loc[at_final_step].reset_index(drop=True).drop(columns="step")
final_accuracy_df

Unnamed: 0,experiment,metric,value
0,Additive update,accuracy,0.6971
1,Additive update,logit-adjusted accuracy,0.7645
2,ERM,accuracy,0.6927
3,LA,accuracy,0.761
