# The Machine Learning Workflow Annotated

Source: [Pytorch / Learn the Basics](https://pytorch.org/tutorials/beginner/basics/intro.html)

```{attention} TODO

- Store a properly parametrized model in the repo and USE it (inference) before creating your own.
- Simplify model archi, get rid of sequential
- Explain logit stuff & give math formulas
- Explain & test cross-entropy
- Show that the neural network accepts batched data
- Implement the "PIL image to category -> proba sorted mapping" as task.
- Plot ReLU graph
- Clean up the use of "logits"; here the term actually refers to (unnormalized) log-probabilities
- Annotate & tweak learning
```

In [None]:
import torch
import torchvision

## The FashionMNIST Dataset 

In [None]:
# Download training data from open datasets.
training_data = torchvision.datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

# Download test data from open datasets.
test_data = torchvision.datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

In [None]:
data = torchvision.datasets.FashionMNIST(root="data") # by default: training data set, no input/output transform, no download
data

In [None]:
# data is list-like; each item is an input-output pair
datum = data[0]
datum

In [None]:
image, index = datum

In [None]:
image

In [None]:
# The output is a number that denotes the class of the pictured object. The list of categories is:
print(data.classes)

In [None]:
# Get the category name from the index:
data.classes[index] # that checks out!

In [None]:
import pandas as pd
df = [{"image": image, "category": data.classes[index]} for image, index in data]
df = pd.DataFrame(df)
df

In [None]:
import base64
import io
from IPython.display import HTML

def image_formatter(image):
    """Render an image as an inline HTML ``<img>`` tag.

    The image is JPEG-encoded in memory through its ``save`` method, then
    embedded as a base64 ``data:`` URI, so no file has to be written.
    """
    buffer = io.BytesIO()
    try:
        image.save(buffer, "jpeg")
        jpeg_bytes = buffer.getvalue()
    finally:
        # Release the in-memory buffer even if the encoding fails.
        buffer.close()
    payload = base64.b64encode(jpeg_bytes).decode("ascii")
    return '<img src="data:image/jpeg;base64, ' + payload + '">'

HTML(df.head().to_html(formatters={'image': image_formatter}, escape=False))

In [None]:
# Torch only deals with numeric arrays called "tensors", not images.
# So it is perfectly happy with the output being a numeric value, but we need to transform the input.
image_to_tensor = torchvision.transforms.ToTensor()
t = image_to_tensor(image)
t

In [None]:
t.shape, t.dtype

In [None]:
# No information has been lost in the conversion process!
import matplotlib.pyplot as plt
# "gray" is the canonical colormap name; the "grey" spelling is an alias
# that is not available in every Matplotlib release.
plt.imshow(t.squeeze(), cmap="gray")
plt.colorbar()
None

## Neural Network Model

The model architecture is going to assign to each image input the vector of probability $p_i$ that the item belongs to the $i$th class.
The class prediction is simply the class with the highest probability, but the fact that all $p_i$ are known allows us to evaluate the trust that we should have in the prediction.

The nitty-gritty details:

  - The image should be given as a 1x28x28 tensor (channels x height x width), as produced by `ToTensor`, instead of, say, a PIL image,

  - The model does not actually output the probabilities $p_i \in [0, 1]$ directly but the corresponding unnormalized log probabilities
    $$
    \ell_i := \log p_i + c
    $$
    Compute $p_i$ with:
    $$
    p_i = \frac{\exp \ell_i}{\sum_{j=0}^{9} \exp \ell_j}.
    $$

    The pytorch [`softmax`](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html) function implements this operation.

In [None]:
class NeuralNetwork(torch.nn.Module):
    """Fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 outputs.

    Every layer is a named attribute (``flatten``, ``linear_1``, ``relu_1``,
    ...) so that it can be inspected or applied on its own.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.flatten = nn.Flatten()
        self.linear_1 = nn.Linear(28 * 28, 512)
        self.relu_1 = nn.ReLU()
        self.linear_2 = nn.Linear(512, 512)
        self.relu_2 = nn.ReLU()
        self.linear_3 = nn.Linear(512, 10)

    def forward(self, image_tensor):
        """Return the 10 output scores (logits) computed for ``image_tensor``."""
        hidden = self.relu_1(self.linear_1(self.flatten(image_tensor)))
        hidden = self.relu_2(self.linear_2(hidden))
        return self.linear_3(hidden)

In [None]:
model = NeuralNetwork()
model

In [None]:
# Load the (trained) model state for this architecture.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict contains, so the file cannot run arbitrary code.
model.load_state_dict(torch.load("models/base-model.pth", weights_only=True))

In [None]:
image_tensor, cls = training_data[0]
# "gray" is the canonical colormap name; the "grey" spelling is an alias
# that is not available in every Matplotlib release.
plt.imshow(image_tensor.squeeze(), cmap="gray")
plt.grid(False)
None

In [None]:
with torch.no_grad():
    logits = model(image_tensor)
logits = logits.squeeze()

In [None]:
probas = torch.nn.functional.softmax(logits, dim=-1)
probas

In [None]:
probas_dict = {training_data.classes[i]: p.item() for i, p in enumerate(probas)}
probas_dict = dict(sorted(list(probas_dict.items()), key=lambda pair: -pair[1]))
probas_dict

In [None]:
import seaborn as sns; sns.set_theme()
sns.barplot(probas_dict)
plt.gcf().set_figwidth(12)
plt.gca().set_ylabel("Probability")
None

### Parameters

In [None]:
for p in model.parameters():
    print(f"type: {type(p.data).__name__}, shape: {tuple(p.shape)!s:<10}, data type: {p.dtype}")

```{tip} Model Size
How many scalar parameters describe the model? What is the corresponding model size in MB?
```

In [None]:
# Total number of scalar parameters: each parameter tensor contributes
# `numel()` entries (the product of its dimensions), as a plain Python int.
num_params = sum(p.numel() for p in model.parameters())

There are {eval}`num_params` ($\approx$ {eval}`num_params // 1_000`K) parameters in the model. The size of each parameter is 4B, hence the total size is {eval}`round(num_params * 4 / 1_000_000, 1)`MB.

In [None]:
with torch.no_grad():
    to_tensor = torchvision.transforms.ToTensor()
    input = to_tensor(image)
    output = model(input)
output

In [None]:
logits = output.data
probas = torch.nn.functional.softmax(logits, dim=-1)
probas = probas.squeeze()
probas

In [None]:
import seaborn as sns; sns.set_theme()
probas_dict = {training_data.classes[i]: p.item() for i, p in enumerate(probas)}
probas_dict = dict(sorted(list(probas_dict.items()), key=lambda pair: -pair[1]))
sns.barplot(probas_dict)
plt.gcf().set_figwidth(12)
plt.gca().set_ylabel("Probability")
None

In [None]:
for key in probas_dict:
    most_likely_category = key
    break
most_likely_category

In [None]:
def prediction(image):
    """Map every category name to its predicted probability for ``image``.

    The image is converted with ``ToTensor`` and fed to the global ``model``
    (switched to evaluation mode); softmax turns the model outputs into
    probabilities. The returned dict is ordered by decreasing probability.
    """
    model.eval()
    with torch.no_grad():
        scores = model(torchvision.transforms.ToTensor()(image))
    probabilities = torch.nn.functional.softmax(scores.data.squeeze(), dim=-1)

    pairs = []
    for position, proba in enumerate(probabilities):
        pairs.append((training_data.classes[position], proba.item()))
    # Most likely category first.
    pairs.sort(key=lambda pair: -pair[1])
    return dict(pairs)

In [None]:
prediction(image)

In [None]:
sns.barplot(prediction(image))
plt.gcf().set_figwidth(12)
plt.gca().set_ylabel("Probability")
None

### Under the Hood

In [None]:
model.flatten

In [None]:
image

In [None]:
t = torchvision.transforms.ToTensor()(image)
t

In [None]:
model.flatten(t)

In [None]:
model.linear_1

In [None]:
model.linear_1.in_features == 28 * 28

In [None]:
linear_1_params = {name: param.data for name, param in model.linear_1.named_parameters()}
linear_1_params

In [None]:
b1 = linear_1_params["bias"]
b1.shape

In [None]:
A1 = linear_1_params["weight"]
A1.shape

In [None]:
to_tensor = torchvision.transforms.ToTensor() 
t = to_tensor(image)
print(t.shape)
tf = torch.flatten(t)
print(tf.shape)
x1 = A1 @ tf + b1
x1

In [None]:
x1 = model.relu_1(x1)
x1

In [None]:
list(model.relu_1.named_parameters())

In [None]:
linear_2_params = {name: param.data for name, param in model.linear_2.named_parameters()}
A2 = linear_2_params["weight"]
b2 = linear_2_params["bias"]
x2 = A2 @ x1 + b2
x2

In [None]:
x2 = model.relu_2(x2)
x2

In [None]:
linear_3_params = {name: param.data for name, param in model.linear_3.named_parameters()}
A3 = linear_3_params["weight"]
b3 = linear_3_params["bias"]
x3 = A3 @ x2 + b3
x3

In [None]:
def softmax(x):
    """Apply softmax to ``x`` along its first dimension (``dim=0``)."""
    normalize = torch.nn.functional.softmax
    return normalize(x, dim=0)

probas = softmax(x3)
probas

In [None]:
data = {training_data.classes[i]: p.item() for i, p in enumerate(probas)}
data

In [None]:
import seaborn as sns; sns.set_theme()
sns.barplot(data)
plt.gcf().set_figwidth(12)
plt.gca().set_ylabel("Probability")
None

In [None]:
out = model(t).data.squeeze()

In [None]:
ps = torch.nn.functional.softmax(out, dim=-1)

In [None]:
data = {training_data.classes[i]: p.item() for i, p in enumerate(ps)}
sns.barplot(data)
plt.gcf().set_figwidth(16)
plt.gca().set_ylabel("Probability")
None

## Training

### Loss Function

The loss function is a measure of the model prediction error: the mismatch between the output predicted by the model and the "real" output. Here, in the context of category identification, we use the cross-entropy loss function.

In [None]:
loss_function = torch.nn.CrossEntropyLoss()

It measures the differences between two probability distributions: here a computed probability distribution $p=(p_0, \dots, p_{n-1})$ and a "deterministic" distribution $q=e_i$ with
$$
e_i=(0, \dots, 0, 1, 0, \dots, 0) \;\;\; \mbox{($1$ in position $i$)}
$$
with:
$$
\mathrm{loss}(p, e_i) = - \log p_i. 
$$
The loss is zero when $p_i = 1$ (perfect match) and tends to $+\infty$ when $p_i \to 0$. It does not depend on the distribution of the $p_j$ for $j \neq i$.

```{warning}
The pytorch cross entropy function works directly with unnormalized log probabilities 
$$
\ell_i := \log p_i + c
$$
(the log probabilities up to a shared constant $c$) instead of the probabilities $p$. 
The deterministic distribution is also specified by the index $i$ instead of the vector $q=e_i$.
Hence, it actually computes
$$
\mathrm{loss}(\ell, i) := -\ell_i  + \log \left( \sum_je^{\ell_j} \right). 
$$
```


In [None]:
cross_entropy = torch.nn.CrossEntropyLoss()

In [None]:
cross_entropy(
    input=torch.tensor([1.0, 0.0]).log(), 
    target=torch.tensor(0)
)

In [None]:
cross_entropy(
    input=torch.tensor([0.0, 1.0]).log(), 
    target=torch.tensor(0)
)

In [None]:
cross_entropy(
    input=torch.tensor([0.5, 0.5]).log(), 
    target=torch.tensor(0)
)

In [None]:
- torch.tensor(0.5).log()

In [None]:
cross_entropy(
    input=torch.tensor([2/3, 1/3]).log(), 
    target=torch.tensor(0)
)

In [None]:
cross_entropy(
    input=torch.tensor([200.0, 100.0]).log(), 
    target=torch.tensor(0)
)

### Data Loader

In [None]:
batch_size = 64

# Create data loaders.
train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"X = [N, C, H, W]: {X.shape}")
    print(f"y: {y.shape} {y.dtype}")
    break

### Training

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch goes through a forward pass, ``loss_fn``, backpropagation and
    one ``optimizer`` step. The loss is printed every 100 batches, together
    with a sample count derived from the batch index and the batch size.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for batch_index, (inputs, targets) in enumerate(dataloader):
        # Forward pass and prediction error.
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass, parameter update, then reset the gradients.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch_index % 100 != 0:
            continue
        loss_value = batch_loss.item()
        samples_seen = (batch_index + 1) * len(inputs)
        print(f"loss: {loss_value:>7f}  [{samples_seen:>5d}/{total_samples:>5d}]")

In [None]:
def test(dataloader, model, loss_fn):
    """
    Evaluate `model` on `dataloader` (in evaluation mode, without gradients)
    and print the accuracy and the average per-batch loss.

    Returns:
      - score, the fraction of samples whose predicted class (argmax of the
        model output) matches the target
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()
    loss_total = 0
    hits = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss_total += loss_fn(outputs, targets).item()
            matches = outputs.argmax(1) == targets
            hits += matches.type(torch.float).sum().item()
    mean_loss = loss_total / batch_count
    accuracy = hits / sample_count
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.2f}%, Avg loss: {mean_loss:>8f} \n")
    return accuracy

In [None]:
model = NeuralNetwork()
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_function = torch.nn.CrossEntropyLoss()

# Accuracy of the freshly initialized model: the baseline the first epoch must beat.
new_score = test(test_dataloader, model, loss_function)
epoch = 0
keep_learning = True
# Train one epoch at a time; stop as soon as the test accuracy no longer improves.
while keep_learning:
    epoch += 1
    score = new_score
    # `epoch` has already been incremented, so it is the 1-based epoch number.
    print(f"Epoch {epoch}\n-------------------------------")
    train(train_dataloader, model, loss_function, optimizer)
    new_score = test(test_dataloader, model, loss_function)
    keep_learning = new_score > score

print("Done!")


In [None]:
import os

model_path = "models/model.pth"
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)
# Report the path that was actually written.
print(f"Saved PyTorch Model State to {model_path}")

## Use the Model

In [None]:
model = NeuralNetwork()
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict contains, so the file cannot run arbitrary code.
model.load_state_dict(torch.load("models/model.pth", weights_only=True))

In [None]:
classes = test_data.classes

model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

In [None]:
model.eval()
input, output = test_data[0][0], test_data[0][1]
print(f"Known category: {test_data.classes[output]}")
# Inference only: skip autograd bookkeeping rather than stripping it
# afterwards with `.data`.
with torch.no_grad():
    logits = model(input)
logits = logits.squeeze()
probas = torch.nn.functional.softmax(logits, dim=-1)
print(f"Predicted category: {test_data.classes[probas.argmax().item()]}")
# Use the same class list as the two lookups above.
data = {test_data.classes[i]: p.item() for i, p in enumerate(probas)}
sns.barplot(data)
plt.gcf().set_figwidth(16)
plt.gca().set_ylabel("Probability")
None