# Create summary and evaluation of different runs

In [77]:
# Copy the result directories of every assignment part from the remote host
# `snellius` into the local folder `snellius_results` (one sub-folder per source).
# NOTE(review): `snellius` is presumably an SSH host alias from ~/.ssh/config - confirm.
# rsync is run without --delete, so files removed remotely stay in the local copy;
# uncomment the next line to start from a clean local directory.
# !rm -rf snellius_results
!rsync -r snellius:~/uvadlc_practicals_2023/assignment2/part0/results_profiling snellius_results
!rsync -r snellius:~/uvadlc_practicals_2023/assignment2/part1/results_resnet18 snellius_results
!rsync -r snellius:~/uvadlc_practicals_2023/assignment2/part2/results_zs snellius_results
!rsync -r snellius:~/uvadlc_practicals_2023/assignment2/part2/results_vp snellius_results
!rsync -r snellius:~/uvadlc_practicals_2023/assignment2/part2/results_cross_data snellius_results
!rsync -r snellius:~/uvadlc_practicals_2023/assignment2/part2/images snellius_results

In [63]:
import json
from pathlib import Path
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Use the built-in "plotly" template as the default theme for all figures below.
pio.templates.default = "plotly"

# Folder that the figure cells write their PNG exports into; created if missing.
plots_dir = Path("plots")
plots_dir.mkdir(exist_ok=True)

In [49]:
snellius_results = Path("snellius_results")

def load_jsons_to_df(json_dir):
    """Load every ``*.json`` file directly inside ``json_dir`` into one DataFrame.

    Each file is parsed with ``json.load`` and becomes one entry of the list
    handed to ``pd.DataFrame`` (so a file holding a JSON object becomes one row).

    Args:
        json_dir: Directory to scan, as a ``pathlib.Path`` or a path string.
            Sub-directories are not searched.

    Returns:
        pd.DataFrame: One entry per JSON file. Files are read in sorted path
        order, so the resulting row index is reproducible across machines and
        file systems (plain ``glob`` order is arbitrary).
    """
    records = []
    # sorted() pins the row order; Path() lets callers pass plain strings too.
    for js_path in sorted(Path(json_dir).glob("*.json")):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(js_path, encoding="utf-8") as f:
            records.append(json.load(f))
    return pd.DataFrame(records)

# One DataFrame per experiment family, read from the matching sub-folder of
# `snellius_results` (order of the names matches the order of the folders).
resnet_df, zs_df, vp_dp_df, cross_df = (
    load_jsons_to_df(snellius_results / results_subdir)
    for results_subdir in (
        "results_resnet18",
        "results_zs",
        "results_vp",
        "results_cross_data",
    )
)

# Q1.2: Adapt ResNet18

In [50]:
# Order the ResNet18 runs by dataset, then augmentation, then noise setting,
# so that clean/noisy results of the same configuration sit next to each other.
resnet_df = resnet_df.sort_values(
    by=["dataset", "augmentation_name", "test_noise"],
)
resnet_df

Unnamed: 0,dataset,augmentation_name,test_noise,test_accuracy
1,cifar10,auto_augment,False,0.7679
0,cifar10,auto_augment,True,0.5727
6,cifar10,my_augmentation,False,0.788
8,cifar10,my_augmentation,True,0.3773
10,cifar10,,False,0.807
3,cifar10,,True,0.4585
5,cifar100,auto_augment,False,0.552
11,cifar100,auto_augment,True,0.3394
7,cifar100,my_augmentation,False,0.5522
9,cifar100,my_augmentation,True,0.2145


## Q1.2.a Retrain ResNet18 on Cifar100, report accuracy

In [51]:
# Test accuracy of the CIFAR-100 run without augmentation and without test noise.
resnet_df.loc[
    (resnet_df["dataset"] == "cifar100")
    & (resnet_df["test_noise"] == False)
    & resnet_df["augmentation_name"].isnull(),
    "test_accuracy",
]

4    0.5849
Name: test_accuracy, dtype: float64

# Q 2.1 CLIP

## Q1.2.b Use augmentation to improve the results

In [52]:
# All CIFAR-100 runs evaluated without test noise (every augmentation variant).
resnet_df[(resnet_df["dataset"] == "cifar100") & (resnet_df["test_noise"] == False)]

Unnamed: 0,dataset,augmentation_name,test_noise,test_accuracy
5,cifar100,auto_augment,False,0.552
7,cifar100,my_augmentation,False,0.5522
4,cifar100,,False,0.5849


In [64]:
# Bar chart of clean-test accuracy on CIFAR-100, one bar per augmentation
# variant; the run without augmentation is labelled "no augmentation".
fig = px.bar(
    data_frame=(
        resnet_df
        .query("dataset == 'cifar100' and test_noise == False")
        .fillna("no augmentation")
        .sort_values("augmentation_name", ascending=False)
    ),
    x="augmentation_name",
    y="test_accuracy",
    title="Comparison of test results without and with data augmentation for ResNet18",
    text_auto=True,
)
fig.write_image(plots_dir / "q12b_resnet_augmentation.png", scale=2.0)
fig

## Q2.1 CLIP Zero-shot classification

In [65]:
# Zero-shot results ordered by dataset, with the "train" split listed before "test".
zs_df.sort_values(
    by=["dataset", "set"],
    ascending=[True, False],
)

Unnamed: 0,dataset,set,accuracy
1,cifar10,train,88.721609
2,cifar10,test,88.897765
3,cifar100,train,63.581651
0,cifar100,test,63.079071


In [66]:
# Zero-shot accuracy per split, one facet per dataset.
fig = px.bar(
    data_frame=zs_df.sort_values(by=["dataset", "set"], ascending=[True, False]),
    x="set",
    y="accuracy",
    facet_col="dataset",
    barmode="group",
    title="Comparison of accuracies for CLIP zero-shot classification",
    text_auto=True,
)
fig.write_image(plots_dir / "q21_clip_zero_shot.png", scale=2.0)
fig

- Much lower for cifar100 - not surprising
- Train and test almost identical - not surprising, as this is not the train set that the models were trained on

# Q2.1.b CLIP Zero-shot classification - alternate tasks with different text prompts

- See images
- Prompting does make a difference, but values are still close - maybe something is wrong?

# Visual and deep prompting

In [67]:
# Columns kept for the visual-prompting result table.
vp_cols = [
    "dataset",
    "method",
    "prompt_size",
    "prompt_init_method",
    "test_noise",
    "top1_test_acc",
    "best_epoch",
]
# Columns kept for the deep-prompting result table.
dp_cols = [
    "dataset",
    "prompt_num",
    "injection_layer",
    "prompt_size",
    "test_noise",
    "top1_test_acc",
    "best_epoch",
]

## Q 2.2. Visual prompting

In [68]:
# Visual-prompt runs only, reduced to the columns of interest and ordered by
# dataset / method / prompt size / init method / noise setting.
vp_df = (
    vp_dp_df
    .query("prompt_type == 'visual_prompt'")[vp_cols]
    .sort_values(by=["dataset", "method", "prompt_size", "prompt_init_method", "test_noise"])
)

In [69]:
# Zero-shot test accuracies, shown here as the reference for the prompting results.
zs_df.query("set == 'test'").sort_values(
    by=["dataset", "set"],
    ascending=[True, False],
)

Unnamed: 0,dataset,set,accuracy
2,cifar10,test,88.897765
0,cifar100,test,63.079071


In [70]:
# Visual-prompting results on the clean (noise-free) test set.
vp_df.query("test_noise == False")[vp_cols].sort_values(
    by=["dataset", "method", "prompt_size", "test_noise"],
)

Unnamed: 0,dataset,method,prompt_size,prompt_init_method,test_noise,top1_test_acc,best_epoch
17,cifar10,fixed_patch,1,random,False,89.04,4
18,cifar10,fixed_patch,224,empty,False,87.77,1
30,cifar10,fixed_patch,224,random,False,82.3,33
1,cifar10,padding,30,random,False,87.61,36
31,cifar100,fixed_patch,1,random,False,64.34,16
11,cifar100,fixed_patch,224,empty,False,63.15,1
7,cifar100,fixed_patch,224,random,False,61.03,40
22,cifar100,padding,30,random,False,62.14,10


- 1 pixel fixed patches slightly improve the results
- 30 pixel padding makes it worse -> probably overfitting to train set
- full-image patches even worse
- ..but initializing to zeros instead of randomly does improve
- check prompt images after training

# Q 2.3. deep prompting

In [71]:
# Deep-prompt runs only, reduced to the columns of interest and ordered by
# dataset / injection layer / noise setting.
dp_df = (
    vp_dp_df
    .query("prompt_type == 'deep_prompt'")[dp_cols]
    .sort_values(by=["dataset", "injection_layer", "test_noise"])
)

In [41]:
# Deep-prompting results on the clean (noise-free) test set, per injection layer.
dp_df.query("test_noise == False").loc[
    :, ["dataset", "injection_layer", "top1_test_acc", "best_epoch"]
]

Unnamed: 0,dataset,injection_layer,top1_test_acc,best_epoch
6,cifar10,0,93.08,18
24,cifar10,2,92.82,3
20,cifar10,4,93.19,16
37,cifar10,6,91.88,33
25,cifar10,8,91.61,37
4,cifar10,10,90.26,11
21,cifar100,0,68.38,1
38,cifar100,2,69.26,1
5,cifar100,4,68.64,6
23,cifar100,6,67.69,39


- Deep prompting clearly works better than visual prompting
- The effect deteriorates the later the deep prompt gets injected
    - This is understandable in the sense that fewer transformer blocks "get the message"
    - But surprising in the sense that VP was bad, then first-layer DP is really good, and then it gets worse again

# Q 2.4 Robustness - test with noise 

In [72]:
# Label every run by its augmentation. Baseline runs have a missing
# `augmentation_name`; without the fillna, "aug: " + NaN stays NaN and the
# no-augmentation bars would have no usable x label.
resnet_df["experiment"] = "aug: " + resnet_df["augmentation_name"].fillna("none")

# Clean vs. noisy test accuracy per augmentation variant, one facet per dataset.
fig = px.bar(
    resnet_df,
    x="experiment",
    y="test_accuracy",
    color="test_noise",
    facet_col="dataset",
    barmode="group",
    text_auto=True,
    title="Comparison of test results without and with test noise for ResNet18")
fig.write_image(plots_dir / "q24_resnet_test_noise.png", scale=2.0)
fig

In [73]:
# Readable label per visual-prompt configuration: "<method> <prompt_size> <init method>".
vp_df["experiment"] = (
    vp_df["method"]
    + " "
    + vp_df["prompt_size"].astype(str)
    + " "
    + vp_df["prompt_init_method"]
)

# Clean vs. noisy top-1 test accuracy per configuration, one facet per dataset.
fig = px.bar(
    data_frame=vp_df,
    x="experiment",
    y="top1_test_acc",
    color="test_noise",
    facet_col="dataset",
    barmode="group",
    title="Comparison of test results without and with test noise for CLIP visual prompting",
    text_auto=True,
)
fig.write_image(plots_dir / "q24_clip_vp_test_noise.png", scale=2.0)
fig

In [74]:
# Readable label per deep-prompt configuration: "layer: <injection_layer>".
dp_df["experiment"] = "layer: " + dp_df["injection_layer"].astype(str)

# Clean vs. noisy top-1 test accuracy per injection layer, one facet per dataset.
fig = px.bar(
    data_frame=dp_df,
    x="experiment",
    y="top1_test_acc",
    color="test_noise",
    facet_col="dataset",
    barmode="group",
    title="Comparison of test results without and with test noise for CLIP deep prompting",
    text_auto=True,
)
fig.write_image(plots_dir / "q24_clip_dp_test_noise.png", scale=2.0)
fig

# Q2.5 Cross-dataset evaluation

In [75]:
# Attach the cross-dataset scores to the visual-prompt runs. The join uses all
# columns the two frames have in common (pandas' default when no key is given).
vp_cross_df = vp_df.merge(cross_df.query("prompt_type == 'visual_prompt'"))

# Cross-dataset top-1 accuracy per configuration, coloured by the `dataset` column.
fig = px.bar(
    data_frame=vp_cross_df,
    x="experiment",
    y="top1_test_acc_cross_data",
    color="dataset",
    barmode="group",
    title=(
        "Comparison of cross dataset test results for CLIP visual prompting models trained<br>"
        "on either of the two datasets"
    ),
    text_auto=True,
)
fig.write_image(plots_dir / "q25_clip_vp_cross_data_eval.png", scale=2.0)
fig

In [76]:
# Attach the cross-dataset scores to the deep-prompt runs. The join uses all
# columns the two frames have in common (pandas' default when no key is given).
dp_cross_df = pd.merge(dp_df, cross_df.query("prompt_type == 'deep_prompt'"))

# Plot the deep-prompt frame built above. The previous version passed
# `vp_cross_df` here, so the "deep prompting" figure and PNG actually showed
# the visual-prompting results again.
fig = px.bar(
    dp_cross_df,
    x="experiment",
    y="top1_test_acc_cross_data",
    color="dataset",
    barmode="group",
    text_auto=True,
    title="Comparison of cross dataset test results for CLIP deep prompting models trained<br>"
          "on either of the two datasets"
)
fig.write_image(plots_dir / "q25_clip_dp_cross_data_eval.png", scale=2.0)
fig

# Q3

In [94]:
import numpy as np
# 3.1 (a): adjacency matrix of a 7-node graph in which node 0 is connected to
# each of the nodes 1..6 and no other edges exist (no self-loop on node 0).
adj_a = np.zeros((7, 7))
adj_a[0, 1:] = 1  # edges 0 -> k for k = 1..6
adj_a[1:, 0] = 1  # symmetric counterpart k -> 0
adj_a.tolist()

[[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [95]:
# b
# Start from graph (a) and additionally close a cycle 1-2-3-4-5-6-1 over the
# nodes 1..6, writing every edge in both directions to keep the matrix symmetric.
adj_b = adj_a.copy()
for node in range(1, 7):
    neighbour = node % 6 + 1  # successor on the cycle; node 6 wraps around to 1
    adj_b[node, neighbour] = 1
    adj_b[neighbour, node] = 1

adj_b.tolist()

[[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0],
 [1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0]]

In [98]:
# Graph (c): graph (b) with node 0 removed, i.e. without the first row and column.
adj_c = adj_b[1:, 1:].copy()
adj_c.tolist()

[[0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]]

In [105]:
# Matrix product of adj_c with itself: entry (i, j) counts the walks of
# length 2 from node i to node j.
adj_c_sq = np.matmul(adj_c, adj_c)
adj_c_sq.tolist()

[[2.0, 0.0, 1.0, 0.0, 1.0, 0.0],
 [0.0, 2.0, 0.0, 1.0, 0.0, 1.0],
 [1.0, 0.0, 2.0, 0.0, 1.0, 0.0],
 [0.0, 1.0, 0.0, 2.0, 0.0, 1.0],
 [1.0, 0.0, 1.0, 0.0, 2.0, 0.0],
 [0.0, 1.0, 0.0, 1.0, 0.0, 2.0]]