In [None]:
#!pip install tensorflow

In [None]:
#!pip install scikit-learn

In [11]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [12]:
print(tf.config.list_physical_devices('GPU'))

[]


In [13]:
#Import svm model
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

In [15]:
### 11-class with attention based instance filtering
# feat_dir = "11_class_features_attn_based_instance_filtering"
# files = os.listdir(feat_dir)
# files

In [16]:
### 5-class (person)
# files = ["feat_backpack.pickle", "feat_handbag.pickle", "feat_suitcase.pickle", "feat_tie.pickle", "feat_umbrella.pickle"]
### 3-class (dining table)
# why is it 3?
# files = ["feat_full_act_suitcase.pickle", "feat_full_act_tie.pickle", "feat_full_act_umbrella.pickle"]

In [17]:
def base_model(model="NN", n_classes = 5):
    if model == "NN":
        clf_model = Sequential()
        clf_model.add(Input(shape=(768)))
        clf_model.add(Dense(256, activation='relu'))
        clf_model.add(Dropout(0.5))
        clf_model.add(Dense(n_classes))
        clf_model.add(Activation("softmax"))
    #     sgd = SGD(lr=0.001)
        adam = Adam()
        clf_model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"])        
    else:
        clf_model = svm.SVC(kernel='linear') # Linear Kernel
    return clf_model

### Experiment settings


In [18]:
EXP_FOLDER = "experiments"

In [19]:
data_split = "train2017"
dataset_folder = f"{data_split}/task1_30-04_L5-8_clean_caption/features-mask-4-main_thr-0-sec_thr-0/"
exp_name = f"exp_{dataset_folder[18:-1]}" 
exp_name

'exp_-04_L5-8_clean_caption/features-mask-4-main_thr-0-sec_thr-0'

In [20]:
# os.makedirs(os.path.join(EXP_FOLDER, exp_name), exist_ok=True)

### Dataset preparation

In [21]:
# None: discard filter
# True: filter only features where caption matches object
# False: filter only features where caption DOESN'T match object
filter_caption = None

In [22]:
files = os.listdir(dataset_folder)
files

['feat-tokens_act-1-27.pickle',
 'feat-tokens_act-1-28.pickle',
 'feat-tokens_act-1-31.pickle',
 'feat-tokens_act-1-32.pickle']

In [23]:
# # files with features and labels are split due to ram limitations
# # on generation it has to fit ram, and also on reading

# files = os.listdir(dataset_folder)

# features = pd.read_pickle(os.path.join(dataset_folder, files[0]))
# features = features.set_index(["image_id"], verify_integrity=True)
# if filter_caption is not None:
#     print(f"filter caption is on:{filter_caption}")
#     features = features[features["caption_filter"]==filter_caption]
# else:
#     # order by caption filter to make sure there's caption_filter since only a few have
#     features = features.sort_values(by=['caption_filter'], ascending=False)


# for file in files[1:]:
#     print(f"Processing file '{file}'")    
#     obj_features = pd.read_pickle(os.path.join(dataset_folder, file))
#     obj_features = obj_features[~obj_features["image_id"].isin(features.index.tolist())]
#     if filter_caption is not None:
#         print(f"filter caption is on:{filter_caption}")
#         obj_features = obj_features[obj_features["caption_filter"]==filter_caption]
#     else:
#         # order by caption filter to make sure there's caption_filter since only a few have
#         obj_features = obj_features.sort_values(by=['caption_filter'], ascending=False)
#     obj_features = obj_features.set_index(["image_id"], verify_integrity=True)
# #     obj_features = obj_features[:1000]
#     features = pd.concat([features, obj_features])

# # features = features.reset_index(drop=True)
#     # TODO: fix consistent token selection with multiple layers
# features = features[(~features["second_fg_tokens"].isnull()) & 
#                     (~features["main_fg_tokens"].isnull())
# #                     (~features["second_consistent_fg_token"].isnull()) &
# #                     (~features["main_consistent_fg_token"].isnull())
#                    ]

In [24]:
def prepare_dataset(dataset_folder, filter_caption=None, limit_size=1000):
    files = os.listdir(dataset_folder)
    features = pd.DataFrame()
    for file in files:
        ## WORKAROUND 'feat-tokens_act-1-10.pickle'
        if int(file[16])==1:
            print(f"person dataset {file[16]}")
            first_label = "person"
        else:
            print(f"dining_table dataset {file[16]}")
            first_label = "dining_table"

        print(f"Processing file '{file}'")    
        obj_features = pd.read_pickle(os.path.join(dataset_folder, file))
        obj_features["class"] = obj_features["class"].apply(lambda x: first_label+"-"+x)
        if filter_caption is not None:
            print(f"filter caption is on:{filter_caption}")
            obj_features = obj_features[obj_features["caption_filter"]==filter_caption]
        else:
            # order by caption filter to make sure there's caption_filter since only a few have
            obj_features = obj_features.sort_values(by=['caption_filter'], ascending=False)
            obj_features = obj_features.reset_index(drop=True)
        obj_features = obj_features[:limit_size]
        features = pd.concat([features, obj_features])

    features = features.reset_index(drop=True)
    # TODO: fix consistent token selection with multiple layers
    features = features[(~features["second_fg_tokens"].isnull()) & 
                        (~features["main_fg_tokens"].isnull())
    #                     (~features["second_consistent_fg_token"].isnull()) &
    #                     (~features["main_consistent_fg_token"].isnull())
                        ]

    labels = features['class'].values.tolist()
    unique_labels = sorted(list(set(labels)))
    labels_to_idx = dict(zip(unique_labels, range(len(unique_labels))))

    features["labels"] = features['class'].apply(lambda x: labels_to_idx[x])
    # for stratification based on labels + caption
    features["labels_caption"] = features["class"].astype(str) + features["caption_filter"].astype(str)
    features = features.reset_index(drop=True)
    
    return features, unique_labels

In [25]:
data_split = "train2017"
dataset_folder = f"{data_split}/task1_30-04_clean_caption/features-mask-4-main_thr-0-sec_thr-0/"
test_features_mask4, unique_labels = prepare_dataset(dataset_folder, limit_size=1001)

dataset_folder = f"{data_split}/task1_30-04_clean_caption/features-mask-3-main_thr-0-sec_thr-0/"
test_features_mask3, unique_labels = prepare_dataset(dataset_folder, limit_size=1001)

person dataset 1
Processing file 'feat-tokens_act-1-27.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-28.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-31.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-32.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-27.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-28.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-31.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-32.pickle'


In [26]:
print(len(test_features_mask4))
print(len(test_features_mask3))

4004
4004


In [45]:
test_features_mask3.head()

Unnamed: 0,image_id,image_filename,caption_filter,main_fg_tokens,main_consistent_fg_token,second_fg_tokens,second_consistent_fg_token,main_fg_tokens_act,second_fg_tokens_act,class,labels,labels_caption
0,32887,000000032887.jpg,True,"{3: {'max_image': 65, 'max_obj': 65, 'min_obj'...",,"{3: {'max_image': 65, 'max_obj': 65, 'min_obj'...",,"{3: {'min_obj': [2.4349773, -0.1854052, 3.9142...","{3: {'min_obj': [0.63811743, 1.7890681, -0.548...",person-backpack,0,person-backpackTrue
1,309467,000000309467.jpg,True,"{3: {'max_image': 108, 'max_obj': 77, 'min_obj...",,"{3: {'max_image': 108, 'max_obj': 106, 'min_ob...",,"{3: {'min_obj': [-0.28453982, 4.286143, -0.778...","{3: {'min_obj': [1.7944645, 3.106815, -0.57848...",person-backpack,0,person-backpackTrue
2,161609,000000161609.jpg,True,"{3: {'max_image': 75, 'max_obj': 97, 'min_obj'...",,"{3: {'max_image': 75, 'max_obj': 184, 'min_obj...",,"{3: {'min_obj': [0.30532223, 1.8497058, -0.871...","{3: {'min_obj': [0.06252527, 4.5049324, 0.5796...",person-backpack,0,person-backpackTrue
3,253835,000000253835.jpg,True,"{3: {'max_image': 122, 'max_obj': 137, 'min_ob...",,"{3: {'max_image': 122, 'max_obj': 67, 'min_obj...",,"{3: {'min_obj': [0.754342, 1.004171, 1.0498483...","{3: {'min_obj': [1.4790652, 1.7272012, 0.74136...",person-backpack,0,person-backpackTrue
4,157928,000000157928.jpg,True,"{3: {'max_image': 91, 'max_obj': 88, 'min_obj'...",,"{3: {'max_image': 91, 'max_obj': 63, 'min_obj'...",,"{3: {'min_obj': [0.706177, 1.4178104, -0.16808...","{3: {'min_obj': [-0.7944455, 1.7602386, 0.5278...",person-backpack,0,person-backpackTrue


In [31]:
# test_features_mask3[""]
diff_token_count = 0
diff_images = []
for idx, row in test_features_mask3.iterrows():
    temp = test_features_mask4[test_features_mask4["image_id"]==row["image_id"]]
    if len(temp)>1:
        print("error on matching data!")
    if abs(row["main_fg_tokens"][10]["max_obj"]-temp["main_fg_tokens"].values[0][10]["max_obj"]) > 5:
        diff_token_count += 1
        print(f"Different max_obj : {row['main_fg_tokens'][10]['max_obj']}, {temp['main_fg_tokens'].values[0][10]['max_obj']}")
    diff_images.append(row["image_id"])
print(diff_token_count)

Different max_obj : 93, 64
Different max_obj : 99, 90
Different max_obj : 105, 75
Different max_obj : 128, 143
Different max_obj : 20, 29
Different max_obj : 110, 88
Different max_obj : 138, 81
Different max_obj : 169, 189
Different max_obj : 127, 119
Different max_obj : 93, 104
Different max_obj : 89, 157
Different max_obj : 46, 93
Different max_obj : 89, 32
Different max_obj : 129, 106
Different max_obj : 114, 133
Different max_obj : 128, 178
Different max_obj : 141, 176
Different max_obj : 149, 132
Different max_obj : 118, 89
Different max_obj : 163, 61
Different max_obj : 129, 162
Different max_obj : 56, 76
Different max_obj : 147, 116
Different max_obj : 111, 101
Different max_obj : 183, 54
Different max_obj : 164, 136
Different max_obj : 106, 87
Different max_obj : 132, 113
Different max_obj : 93, 87
Different max_obj : 49, 38
Different max_obj : 46, 17
Different max_obj : 65, 4
Different max_obj : 102, 92
Different max_obj : 114, 87
Different max_obj : 54, 38
Different max_obj :

In [33]:
data_split = "train2017"
dataset_folder = f"{data_split}/task2_30-04_clean_caption/features-mask-4-main_thr-0-sec_thr-0/"
test_features_mask4, unique_labels = prepare_dataset(dataset_folder, limit_size=10000)

dataset_folder = f"{data_split}/task1_30-04_clean_caption/features-mask-3-main_thr-0-sec_thr-0/"
test_features_mask3, unique_labels = prepare_dataset(dataset_folder, limit_size=10000)

dining_table dataset 6
Processing file 'feat-tokens_act-67-44.pickle'
dining_table dataset 6
Processing file 'feat-tokens_act-67-47.pickle'
dining_table dataset 6
Processing file 'feat-tokens_act-67-49.pickle'
dining_table dataset 6
Processing file 'feat-tokens_act-67-51.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-27.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-28.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-31.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-32.pickle'


In [88]:
task_group = "task1_30-04"
experiment_folders = os.listdir("val2017")
experiment_folders = [exp for exp in experiment_folders if exp[:11]==task_group]
experiment_folders

['task1_30-04_clean',
 'task1_30-04_clean_caption',
 'task1_30-04_L5-8_clean',
 'task1_30-04_L5-8_clean_caption',
 'task1_30-04_no_overlap',
 'task1_30-04_overlap']

In [90]:
for folder in experiment_folders[:1]:
    print(folder)
    dataset_folder = f"{data_split}/{folder}/features-mask-4-main_thr-0-sec_thr-0/"
    test_features_mask4, unique_labels = prepare_dataset(dataset_folder)

    dataset_folder = f"{data_split}/{folder}/features-mask-3-main_thr-0-sec_thr-0/"
    test_features_mask3, unique_labels = prepare_dataset(dataset_folder)
    
    test_features_mask3["mask-4-main_fg_tokens"] = ""
    test_features_mask3["mask-4-second_fg_tokens"] = ""
    diff_token_count = 0
    for idx, row in test_features_mask3.iterrows():
        temp = test_features_mask4[test_features_mask4["image_id"]==row["image_id"]]
        if len(temp)!=1:
            print("error on matching data!")
            print(row["image_id"])
            break
        all_tokens_match = True
        for layer in list(row["main_fg_tokens"].keys()):
            for object_ in ["main", "second"]:
                max_obj_3 = row[f"{object_}_fg_tokens"][layer]["max_obj"]
                max_obj_4 = temp[f"{object_}_fg_tokens"].values[0][layer]["max_obj"]
                max_token_diff = abs(max_obj_3-max_obj_4)

                min_obj_3 = row[f"{object_}_fg_tokens"][layer]["min_obj"]
                min_obj_4 = temp[f"{object_}_fg_tokens"].values[0][layer]["min_obj"]
                min_token_diff = abs(min_obj_3-min_obj_4)

                if max_token_diff > 1: 
                    if (max_obj_3%13!=0 or max_obj_3%14!=0) and (max_obj_4%13!=0 or max_obj_4%14!=0):
                        all_tokens_match = False
                    elif max_token_diff <3: 
                        print(f"Border tokens max_obj : {max_obj_3}, {max_obj_4}")
                    
                if min_token_diff > 1: 
                    if (min_obj_3%13!=0 or min_obj_3%14!=0) and (min_obj_4%13!=0 or min_obj_4%14!=0):
                        all_tokens_match = False
                    elif min_token_diff <3:
                        print(f"Border tokens min_obj : {min_obj_3}, {min_obj_4}")                       

        if not all_tokens_match:
            diff_token_count += 1
            test_features_mask3.at[idx,"mask-4-main_fg_tokens"] = temp[f"main_fg_tokens"].values[0]
            test_features_mask3.at[idx,"mask-4-second_fg_tokens"] = temp[f"second_fg_tokens"].values[0]
    test_features_mask3 = test_features_mask3.drop(columns=["main_fg_tokens_act", "second_fg_tokens_act"])
    test_features_mask3.to_pickle(f"{data_split}/{folder}/feat_token_diff_mask-3-4.pickle")
    print(diff_token_count)

task1_30-04_clean
person dataset 1
Processing file 'feat-tokens_act-1-27.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-28.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-31.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-32.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-27.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-28.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-31.pickle'
person dataset 1
Processing file 'feat-tokens_act-1-32.pickle'
Border tokens min_obj : 2, 0
408


In [91]:
test_features_mask3.head()

Unnamed: 0,image_id,image_filename,caption_filter,main_fg_tokens,main_consistent_fg_token,second_fg_tokens,second_consistent_fg_token,class,labels,labels_caption,mask-4-main_fg_tokens,mask-4-second_fg_tokens
0,424162,000000424162.jpg,False,"{3: {'max_image': 129, 'max_obj': 95, 'min_obj...",,"{3: {'max_image': 129, 'max_obj': 67, 'min_obj...",,person-backpack,0,person-backpackFalse,"{3: {'max_image': 129, 'max_obj': 78, 'min_obj...","{3: {'max_image': 129, 'max_obj': 67, 'min_obj..."
1,350122,000000350122.jpg,False,"{3: {'max_image': 103, 'max_obj': 98, 'min_obj...",,"{3: {'max_image': 103, 'max_obj': 107, 'min_ob...",,person-backpack,0,person-backpackFalse,"{3: {'max_image': 103, 'max_obj': 103, 'min_ob...","{3: {'max_image': 103, 'max_obj': 103, 'min_ob..."
2,363188,000000363188.jpg,False,"{3: {'max_image': 108, 'max_obj': 108, 'min_ob...",,"{3: {'max_image': 108, 'max_obj': 103, 'min_ob...",,person-backpack,0,person-backpackFalse,"{3: {'max_image': 108, 'max_obj': 108, 'min_ob...","{3: {'max_image': 108, 'max_obj': 103, 'min_ob..."
3,32887,000000032887.jpg,False,"{3: {'max_image': 65, 'max_obj': 65, 'min_obj'...",,"{3: {'max_image': 65, 'max_obj': 65, 'min_obj'...",,person-backpack,0,person-backpackFalse,,
4,281414,000000281414.jpg,False,"{3: {'max_image': 61, 'max_obj': 61, 'min_obj'...",,"{3: {'max_image': 61, 'max_obj': 110, 'min_obj...",,person-backpack,0,person-backpackFalse,,


In [34]:
test_features_mask3[["class", "caption_filter"]].groupby(["class", "caption_filter"])[["caption_filter"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,caption_filter
class,caption_filter,Unnamed: 2_level_1
person-backpack,False,991
person-backpack,True,327
person-handbag,False,987
person-handbag,True,159
person-tie,False,917
person-tie,True,995
person-umbrella,False,996
person-umbrella,True,998


In [32]:
test_features_mask4[["class", "caption_filter"]].groupby(["class", "caption_filter"])[["caption_filter"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,caption_filter
class,caption_filter,Unnamed: 2_level_1
dining_table-bottle,False,997
dining_table-bottle,True,393
dining_table-bowl,False,906
dining_table-bowl,True,804
dining_table-cup,False,999
dining_table-cup,True,825
dining_table-knife,False,981
dining_table-knife,True,224


In [19]:
## load preds to see size
preds_mask_3 = np.load("experiments/task1_exp_30-04_mask-3_person_accessory_clean/preds_l-11_o-main_t-max_obj.npy")
print(preds_mask_3.shape)
preds_mask_4 = np.load("experiments/task1_exp_30-04_mask-4_person_accessory_clean/preds_l-11_o-main_t-max_obj.npy")
print(preds_mask_4.shape)

(400, 4)
(400, 4)


In [20]:
def coco_dataset_split(train_features, test_features):
    train_idx, val_idx, train_labels, val_labels = train_test_split(train_features.index.tolist(), 
                                                                    train_features["labels"].tolist(), 
                                                                    test_size=0.10, 
                                                                    stratify=train_features["labels_caption"].tolist(),
                                                                    random_state=42, 
                                                                    shuffle=True)
    test_idx, test_labels = test_features.index.tolist(), test_features["labels"].tolist()

    return train_idx, val_idx, test_idx, train_labels, val_labels, test_labels

In [None]:
coco_dataset_split()

In [20]:
## Create a stable test 

In [21]:
train_idx, test_idx, train_labels, test_labels = train_test_split(features.index.tolist(),
                                                                  features["labels"], 
                                                                  test_size=0.10, 
                                                                  stratify=features["labels_caption"],
                                                                  random_state=42, 
                                                                  shuffle=True)   

[339177,
 343383,
 6031,
 372198,
 328449,
 377241,
 247840,
 348865,
 165760,
 450107,
 450707,
 278921,
 447089,
 30643,
 80470,
 418109,
 276354,
 416303,
 251098,
 10693,
 431023,
 467130,
 428000,
 477438,
 205253,
 141200,
 64824,
 573953,
 402287,
 326966,
 569353,
 70201,
 509589,
 432378,
 12547,
 394535,
 359399,
 448531,
 536725,
 387328,
 559665,
 234211,
 335217,
 347506,
 96288,
 99348,
 542205,
 578250,
 135467,
 121503,
 423313,
 148969,
 248767,
 28157,
 108272,
 577869,
 84230,
 434319,
 234889,
 524436,
 434867,
 496198,
 530207,
 378444,
 213725,
 530706,
 349344,
 102704,
 579415,
 48670,
 447613,
 196989,
 529314,
 443784,
 333058,
 33057,
 430750,
 22802,
 377732,
 56116,
 550514,
 66236,
 457861,
 278921,
 174332,
 280761,
 265990,
 282098,
 223874,
 134703,
 309071,
 452700,
 357013,
 61460,
 236068,
 525700,
 455719,
 118741,
 81903,
 244815,
 258850,
 80246,
 54761,
 457745,
 501624,
 427523,
 29886,
 27642,
 472054,
 22240,
 211051,
 442582,
 571848,
 355674

In [24]:
testset = {"train": {"data": train_idx, "labels": train_labels.tolist()},
           "test": {"data": test_idx , "labels": test_labels.tolist()}}

In [26]:
testset["train"]["data"], testset["train"]["labels"]

([2650,
  3125,
  3780,
  4145,
  2989,
  2088,
  1928,
  4341,
  2649,
  644,
  4788,
  9,
  1788,
  2991,
  3344,
  4718,
  3742,
  1240,
  2438,
  3648,
  2281,
  3785,
  2485,
  938,
  3312,
  3594,
  884,
  182,
  238,
  4780,
  1488,
  1480,
  1611,
  4047,
  4048,
  4057,
  2379,
  4112,
  3555,
  2772,
  1481,
  1160,
  4129,
  4131,
  4620,
  1804,
  843,
  2424,
  505,
  3302,
  4173,
  311,
  260,
  3614,
  4659,
  2883,
  3661,
  2802,
  93,
  2905,
  1047,
  4095,
  4470,
  1651,
  217,
  4461,
  2718,
  1994,
  3935,
  3599,
  851,
  3324,
  2401,
  3466,
  2967,
  1050,
  287,
  1924,
  2052,
  2149,
  4322,
  1797,
  4472,
  3267,
  309,
  3820,
  921,
  551,
  2164,
  4598,
  3961,
  2347,
  3664,
  364,
  1477,
  1563,
  2291,
  2681,
  1939,
  4034,
  495,
  2190,
  2126,
  628,
  1242,
  1865,
  4476,
  1901,
  4419,
  3877,
  2021,
  4885,
  3392,
  2944,
  177,
  2646,
  2185,
  2063,
  2768,
  1346,
  4339,
  3808,
  412,
  3499,
  1476,
  1284,
  3059,
  3880,
 

In [27]:
with open("test_split.pickle", 'wb') as handle:
    pickle.dump(testset, handle)

In [22]:
test_labels

474     0
1657    4
3098    3
3977    2
1806    4
       ..
3697    3
21      0
3307    3
2638    1
4666    2
Name: labels, Length: 498, dtype: int64

In [34]:
features.filter(items=train_idx, axis=0)[["class", "caption_filter"]].groupby(["class", "caption_filter"])[["caption_filter"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,caption_filter
class,caption_filter,Unnamed: 2_level_1
backpack,False,890
handbag,False,887
suitcase,False,633
suitcase,True,267
tie,False,741
tie,True,159
umbrella,False,418
umbrella,True,482


In [35]:
features.filter(items=test_idx, axis=0)[["class", "caption_filter"]].groupby(["class", "caption_filter"])[["caption_filter"]].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,caption_filter
class,caption_filter,Unnamed: 2_level_1
backpack,False,99
handbag,False,99
suitcase,False,70
suitcase,True,30
tie,False,82
tie,True,18
umbrella,False,46
umbrella,True,54


In [26]:
len(train_idx)

4477

In [36]:
train_idx, val_idx, train_labels, val_labels = train_test_split(features.filter(items=train_idx, axis=0).index.tolist(), 
                                                                features.filter(items=train_idx, axis=0)["labels"], 
                                                                test_size=0.10, 
                                                                stratify=features.filter(items=train_idx, axis=0)["labels_caption"],
                                                                random_state=42, 
                                                                shuffle=True)   

In [37]:
len(train_idx)

4029

In [38]:
len(val_idx)

448

### Model training

In [43]:
token_strategies = ['max_image', 'max_obj', 'min_obj', 'random_obj']
# layers = [3,4,9,10,11]
layers = [9,10,11]
objects = ['main', 'second']
model = "NN"
          

if model=="NN":
    train_y = to_categorical(train_labels)
    val_y = to_categorical(val_labels)
    test_y = to_categorical(test_labels)
else:
    train_y = train_labels
    val_y = val_labels
    test_y = test_labels

histories = {}
test_scores = {}
class_scores = {}

for obj in objects:
    for strategy in token_strategies:
        for layer in layers:
            train_data = features.filter(items=train_idx, axis=0)[f"{obj}_fg_tokens_act"].apply(lambda x: x[layer][strategy]).to_numpy()
            val_data = features.filter(items=val_idx, axis=0)[f"{obj}_fg_tokens_act"].apply(lambda x: x[layer][strategy]).to_numpy()
            test_data = features.filter(items=test_idx, axis=0)[f"{obj}_fg_tokens_act"].apply(lambda x: x[layer][strategy]).to_numpy()
            
            clf_model = base_model(model, n_classes=len(labels))
            print(f"Experiment {exp_name} - training model l:{layer}-o:{obj}-s:{strategy}")
            if model == "NN":
                es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20, restore_best_weights=True)        
                hist = clf_model.fit(tf.stack(train_data), 
                                     tf.stack(train_y), 
                                     validation_data=(tf.stack(val_data),tf.stack(val_y)),
                                     epochs=60, 
                                     batch_size=128, 
                                     callbacks=[es],
                                     verbose=0)
                # save model
                clf_model.save(os.path.join(EXP_FOLDER, exp_name, f"model-l{layer}-o{obj}-s{strategy}"))
                histories[f"{layer}-{obj}-{strategy}"] = hist
                # Save raw loss and acc from "hist" object to recreate plots
                with open(f"{EXP_FOLDER}/{exp_name}/model_l-{layer}_o-{obj}_t-{strategy}_history.pickle", 'wb') as handle:
                    pickle.dump(hist.history, handle)
                print("Evaluating model...")                
                test_scores[f"{layer}-{obj}-{strategy}"] = clf_model.evaluate(tf.stack(test_data), 
                                                                              tf.stack(test_y), 
                                                                              batch_size=128)
                #TODO: Evaluation per class using saved preds
                preds = clf_model.predict(tf.stack(test_data), batch_size=128)
                np.save(os.path.join(EXP_FOLDER, exp_name, f"preds_l-{layer}_o-{obj}_t-{strategy}.npy"), preds)
                y_pred = np.argmax(preds, axis=1)
                matrix = confusion_matrix(test_labels, y_pred)
                print(matrix.diagonal()/matrix.sum(axis=1))
                class_scores[f"{layer}-{obj}-{strategy}"] = matrix.diagonal()/matrix.sum(axis=1)
                print("Done...")
            else: 
                clf_model.fit(np.stack(train_data), train_y)
                y_pred = clf_model.predict(np.stack(test_data))
                test_scores[f"{layer}-{obj}-{strategy}"] = metrics.accuracy_score(test_labels, y_pred)

Experiment exp_mask-4-main_thr-0-sec_thr-0 - training model l:9-o:main-s:max_image
Restoring model weights from the end of the best epoch: 7.
Epoch 27: early stopping




INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l9-omain-smax_image/assets


INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l9-omain-smax_image/assets


Evaluating model...
[0.01010101 0.         0.61       0.65       0.01      ]
Done...
Experiment exp_mask-4-main_thr-0-sec_thr-0 - training model l:10-o:main-s:max_image
Restoring model weights from the end of the best epoch: 2.
Epoch 22: early stopping




INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l10-omain-smax_image/assets


INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l10-omain-smax_image/assets


Evaluating model...
[0.04040404 0.01010101 0.03       0.49       0.94      ]
Done...
Experiment exp_mask-4-main_thr-0-sec_thr-0 - training model l:11-o:main-s:max_image
Restoring model weights from the end of the best epoch: 4.
Epoch 24: early stopping




INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l11-omain-smax_image/assets


INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l11-omain-smax_image/assets


Evaluating model...
[0.31313131 0.1010101  0.43       0.63       0.48      ]
Done...
Experiment exp_mask-4-main_thr-0-sec_thr-0 - training model l:9-o:main-s:max_obj
Restoring model weights from the end of the best epoch: 4.
Epoch 24: early stopping




INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l9-omain-smax_obj/assets


INFO:tensorflow:Assets written to: experiments/exp_mask-4-main_thr-0-sec_thr-0/model-l9-omain-smax_obj/assets


Evaluating model...
[0.19191919 0.18181818 0.37       0.65       0.35      ]
Done...
Experiment exp_mask-4-main_thr-0-sec_thr-0 - training model l:10-o:main-s:max_obj


KeyboardInterrupt: 

In [31]:
class_scores['9-main-max_image'][0]

0.0

In [None]:
fig, axs = plt.subplots(nrows=len(histories), ncols=2, figsize=(8, 4*len(histories)))
for idx, (layer_name, hist) in enumerate(histories.items()):
    axs[idx, 0].plot(hist.history['loss'])
    axs[idx, 0].plot(hist.history['val_loss'])
    axs[idx, 0].set_title(f'{layer_name} loss')
#     axs[idx, 0].ylabel('loss')
#     axs[idx, 0].xlabel('epoch')
    axs[idx, 0].legend(['train', 'val'], loc='upper left')

    axs[idx, 1].plot(hist.history['accuracy'])
    axs[idx, 1].plot(hist.history['val_accuracy'])
    axs[idx, 1].set_title(f'{layer_name} accuracy')
#     axs[idx, 1].ylabel('accuracy')
#     axs[idx, 1].xlabel('epoch')
    axs[idx, 1].legend(['train', 'val'], loc='upper left')

plt.savefig(fname=f"{EXP_FOLDER}/{exp_name}/training_curves.png")
plt.show()
plt.close('all')

### Classification task summary:
* Number of instances per tuple, before cleaning. **Total 5 classes**.
    * (1, 27) ('person', 'backpack'): 3524
    * (1, 28) ('person', 'umbrella'): 2089
    * (1, 31) ('person', 'handbag'): 4890
    * (1, 32) ('person', 'tie'): 1543
    * (1, 33) ('person', 'suitcase'): 1048
* I had to limit the number of instances processed to 1000 because the extracted Tensor (hiddden states from 3 layers) became too big (5GB per tuple)
* from 5000 images, there were some issues with token selection and in the end **I was able to extract 2969 hidden states.**
    * these hidden states are from layers 10, 11 e 12 (3 last layers)
    * 10% split to test, 10% for validation
* Then, 30 NN were trained with the follow configurations:
    * 5 token_strategies = `["consistent", 'max_image', 'max_obj', 'min_obj', 'random_obj']`
        * `consistent`: consistent token which gets maximum activation across several layers within the foreground mask. One token per image.
        * `'max_obj', 'min_obj', 'random_obj'`: One token max/min/random **per layer** 10/11/12 within the foreground mask. 3 tokens per image, one for each one of the last 3 layers.
        * `'max_image'`: Token that gets maximum attention in the *whole image* .
    * 3 different layers (10,11,12): The hidden state to be used as input for the decoding task. 
    * 2 different objects: MAIN, SECOND.
        * a model is trained for each one of the 15 combinations of token * layer for the `MAIN` object and `SECOND` object.
        
**TOTAL OF 30 models**

In [None]:
def save_test_scores(scores, filename):
    scores_pd = {'model_name': [], 'object': [], 'token_strategy': [], 'hidden_state_layer': [], 'loss': [], 'test_score': []}
    for model_name, score in scores.items():
        scores_pd['model_name'].append(model_name)
        scores_pd['hidden_state_layer'].append(int(model_name[:model_name.find('-')]))
        scores_pd['object'].append(model_name[model_name.find('-')+1:model_name.rfind('-')])
        scores_pd['token_strategy'].append(model_name[model_name.rfind('-')+1:])
        scores_pd['loss'].append(score[0])
        scores_pd['test_score'].append(score[1])

    scores_pd = pd.DataFrame(scores_pd)
    scores_pd.to_csv(filename, index=False)
    return scores_pd

save_test_scores(test_scores, f"{EXP_FOLDER}/{exp_name}/test_scores.csv")

In [None]:
# plt.bar(x=scores_pd[(scores_pd["token_strategy"]=="max_obj") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
#         height=scores_pd[(scores_pd["token_strategy"]=="max_obj") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_max-act")
# plt.bar(x=scores_pd[(scores_pd["token_strategy"]=="min_obj") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
#         height=scores_pd[(scores_pd["token_strategy"]=="min_obj") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_min-act")
# plt.bar(x=scores_pd[(scores_pd["token_strategy"]=="random_obj") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
#         height=scores_pd[(scores_pd["token_strategy"]=="random_obj") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_random-obj-act")
# plt.bar(x=scores_pd[(scores_pd["token_strategy"]=="max_image") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
#         height=scores_pd[(scores_pd["token_strategy"]=="max_image") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_max-img-act")

# plt.xticks(rotation=85) 
# plt.legend()
# plt.show()

In [None]:
plt.figure(figsize=(12,8))
plt.plot(scores_pd[(scores_pd["token_strategy"]=="max_obj") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="max_obj") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_max-act")
plt.plot(scores_pd[(scores_pd["token_strategy"]=="min_obj") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="min_obj") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_min-act")
plt.plot(scores_pd[(scores_pd["token_strategy"]=="random_obj") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="random_obj") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_random-obj-act")
plt.plot(scores_pd[(scores_pd["token_strategy"]=="max_image") & (scores_pd["object"]=="main")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="max_image") & (scores_pd["object"]=="main")]["test_score"], label="main-obj_max-img-act")

plt.plot(scores_pd[(scores_pd["token_strategy"]=="max_obj") & (scores_pd["object"]=="second")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="max_obj") & (scores_pd["object"]=="second")]["test_score"], label="second-obj_max-act", linestyle='dashed')
plt.plot(scores_pd[(scores_pd["token_strategy"]=="min_obj") & (scores_pd["object"]=="second")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="min_obj") & (scores_pd["object"]=="second")]["test_score"], label="second-obj_min-act", linestyle='dashed')
plt.plot(scores_pd[(scores_pd["token_strategy"]=="random_obj") & (scores_pd["object"]=="second")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="random_obj") & (scores_pd["object"]=="second")]["test_score"], label="second-obj_random-obj-act", linestyle='dashed')
plt.plot(scores_pd[(scores_pd["token_strategy"]=="max_image") & (scores_pd["object"]=="second")]["hidden_state_layer"], 
         scores_pd[(scores_pd["token_strategy"]=="max_image") & (scores_pd["object"]=="second")]["test_score"], label="second-obj_max-img-act", linestyle='dashed')

plt.xticks(rotation=85) 
plt.legend()
plt.show()

In [None]:
# scores_pd = scores_pd.rename(columns={"test_score": "NN_test_score"})
# scores_pd["SVM_test_score"] = list(test_scores.values())
