In [1]:
import os, sys
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import sklearn.preprocessing as Preprocessing
import pickle
from datetime import datetime

sys.path.insert(0, 'src')
from utils.places365_pred_utils import get_class_category_dict, get_category_class_dict
from utils.utils import ensure_dir, write_lists, informal_log
from utils.attribute_utils import get_one_hot_attributes, get_frequent_attributes

### Load features and attributes

In [14]:
# import numpy as np
# import os, sys

# sys.path.insert(0, 'src')

# # def get_attributes_set(data, paths):
# #     splits = ['train', 'val', 'test']
# #     seen_attributes = set()
# #     for split in splits:
# #         split_paths = paths[split]
# #         for path in split_paths:
# #             # Obtain attributes
# #             cur_attributes = data['labels'][path]
# #             for attr in cur_attributes:
# #                 seen_attributes.add(attr)
# #     return seen_attributes

# def get_one_hot_attributes(data, paths, n_attr, splits=['train', 'val', 'test']):
#     attributes = {}
#     for split in splits:
#         attributes[split] = []
#     # attributes = {
#     #     'train': [],
#     #     'val': [],
#     #     'test': []
#     # }

#     for split in splits:
#         split_paths = paths[split]
#         print("Processing attributes for {} split".format(split))
#         for path in tqdm(split_paths):
#             # Obtain attributes and covnvert to one hot
#             cur_attributes = data['labels'][path]
#             one_hot_attributes = np.zeros(n_attr)
#             one_hot_attributes[cur_attributes] = 1
#             attributes[split].append(one_hot_attributes)
#         attributes[split] = np.stack(attributes[split], axis=0)

#     # Print statistics from training
#     # train_attributes = attributes['train']
#     # counts = np.sum(train_attributes, axis=0)
#     # print("{} concepts that occur > 150 times in training".format(len(np.where(counts > 150)[0])))
#     # print("{} concepts that occur at all in training".format(len(np.nonzero(counts)[0])))

#     return attributes

# def get_frequent_attributes(attributes,
#                                frequency_threshold=150,
#                                splits=['train', 'val', 'test']):
#     '''
#     Given dictionary of 1-hot encoded attributes, return dictionary of same format with only frequent attributes

#     Arg(s):
#         attributes : dict[str : np.array]
#             keys: split ['train', 'val', 'test']
#             values: one-hot encoded attributes
#         frequency_threshold : int
#             number of occurrences in training for an attribute to be considered 'frequent'
#         splits : list[str]
#             list of split names to key dictionaries

#     Returns:
#         freq_attributes_dict : dict[str : np.array]
#     '''
#     train_counts = np.sum(attributes['train'], axis=0)

#     # Obtain one-hot encoding of attributes that exceed frequency threshold
#     freq_attributes_one_hot = np.where(train_counts > frequency_threshold, 1, 0)
#     # Mask out infrequent attributes
#     freq_attributes_dict = {}
#     for split in splits:
#         cur_attributes = attributes[split]
#         freq_attributes = np.where(freq_attributes_one_hot == 1, cur_attributes, 0)

#         # Sanity checks
#         discarded_attributes_idxs = np.nonzero(np.where(train_counts < frequency_threshold, 1, 0))[0]
#         kept_attributes_idxs = np.nonzero(train_counts > frequency_threshold)[0]
#         assert (kept_attributes_idxs == np.nonzero(freq_attributes_one_hot)[0]).all()

#         zeroed_ctr = 0
#         ctr = 0

#         for idx, (orig, new) in enumerate(zip(cur_attributes, freq_attributes)):
#             # print(orig
#             if not (orig == new).all():
#                 orig_idxs = np.nonzero(orig)[0]
#                 new_idxs = np.nonzero(new)[0]
#                 # Assert new idxs ONLY contains the kept attributes and none of discarded
#                 assert len(np.intersect1d(new_idxs, discarded_attributes_idxs)) == 0
#                 assert len(np.intersect1d(new_idxs, kept_attributes_idxs)) == len(new_idxs)
#                 # Assert overlap with original indices is equal to new indices
#                 assert (np.intersect1d(orig_idxs, new_idxs) == new_idxs).all()
#                 if len(new_idxs) == 0:
#                     zeroed_ctr += 1
#                 ctr += 1
#         print("{} examples have no more attributes".format(zeroed_ctr))
#         print("{}/{} examples affected".format(ctr, len(cur_attributes)))
#         freq_attributes_dict[split] = freq_attributes

#     return freq_attributes_dict, freq_attributes_one_hot

In [2]:
# Load features
features_dir = os.path.join('saved', 'ADE20K', '0501_105640')
train_features_path = os.path.join(features_dir, 'train_features.pth')
val_features_path = os.path.join(features_dir, 'val_features.pth')
test_features_path = os.path.join(features_dir, 'test_features.pth')

train_features_dict = torch.load(train_features_path)
train_features = train_features_dict['features']
train_paths = train_features_dict['paths']

val_features_dict = torch.load(val_features_path)
val_features = val_features_dict['features']
val_paths = val_features_dict['paths']

test_features_dict = torch.load(test_features_path)
test_features = test_features_dict['features']
test_paths = test_features_dict['paths']

paths = {
    'train': train_paths,
    'val': val_paths,
    'test': test_paths
}
n_attributes = 1200
frequency_threshold = 150

# Load data and calculate attributes
data_path = os.path.join('data', 'ade20k', 'full_ade20k_imagelabels.pth')
data = torch.load(data_path)

print("Obtaining one hot encodings of attributes")
attributes = get_one_hot_attributes(
    data=data,
    paths=paths,
    n_attr=n_attributes
)


print("Obtaining frequent attributes only")
freq_attributes, freq_attributes_one_hot = get_frequent_attributes(
    attributes=attributes,
    frequency_threshold=frequency_threshold
)

# Get indices of frequent attributes
frequent_attribute_idxs = np.nonzero(freq_attributes_one_hot)[0]

# Load names of attributes
labels_path = os.path.join('data', 'broden1_224', 'label.csv')
attribute_label_dict = pd.read_csv(labels_path, index_col=0)['name'].to_dict()
print("Loaded human-readable labels)")

Obtaining one hot encodings of attributes
Processing attributes for train split


100%|█████████████████████████████████████████████████████████████████████████| 13326/13326 [00:00<00:00, 116430.05it/s]


Processing attributes for val split


100%|███████████████████████████████████████████████████████████████████████████| 4442/4442 [00:00<00:00, 148595.87it/s]


Processing attributes for test split


100%|███████████████████████████████████████████████████████████████████████████| 4442/4442 [00:00<00:00, 148150.40it/s]


Obtaining frequent attributes only
6 examples have no more attributes
8308/13326 examples affected
3 examples have no more attributes
2736/4442 examples affected
3 examples have no more attributes
2804/4442 examples affected
Loaded human-readable labels)


### For each attribute, create a linear classifier (hyperparameter search)

In [3]:
def hyperparam_search_multinomial(train_features,
                                  train_labels, 
                                  val_features, 
                                  val_labels, 
                                  regularization,
                                  solver,
                                  scaler=None,
                                  Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5],
                                  log_path=None):
    best_clf = None
    best_acc = 0
    
    if scaler is not None:
        scaler.fit(train_features)
        print("Scaler parameters: {}".format(scaler.get_params()))
        train_features = scaler.transform(train_features)
        val_features = scaler.transform(val_features)
    for c in Cs:
        clf = LogisticRegression(solver=solver, C=c, penalty=regularization)
        clf.fit(train_features, train_labels)
        score = clf.score(val_features, val_labels)
        if score>best_acc:
            best_acc = score
            best_clf = clf
            informal_log("Best accuracy: {} Regularization: {}".format(score, c), log_path)
    
    return best_clf

In [None]:
cavs = {}
train_attributes = freq_attributes['train']
val_attributes = freq_attributes['val']

cavs_save_dir = os.path.join('saved', 'ADE20K', 'cav', 'scaled', datetime.now().strftime(r'%m%d_%H%M%S'))
if os.path.exists(cavs_save_dir):
    raise ValueError("Path {} already exists".format(cavs_save_dir))
ensure_dir(cavs_save_dir)

cavs_save_path = os.path.join(cavs_save_dir, 'cavs.pickle')
log_path = os.path.join(cavs_save_dir, 'log.txt')
n_frequent_attributes = len(frequent_attribute_idxs)

for idx, attribute_idx in tqdm(enumerate(frequent_attribute_idxs)):
    informal_log("[{}] {}/{} Calculating CAV for {}".format(
        datetime.now().strftime(r'%m%d_%H%M%S'),
        idx+1,
        n_frequent_attributes,
        attribute_label_dict[attribute_idx]), log_path)
    scaler = Preprocessing.StandardScaler()
    cav = hyperparam_search_multinomial(
        train_features=train_features,
        train_labels=train_attributes[:, attribute_idx],
        val_features=val_features,
        val_labels=val_attributes[:, attribute_idx],
        scaler=scaler,
        solver='liblinear',
        regularization='l2',
        log_path=log_path
    )
    cavs[attribute_idx] = cav
    
    accuracy = cav.score(val_features, val_attributes[:, attribute_idx])
    informal_log("CAV accuracy for {} concept ({}): {:.4f}".format(
        attribute_label_dict[attribute_idx],
        attribute_idx,
        accuracy), log_path)
    if idx % 10 == 0:
        pickle.dump(cavs, open(cavs_save_path, 'wb'))
        
pickle.dump(cavs, open(cavs_save_path, 'wb'))


0it [00:00, ?it/s]

[0513_094421] 1/182 Calculating CAV for wall
Scaler parameters: {'copy': True, 'with_mean': True, 'with_std': True}
Best accuracy: 0.8973435389464205 Regularization: 0.001
Best accuracy: 0.8995947771274201 Regularization: 0.005


1it [00:21, 21.65s/it]

CAV accuracy for wall concept (12): 0.7418
[0513_094443] 2/182 Calculating CAV for sky
Scaler parameters: {'copy': True, 'with_mean': True, 'with_std': True}


In [39]:
'''
Have CAVs trained using ADE20K training set
For each split:

'''

concept_vectors_dict = {}
concept_vectors_save_path = os.path.join(cavs_save_dir, '{}_cavs.pth')
for split in splits:
    split_features = features[split]
    concept_vectors = []
    for cav in cavs: # 182 of these
        concept_present = cav.predict(split_features) # N x 1 array
        concept_vectors.append(concept_present)
    concept_vectors = np.stack(concept_vectors, axis=1)
    concept_vectors_dict[split] = concept_vectors
    torch.save(concept_vectors, concept_vectors_save_path.format(split))

# Load model's predictions
# predictions_path = os.path.join(features_dir, '{}_logits_predictions.pth')
# train_predictions = torch.load(predictions_path.format('train'))['predictions']
# val_predictions = torch.load(predictions_path.format('val'))['predictions']
# test_predictions = torch.load(predictions_path.format('test'))['predictions']

# predictions = {
#     'train': train_predictions,
#     'val': val_predictions, 
#     'test': test_predictions
# }
prediction_path = os.path.join('saved', 'PlacesCategoryClassification', 'saga', '{}_outputs_predictions.pth')
splits = ['train', 'val', 'test']
predictions = {}
for split in splits:
    predictions[split] = torch.load(prediction_path.format(split))['predictions']

solver = 'saga'
penalty = 'l1',
max_iter = 200
hyperparam_search_multinomial(
    train_features=# NEED TO FILL IN, calculate concept vectors for all inputs,
    train_labels=predictions['train'], 
    val_features=#NEED TO FILL IN, claculate concept vectors for all inputs, 
    val_labels=predictions['val'], 
    regularization=penalty,
    solver=solver,
    scaler=None,
    Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5],
    log_path=None)


0.5051778478162989
