## Given a 365 way scene classifier, obtain logits for 16-way scene category classifier


In [2]:
import os, sys
import torch
import torchvision
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import sklearn.preprocessing as Preprocessing

sys.path.insert(0, 'src')
from utils.places365_pred_utils import get_class_category_dict
from utils.utils import ensure_dir, read_json
from utils.model_utils import prepare_device
from datasets.datasets import ImageDataset

In [3]:
# Set variables
seed = 0


### Split Places 365 into train-val

In [41]:
# Run setup script if not already
if os.path.exists(data_save_path):

    # Set variables
    places365_dir= os.path.join('data', 'Places365')
    index_path = os.path.join(places365_dir, 'places365_val.txt')
    image_dir = os.path.join(places365_dir, 'val_256')

    train_split = 0.6

    data_save_dir = os.path.join('data', 'places365_categories')
    data_save_path = os.path.join(data_save_dir, 'places365_imagelabels.pth')

    sys.path.insert(0, 'setup')
    from setup_places365_categories import setup_places365_categories
    setup_places365_categories(
        image_dir=image_dir,
        index_path=index_path,
        train_split=train_split,
        data_save_path=data_save_path,
        seed=seed
    )

Path data/places365_categories/places365_imagelabels.pth exists. Aborting


36500it [00:10, 3325.37it/s]


Saved scene and scene category labels
Split train/val (0.6/0.4)
Saved data to data/places365_categories/places365_imagelabels.pth


### Save features of `val_train` and `val_val` partitions using Resnet model

In [None]:
config_path = os.path.join('configs', 'save_features_places_resnet_places365val.json')
from save_features import save_features

save_features(config_path)

### Load features and labels for training and validation

In [3]:
data_path = os.path.join('data', 'places365_categories', 'places365_imagelabels.pth')
features_dir = os.path.join('saved', 'Places365_val', '0509_161413')
train_features_path = os.path.join(features_dir, 'val_train_features.pth')
val_features_path = os.path.join(features_dir, 'val_val_features.pth')

# Load labels
data = torch.load(data_path)
train_paths = data['val_train']
val_paths = data['val_val']
category_labels = data['scene_category_labels']
train_category_labels = [category_labels[path] for path in train_paths]
val_category_labels = [category_labels[path] for path in val_paths]

# Load features
train_features_dict = torch.load(train_features_path)
val_features_dict = torch.load(val_features_path)

# Sanity checks for elementwise correspondence
for idx, path in enumerate(train_features_dict['paths']):
    assert path == train_paths[idx]
for idx, path in enumerate(val_features_dict['paths']):
    assert path == val_paths[idx]

train_features = train_features_dict['features']
val_features = val_features_dict['features']

# Length sanity checks
assert len(train_features) == len(train_category_labels)
assert len(val_features) == len(val_category_labels)

print("Loaded {} samples for training and {} samples for validation".format(
    len(train_features), len(val_features)
))

Loaded 21900 samples for training and 14600 samples for validation


#### Hyperparmeter search for linear layer

In [6]:

def hyperparam_search(train_features,
                      train_labels, 
                      val_features, 
                      val_labels, 
                      regularization,
                      solver,
                      scaler=None,
                      Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5],
                      max_iter=100):
    best_clf = None
    best_acc = 0
    
    if scaler is not None:
        scaler.fit(train_features)
        print("Scaler parameters: {}".format(scaler.get_params()))
        train_features = scaler.transform(train_features)
        val_features = scaler.transform(val_features)
    for c in Cs:
        clf = LogisticRegression(
            solver=solver, 
            C=c, 
            penalty=regularization,
            max_iter=max_iter)
        clf.fit(train_features, train_labels)
        score = clf.score(val_features, val_labels)
        if score>best_acc:
            best_acc = score
            best_clf = clf
            print("Best accuracy: {} Regularization: {}".format(score, c))
    
    return best_clf

In [7]:
solver = 'liblinear'
regularization = 'l2'

hyperparam_search(
    train_features=train_features,
    train_labels=train_category_labels,
    val_features=val_features,
    val_labels=val_category_labels,
    regularization=regularization,
    solver=solver
)

Best accuracy: 0.6236986301369863 Regularization: 0.001
Best accuracy: 0.647945205479452 Regularization: 0.005
Best accuracy: 0.6532191780821918 Regularization: 0.01


In [7]:
solver = 'lbfgs'
regularization = 'l2'
max_iter = 500

hyperparam_search(
    train_features=train_features,
    train_labels=train_category_labels,
    val_features=val_features,
    val_labels=val_category_labels,
    regularization=regularization,
    solver=solver,
    max_iter=max_iter
)

Best accuracy: 0.6296575342465753 Regularization: 0.001
Best accuracy: 0.6508219178082192 Regularization: 0.005


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best accuracy: 0.6558904109589041 Regularization: 0.01


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
solver = 'saga'
regularization = 'l2'
max_iter = 500
scaler = Preprocessing.StandardScaler()
hyperparam_search(
    train_features=train_features,
    train_labels=train_category_labels,
    val_features=val_features,
    val_labels=val_category_labels,
    regularization=regularization,
    scaler=scaler,
    solver=solver,
    max_iter=max_iter
)

Scaler parameters: {'copy': True, 'with_mean': True, 'with_std': True}
Best accuracy: 0.6445890410958904 Regularization: 0.001
Best accuracy: 0.6566438356164384 Regularization: 0.005


KeyboardInterrupt: 

In [18]:
solver = 'liblinear'
regularization = 'l2'
c = 0.01
max_iter = 500

linear_layer = LogisticRegression(
    solver=solver, 
    C=c, 
    penalty=regularization,
    max_iter=max_iter)

linear_layer.fit(
    train_features,
    train_category_labels)


In [19]:
# Print accuracy on Places_val_val set
logits = linear_layer.decision_function(val_features)
probability_ouputs = linear_layer.predict_proba(val_features)
accuracy = linear_layer.score(val_features, val_category_labels)
print("Accuracy: {}".format(accuracy))

save_dir = os.path.join('saved', 
                        'PlacesCategoryClassification',
                        'sklearn_logreg')
ensure_dir(save_dir)
model_save_path = os.path.join(save_dir, 
                               'linear_{}_{}_{}.pickle'.format(solver, regularization, c))
if os.path.exists(model_save_path):
    print("Model already exists at {}".format(model_save_path))
else:
    pickle.dump(linear_layer, open(model_save_path, 'wb'))
    
                        

Accuracy: 0.6532191780821918


### Use linear layer to obtain 16-way logit predictions from model's features on ADE20K images

In [6]:
ade20k_features_dir = os.path.join('saved', 'ADE20K', '0501_105640')
features_path_template = os.path.join(ade20k_features_dir, '{}_features.pth')
splits = ['train', 'val', 'test']
features = {}
for split in splits:
    features[split] = torch.load(features_path_template.format(split))['features']
# train_features = torch.load(features_path_template.format('train'))['features']
# val_features = torch.load(features_path_template.format('val'))['features']
# test_features = torch.load(features_path_template.format('test'))['features']


In [19]:
# Get predictions for liblinear log reg

logreg_dir = os.path.join('saved', 'PlacesCategoryClassification', 'sklearn_logreg')
liblinear_path = os.path.join(logreg_dir, 'linear_liblinear_l2_0.01.pickle')
liblinear = pickle.load(open(liblinear_path, 'rb'))

saga_path = os.path.join(logreg_dir, 'linear_saga_l2_0.005.pickle')
saga = pickle.load(open(saga_path, 'rb'))
liblinear_out = {}
saga_out = {}


output_path_template = os.path.join(
    'saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions',
    '{}',
    '{}_outputs_predictions.pth'
)

ensure_dir(os.path.dirname(output_path_template).format('liblinear'))
ensure_dir(os.path.dirname(output_path_template).format('saga'))
for split in splits:
    liblinear_out[split] = {}
    saga_out[split] = {}

    # Save unnormalized outputs
    liblinear_out[split]['outputs'] = liblinear.decision_function(features[split])
    saga_out[split]['outputs'] = saga.decision_function(features[split])
    # Save probabilities
    liblinear_out[split]['probabilities'] = liblinear.predict_proba(features[split])
    saga_out[split]['probabilities'] = saga.predict_proba(features[split])
    # Save predictions
    liblinear_out[split]['predictions'] = liblinear.predict(features[split])
    saga_out[split]['predictions'] = saga.predict(features[split])
    n_samples = len(features[split])
    
    liblinear_out_path = output_path_template.format('liblinear', split)
    if not os.path.exists(liblinear_out_path):
        torch.save(liblinear_out[split], liblinear_out_path)
        print("Saved {} outputs from liblinear to {}".format(split, liblinear_out_path))
    saga_out_path = output_path_template.format('saga', split)
    if not os.path.exists(saga_out_path):
        torch.save(saga_out[split], saga_out_path)
        print("{} Saved outputs from saga to {}".format(split, saga_out_path))



    # linear_layer_predictions = torch.load(linear_layer_output_path_template.format(split))['predictions']
    # assert len(linear_layer_predictions) == n_samples

    # print("---***---\n{}".format(split))
    # n_congruent = np.count_nonzero(liblinear_predictions[split] == saga_predictions[split])
    # print("liblinear-logreg {} prediction congruency: {}".format(split, n_congruent / n_samples))

    # n_congruent = np.count_nonzero(liblinear_predictions[split] == linear_layer_predictions)
    # print("liblinear-linear_layer {} prediction congruency: {}".format(split, n_congruent / n_samples))

    # n_congruent = np.count_nonzero(saga_predictions[split] == linear_layer_predictions)
    # print("saga-linear_layer {} prediction congruency: {}".format(split, n_congruent / n_samples))

Saved train outputs from liblinear to saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/liblinear/train_outputs_predictions.pth
Saved val outputs from liblinear to saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/liblinear/val_outputs_predictions.pth
Saved test outputs from liblinear to saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/liblinear/test_outputs_predictions.pth
