In [47]:
import os, sys
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import sklearn.preprocessing as Preprocessing
import pickle

sys.path.insert(0, 'src')
from utils.places365_pred_utils import get_class_category_dict, get_category_class_dict
from utils.utils import ensure_dir, write_lists
from utils.attribute_utils import get_one_hot_attributes, get_frequent_attributes

### Load attributes and predictions

In [31]:
# Locations of the ADE20K image labels and of the saved per-split model outputs
data_path = os.path.join('data', 'ade20k', 'full_ade20k_imagelabels.pth')
save_dir = os.path.join('saved', 'ADE20K', '0501_105640')
predictions_path = os.path.join(save_dir, '{}_logits_predictions.pth')

data = torch.load(data_path)
print("Data keys: {}".format(data.keys()))

# Load the saved outputs of every split, in the order train -> val -> test
train, val, test = (
    torch.load(predictions_path.format(split_name))
    for split_name in ('train', 'val', 'test')
)

# Unpack the entries that later cells refer to by name
train_paths, train_predictions = train['paths'], train['predictions']
val_paths, val_logits, val_predictions = val['paths'], val['logits'], val['predictions']
test_paths, test_predictions = test['paths'], test['predictions']

print(train_predictions.shape, val_predictions.shape, test_predictions.shape)

# Per-split lookups keyed by split name
predictions = dict(train=train_predictions, val=val_predictions, test=test_predictions)
paths = dict(train=train_paths, val=val_paths, test=test_paths)



Data keys: dict_keys(['train', 'val', 'test', 'labels'])
(13326,) (4442,) (4442,)


In [32]:
# def get_attributes_set(data, paths):
#     splits = ['train', 'val', 'test']
#     seen_attributes = set()
#     for split in splits:
#         split_paths = paths[split]
#         for path in split_paths:
#             # Obtain attributes
#             cur_attributes = data['labels'][path]
#             for attr in cur_attributes:
#                 seen_attributes.add(attr)
#     return seen_attributes

# def get_one_hot_attributes(data, paths, n_attr):
#     splits = ['train', 'val', 'test']
#     attributes = {
#         'train': [],
#         'val': [], 
#         'test': []
#     }
    
#     for split in splits:
#         split_paths = paths[split]
#         print("Processing attributes for {} split".format(split))
#         for path in tqdm(split_paths):
#             # Obtain attributes and convert to one hot
#             cur_attributes = data['labels'][path]
#             one_hot_attributes = np.zeros(n_attr)
#             one_hot_attributes[cur_attributes] = 1
#             attributes[split].append(one_hot_attributes)
#         attributes[split] = np.stack(attributes[split], axis=0)
        
#     return attributes

# attr_set = get_attributes_set(data, paths)
# n_attr = len(attr_set)

# print("Lowest attribute index: {} Highest attribute index: {}".format(min(attr_set), max(attr_set)))
# N_ATTR = 1200  # including the attributes

# attributes = get_one_hot_attributes(data, paths, N_ATTR)

Lowest attribute index: 12 Highest attribute index: 1197
Processing attributes for train split


100%|██████████| 13326/13326 [00:00<00:00, 101818.00it/s]


Processing attributes for val split


100%|██████████| 4442/4442 [00:00<00:00, 118150.91it/s]


Processing attributes for test split


100%|██████████| 4442/4442 [00:00<00:00, 135174.48it/s]


In [33]:
# train_attributes = attributes['train']
# counts = np.sum(train_attributes, axis=0)
# print("{} concepts that occur > 150 times in training".format(len(np.where(counts > 150)[0])))
# print("{} concepts that occur at all in training".format(len(np.nonzero(counts)[0])))

182 concepts that occur > 150 times in training
623 concepts that occur at all in training


### Obtain attributes that are frequently occurring in the training set

In [34]:
# FREQUENCY_THRESH = 150
# n_samples = attributes['train'].shape[0]
# train_counts = np.sum(attributes['train'], axis=0)

# def obtain_frequent_attributes(cur_attributes, train_counts):

#     # Obtain one-hot encoding of attributes that exceed frequency threshold
#     frequent_attributes_one_hot = np.where(train_counts > FREQUENCY_THRESH, 1, 0)
#     # Mask out infrequent attributes
#     frequent_attributes = np.where(frequent_attributes_one_hot == 1, cur_attributes, 0)

#     # Sanity checks
#     discarded_attributes_idxs = np.nonzero(np.where(train_counts < FREQUENCY_THRESH, 1, 0))[0]
#     kept_attributes_idxs = np.nonzero(train_counts > FREQUENCY_THRESH)[0]
#     assert (kept_attributes_idxs == np.nonzero(frequent_attributes_one_hot)[0]).all()

#     zeroed_ctr = 0
#     ctr = 0
#     for idx, (orig, new) in enumerate(zip(cur_attributes, frequent_attributes)):
#         # print(orig
#         if not (orig == new).all():
#             orig_idxs = np.nonzero(orig)[0]
#             new_idxs = np.nonzero(new)[0]
#             # Assert new idxs ONLY contains the kept attributes and none of discarded
#             assert len(np.intersect1d(new_idxs, discarded_attributes_idxs)) == 0
#             assert len(np.intersect1d(new_idxs, kept_attributes_idxs)) == len(new_idxs)
#             # Assert overlap with original indices is equal to new indices
#             assert (np.intersect1d(orig_idxs, new_idxs) == new_idxs).all()
#             if len(new_idxs) == 0:
#                 zeroed_ctr += 1
#             ctr += 1
#     print("{} examples have no more attributes".format(zeroed_ctr))
#     print("{}/{} examples affected".format(ctr, len(cur_attributes)))
    
#     return frequent_attributes

# frequent_attributes = {}
# for split in ['train', 'val', 'test']:
#     cur_frequent_attributes = obtain_frequent_attributes(
#         cur_attributes=attributes[split],
#         train_counts = train_counts)
#     frequent_attributes[split] = cur_frequent_attributes



6 examples have no more attributes
8308/13326 examples affected
3 examples have no more attributes
2736/4442 examples affected
3 examples have no more attributes
2804/4442 examples affected


In [None]:
# Value passed as `n_attr` to get_one_hot_attributes
# (presumably the length of each one-hot attribute vector -- confirm in utils.attribute_utils).
n_attributes = 1200
# Value passed as `frequency_threshold` to get_frequent_attributes
# (presumably a minimum number of training occurrences -- confirm in utils.attribute_utils).
frequency_threshold = 150
# Build the attribute encodings from the label data and the per-split image paths.
# The result is indexed by split name in later cells (e.g. attributes['train']).
attributes = get_one_hot_attributes(
    data=data,
    paths=paths,
    n_attr=n_attributes
)

# Derive the frequent-attribute version of the encodings. Only `frequent_attributes` is used by
# the cells below; it is likewise indexed by split name.
frequent_attributes, frequent_attributes_one_hot = get_frequent_attributes(
    attributes=attributes,
    frequency_threshold=frequency_threshold
)

#### Obtain scene category predictions from 365-way scene prediction and save to `save_dir`

In [36]:
# Lookup used below to translate each scene prediction into its scene category
# (presumably keyed by Places365 class index -- confirm in utils.places365_pred_utils).
scene_category_dict = get_class_category_dict()

def get_scene_category_predictions(scene_predictions, scene_category_dict):
    '''
    Translate scene predictions into scene category predictions via a lookup table

    Arg(s):
        scene_predictions : iterable
            scene predictions; each element must be a key of scene_category_dict
        scene_category_dict : dict
            maps a scene prediction to its scene category

    Returns:
        np.array
            category of each element of scene_predictions, in the same order
    '''
    return np.array([
        scene_category_dict[scene_prediction]
        for scene_prediction in scene_predictions
    ])

# Convert every split's scene predictions to scene categories and write each result to
# save_dir, leaving any file that is already on disk untouched
category_predictions = {}
for split, split_predictions in predictions.items():
    save_path = os.path.join(save_dir, '{}_scene_category_predictions.pth'.format(split))
    split_category_predictions = get_scene_category_predictions(
        scene_category_dict=scene_category_dict,
        scene_predictions=split_predictions)

    if not os.path.exists(save_path):
        torch.save({'scene_category_predictions': split_category_predictions}, save_path)
        print("Saved scene category predictions for {} to {}".format(split, save_path))
    else:
        print("{} already exists.".format(save_path))

    category_predictions[split] = split_category_predictions
    

saved/ADE20K/0501_105640/train_scene_category_predictions.pth already exists.
saved/ADE20K/0501_105640/val_scene_category_predictions.pth already exists.
saved/ADE20K/0501_105640/test_scene_category_predictions.pth already exists.


### Run hyperparameter search on logistic regression for linear model to predict classes from attributes

In [48]:
def hyperparam_search_l1(train_features, train_labels, val_features, val_labels,
                         Cs=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5)):
    '''
    Fit one L1-penalized liblinear LogisticRegression per value in Cs and keep the one
    with the highest validation accuracy

    Arg(s):
        train_features : array-like
            features the classifiers are fit on
        train_labels : array-like
            labels the classifiers are fit on
        val_features : array-like
            features used to score each classifier
        val_labels : array-like
            labels used to score each classifier
        Cs : iterable of numbers
            candidate values for LogisticRegression's C, tried in order

    Returns:
        LogisticRegression or None
            first classifier that reaches the highest validation accuracy;
            None only if Cs is empty
    '''
    best_clf = None
    # Start below every attainable accuracy so that a classifier is still returned
    # when all candidates score exactly 0
    best_acc = float('-inf')

    for c in Cs:
        clf = LogisticRegression(solver='liblinear', C=c, penalty='l1')
        clf.fit(train_features, train_labels)
        score = clf.score(val_features, val_labels)
        # Strict comparison: on ties the earlier C is kept
        if score > best_acc:
            best_acc = score
            best_clf = clf
            print("Best accuracy: {} Regularization: {}".format(score, c))

    return best_clf

def hyperparam_search(train_features,
                      train_labels,
                      val_features,
                      val_labels,
                      regularization,
                      solver,
                      scaler=None,
                      Cs=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 5),
                      max_iter=100):
    '''
    Fit one LogisticRegression per value in Cs and keep the one with the highest
    validation accuracy

    Arg(s):
        train_features : array-like
            features the classifiers are fit on
        train_labels : array-like
            labels the classifiers are fit on
        val_features : array-like
            features used to score each classifier
        val_labels : array-like
            labels used to score each classifier
        regularization : str
            passed to LogisticRegression as `penalty`
        solver : str
            passed to LogisticRegression as `solver`
        scaler : object or None
            if given, it is fit on train_features (the object itself is modified) and its
            transform is applied to both train and validation features before fitting.
            The returned classifier expects inputs transformed by this same scaler
        Cs : iterable of numbers
            candidate values for LogisticRegression's C, tried in order
        max_iter : int
            passed to LogisticRegression as `max_iter`; the default of 100 matches
            scikit-learn's own default, so existing calls behave as before

    Returns:
        LogisticRegression or None
            first classifier that reaches the highest validation accuracy;
            None only if Cs is empty
    '''
    best_clf = None
    # Start below every attainable accuracy so that a classifier is still returned
    # when all candidates score exactly 0
    best_acc = float('-inf')

    if scaler is not None:
        # Fit on the training split only, then apply the same transform to both splits
        scaler.fit(train_features)
        print("Scaler parameters: {}".format(scaler.get_params()))
        train_features = scaler.transform(train_features)
        val_features = scaler.transform(val_features)

    for c in Cs:
        clf = LogisticRegression(solver=solver, C=c, penalty=regularization, max_iter=max_iter)
        clf.fit(train_features, train_labels)
        score = clf.score(val_features, val_labels)
        # Strict comparison: on ties the earlier C is kept
        if score > best_acc:
            best_acc = score
            best_clf = clf
            print("Best accuracy: {} Regularization: {}".format(score, c))

    return best_clf

def partition_paths_by_congruency(explainer_predictions,
                                  model_predictions,
                                  paths):
    '''
    Given list or arrays of explainer and model predictions, partition paths based on if predictions align

    Arg(s):
        explainer_predictions : N-length np.array
            predictions output by the explainer model
        model_predictions : N-length np.array
            predictions output by the model
        paths : N-length list
            paths of images corresponding to each data point

    Returns:
        dictionary : dict[str] : list
            key: 'congruent' or 'incongruent'
            value: list of paths

    Raises:
        AssertionError
            if the three inputs do not all have the same length
    '''
    n_samples = len(paths)
    assert len(explainer_predictions) == n_samples, "Length of explainer predictions {} doesn't match n_samples {}".format(
        len(explainer_predictions), n_samples
    )
    assert len(model_predictions) == n_samples, "Length of model predictions {} doesn't match n_samples {}".format(
        len(model_predictions), n_samples
    )

    incongruent_paths = []
    congruent_paths = []

    # zip() has no length, so the total is passed explicitly for tqdm to show progress
    for explainer_prediction, model_prediction, path in tqdm(zip(
        explainer_predictions, model_predictions, paths
    ), total=n_samples):
        if explainer_prediction == model_prediction:
            congruent_paths.append(path)
        else:
            incongruent_paths.append(path)

    return {
        'congruent': congruent_paths,
        'incongruent': incongruent_paths
    }



#### Predicting all 365 classes with all part/object attributes

In [33]:
# L1 (liblinear) search over the default C grid using the full attribute encodings.
# The targets are the model's own predictions (train_predictions / val_predictions loaded from
# the saved outputs), so the score measures agreement with the model.
hyperparam_search_l1(
    train_features=attributes['train'],
    train_labels=train_predictions,
    val_features=attributes['val'],
    val_labels=val_predictions)
    

0.029941467807294012 0.001
0.1154885186852769 0.005
0.15038271049076993 0.01
0.23660513282305268 0.05
0.28140477262494373 0.1
0.3685276902296263 0.5
0.375056280954525 1


#### Predicting all 365 classes with part/object attributes that appear in >=150 training examples

In [77]:
# L1 (liblinear) search over the default C grid using only the frequent attributes.
# The features come from `frequent_attributes` (returned by get_frequent_attributes);
# freq_train_attributes / freq_val_attributes are not defined anywhere in this notebook and
# raise NameError on a fresh kernel.
hyperparam_search_l1(
    train_features=frequent_attributes['train'],
    train_labels=train_predictions,
    val_features=frequent_attributes['val'],
    val_labels=val_predictions)

Best accuracy: 0.029941467807294012 Regularization: 0.001
Best accuracy: 0.1154885186852769 Regularization: 0.005
Best accuracy: 0.14678072940117065 Regularization: 0.01
Best accuracy: 0.20711391265195858 Regularization: 0.05
Best accuracy: 0.24245835209365152 Regularization: 0.1
Best accuracy: 0.285006753714543 Regularization: 0.5
Best accuracy: 0.2915353444394417 Regularization: 1


#### Predict only 16 scene categories with all part/object attributes

In [101]:
# L1 (liblinear) search over the default C grid using the full attribute encodings.
# The targets are the scene category predictions derived from the model's scene predictions.
hyperparam_search_l1(
    train_features=attributes['train'],
    train_labels=category_predictions['train'],
    val_features=attributes['val'],
    val_labels=category_predictions['val'])

Best accuracy: 0.3275551553354345 Regularization: 0.001
Best accuracy: 0.39756866276452046 Regularization: 0.005
Best accuracy: 0.43876632147681227 Regularization: 0.01
Best accuracy: 0.5538045925258892 Regularization: 0.05
Best accuracy: 0.5839711841512832 Regularization: 0.1
Best accuracy: 0.6226924808644755 Regularization: 0.5
Best accuracy: 0.6292210715893741 Regularization: 1


#### Predict 16 scene categories with top attributes (appear in 150 or more training examples)

In [None]:
# Earlier liblinear variant of this search, kept for reference:
# hyperparam_search_l1(
#     train_features=frequent_attributes['train'],
#     train_labels=category_predictions['train'],
#     val_features=frequent_attributes['val'],
#     val_labels=category_predictions['val'])

# Same data, but searched with the saga solver and an L1 penalty over a narrower C grid.
# NOTE(review): hyperparam_search builds LogisticRegression without max_iter here, so
# scikit-learn's default iteration cap applies -- check the output for convergence warnings.
print("Multinomial solver")
hyperparam_search(
    train_features=frequent_attributes['train'],
    train_labels=category_predictions['train'],
    val_features=frequent_attributes['val'],
    val_labels=category_predictions['val'],
    solver='saga',
    regularization='l1',
    Cs=[0.005, 0.01, 0.05, 0.1, 0.5])

Multinomial solver
Best accuracy: 0.42976136875281407 Regularization: 0.005
Best accuracy: 0.4558757316524088 Regularization: 0.01
Best accuracy: 0.539846915803692 Regularization: 0.05
Best accuracy: 0.5520036019810896 Regularization: 0.1




Best accuracy: 0.5632597928860873 Regularization: 0.5


In [8]:
# Finer saga/L1 search over C values 0.2, 0.3 and 0.4, on the frequent attributes and the
# scene category predictions.
hyperparam_search(
    train_features=frequent_attributes['train'],
    train_labels=category_predictions['train'],
    val_features=frequent_attributes['val'],
    val_labels=category_predictions['val'],
    solver='saga',
    regularization='l1',
    Cs=[0.2, 0.3, 0.4])

Best accuracy: 0.5612336785231877 Regularization: 0.2
Best accuracy: 0.5628095452498875 Regularization: 0.4




### Create Explainer Baseline Using Multinomial and L1

In [37]:
# Hyperparameters of the baseline explainer
solver, penalty = 'saga', 'l1'
c = 0.5
max_iter = 200

# Features: frequent attributes. Targets: scene category predictions.
train_X, val_X = frequent_attributes['train'], frequent_attributes['val']
train_y, val_y = category_predictions['train'], category_predictions['val']

# Report the configuration, then build the classifier and fit it on the training split
print("Logistic regression parameters: \n\tC: {} \tSolver: {} \tPenalty: {}".format(
    c, solver, penalty))

logreg = LogisticRegression(C=c, penalty=penalty, solver=solver, max_iter=max_iter)
logreg.fit(train_X, train_y)

print("Fit logistic regression on training data")

Logistic regression parameters: 
	C: 0.5 	Solver: saga 	Penalty: l1
Fit logistic regression on training data


In [46]:
# Evaluate
print("Accuracy: {}".format(logreg.score(val_X, val_y)))

# Save congruent/incongruent paths
# Use val_paths: it was loaded together with the validation predictions that val_y is derived
# from (and val_X is presumably built in the same paths['val'] order), so it lines up with the
# predictions row by row. This also avoids re-loading the label file from an absolute,
# machine-specific location and keeps the global per-split `paths` dict intact.
explainer_predictions = logreg.predict(val_X)
model_predictions = val_y

path_congruency = partition_paths_by_congruency(
    explainer_predictions=explainer_predictions,
    model_predictions=model_predictions,
    paths=val_paths
)

congruent_paths = path_congruency['congruent']
incongruent_paths = path_congruency['incongruent']
print("{} congruent an {} incongruent samples".format(len(congruent_paths), len(incongruent_paths)))

explainer_save_path = os.path.join(save_dir, 'baseline_explainer', 
'explainer_{}_{}_{}.pickle'.format(solver, penalty, c))
if os.path.exists(explainer_save_path):
    # Never overwrite an explainer (or its path lists) that was saved by an earlier run
    print("Path '{}' already exists".format(explainer_save_path))
else:
    explainer_dir = os.path.dirname(explainer_save_path)
    # Make sure the target directory is there before writing into it
    ensure_dir(explainer_dir)
    # Context manager closes the file handle even if pickling fails
    with open(explainer_save_path, 'wb') as explainer_file:
        pickle.dump(logreg, explainer_file)
    congruent_paths_path = os.path.join(explainer_dir, 'congruent_paths.txt')
    incongruent_paths_path = os.path.join(explainer_dir, 'incongruent_paths.txt')

    write_lists(congruent_paths, congruent_paths_path)
    write_lists(incongruent_paths, incongruent_paths_path)
    
    print("Saved explainer to {}".format(explainer_save_path))
    print("Saved congruent and incongruent paths to {} and {}".format(congruent_paths_path, incongruent_paths_path))

Accuracy: 0.5634849167041873


4442it [00:00, 1268111.79it/s]

2503 congruent an 1939 incongruent samples
Saved explainer to saved/ADE20K/0501_105640/baseline_explainer/explainer_saga_l1_0.5.pickle
Saved congruent and incongruent paths to saved/ADE20K/0501_105640/baseline_explainer/congruent_paths.txt and saved/ADE20K/0501_105640/baseline_explainer/incongruent_paths.txt





### Load 16-way scene category predictions from linear models trained to predict scene categories

In [13]:
# Identifiers of the linear scene-category classifiers whose saved predictions are explained
linear_ids = ['linear_layer', 'liblinear', 'saga']
# Template: first placeholder is the classifier id, second is the split name
restore_path_template = os.path.join('saved', 'PlacesCategoryClassification', '0510_102912',
    'ADE20K_predictions', '{}', '{}_outputs_predictions.pth')

# Loop variable is named linear_id (not `id`) so the builtin id() is not shadowed for the
# rest of the kernel session
for linear_id in linear_ids:
    cur_train_predictions = torch.load(restore_path_template.format(linear_id, 'train'))['predictions']
    cur_val_predictions = torch.load(restore_path_template.format(linear_id, 'val'))['predictions']

    print("Hyperparameter search for {}".format(linear_id))
    # L1 (liblinear) search from frequent attributes to this classifier's predictions
    explainer = hyperparam_search_l1(
        train_features=frequent_attributes['train'], 
        train_labels=cur_train_predictions, 
        val_features=frequent_attributes['val'], 
        val_labels=cur_val_predictions, 
        Cs = [0.05, 0.1, 0.5, 1, 3, 5, 7])

    

Hyperparameter search for linear_layer
Best accuracy: 0.5666366501575867 Regularization: 0.05
Best accuracy: 0.590274651058082 Regularization: 0.1
Best accuracy: 0.6046825754164791 Regularization: 0.5
Best accuracy: 0.6055830706888788 Regularization: 1
Best accuracy: 0.6060333183250788 Regularization: 3
Hyperparameter search for liblinear
Best accuracy: 0.5914002701485818 Regularization: 0.05
Best accuracy: 0.6141377757766772 Regularization: 0.1
Best accuracy: 0.6296713192255741 Regularization: 0.5
Best accuracy: 0.6303466906798739 Regularization: 1
Hyperparameter search for saga
Best accuracy: 0.5945520036019811 Regularization: 0.05
Best accuracy: 0.6114362899594777 Regularization: 0.1
Best accuracy: 0.6269698334083746 Regularization: 0.5
Best accuracy: 0.6278703286807744 Regularization: 1
Best accuracy: 0.6312471859522738 Regularization: 3
Best accuracy: 0.6314723097703737 Regularization: 5


In [22]:
# Create explainer methods for each 16-way scene classifier
linear_ids = ['linear_layer', 'liblinear', 'saga']
Cs = [3, 1, 5] # best regularization values found based on h-param search
# Template: first placeholder is the classifier id, second is the split name
restore_path_template = os.path.join('saved', 'PlacesCategoryClassification', '0510_102912',
    'ADE20K_predictions', '{}', '{}_outputs_predictions.pth')

solver = 'liblinear'
penalty = 'l1'

cur_train_features = frequent_attributes['train']
cur_val_features = frequent_attributes['val']

explainers = {}
for c, linear_id in zip(Cs, linear_ids):
    cur_train_predictions = torch.load(restore_path_template.format(linear_id, 'train'))['predictions']
    cur_val_predictions = torch.load(restore_path_template.format(linear_id, 'val'))['predictions']

    explainer = LogisticRegression(
        solver=solver,
        penalty=penalty,
        C=c
    )

    explainer.fit(cur_train_features, cur_train_predictions)
    accuracy = explainer.score(cur_val_features, cur_val_predictions)
    print("Accuracy for {}'s explainer: {}".format(linear_id, accuracy))

    # Save explainer, decision_function, probabilities, and predictions
    explainer_name = '{}_explainer_l1_{}'.format(linear_id, c)
    # Kept in its own variable: the global `save_dir` is the ADE20K run directory used by
    # other cells and must not be overwritten here
    explainer_save_dir = os.path.join(os.path.dirname(restore_path_template).format(linear_id), 'baseline_explainer')
    ensure_dir(explainer_save_dir)
    explainer_save_path = os.path.join(explainer_save_dir, '{}.pickle'.format(explainer_name))
    
    save_data = {
        'outputs': explainer.decision_function(cur_val_features),
        'probabilities': explainer.predict_proba(cur_val_features),
        'predictions': explainer.predict(cur_val_features)
    }
    data_save_path = os.path.join(explainer_save_dir, '{}_validation.pth'.format(explainer_name))

    if os.path.exists(explainer_save_path):
        # Existing pickle means both files are left untouched
        print("Explainer already saved to {}".format(explainer_save_path))
    else:
        # Context manager closes the file handle even if pickling fails
        with open(explainer_save_path, 'wb') as explainer_file:
            pickle.dump(explainer, explainer_file)
        torch.save(save_data, data_save_path)
        print("Saved explainer and outputs on validation set to {}".format(os.path.dirname(explainer_save_path)))

    explainers[linear_id] = {
        'explainer_model': explainer,
        'validation': save_data
    }

Accuracy for linear_layer's explainer: 0.6060333183250788
Explainer already saved to saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/linear_layer/baseline_explainer/linear_layer_explainer_l1_3.pickle
Accuracy for liblinear's explainer: 0.630121566861774
Explainer already saved to saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/liblinear/baseline_explainer/liblinear_explainer_l1_1.pickle
Accuracy for saga's explainer: 0.6314723097703737
Explainer already saved to saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/saga/baseline_explainer/saga_explainer_l1_5.pickle


#### Determine congruency

In [26]:
# def partition_paths_by_congruency(explainer_predictions,
#                                   model_predictions,
#                                   paths):
#     n_samples = len(paths)
#     assert len(explainer_predictions) == n_samples
#     assert len(model_predictions) == n_samples, "Length of model predictions {} doesn't match n_samples {}".format(
#         len(model_predictions), n_samples
#     )

#     incongruent_paths = []
#     congruent_paths = []

#     for explainer_prediction, model_prediction, path in tqdm(zip(
#         explainer_predictions, model_predictions, paths
#     )):
#         if explainer_prediction == model_prediction:
#             congruent_paths.append(path)
#         else:
#             incongruent_paths.append(path)
    
#     return {
#         'congruent': congruent_paths,
#         'incongruent': incongruent_paths
#     }



In [52]:
# Saved validation outputs of the two explainers and of the saga classifier they explain
cav_explainer = torch.load('saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/saga/scaled_cav_explainer/saga_explainer_l1_0.5_validation.pth')
baseline_explainer = torch.load('saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/saga/baseline_explainer/saga_explainer_l1_5_validation.pth')
model_outputs = torch.load('saved/PlacesCategoryClassification/0510_102912/ADE20K_predictions/saga/val_outputs_predictions.pth')

cav_predictions, baseline_predictions, model_predictions = (
    saved_outputs['predictions']
    for saved_outputs in (cav_explainer, baseline_explainer, model_outputs)
)

# Element-wise agreement masks
cav_matches_model = cav_predictions == model_predictions
baseline_matches_model = baseline_predictions == model_predictions
cav_matches_baseline = cav_predictions == baseline_predictions

# Fraction of validation samples on which each explainer reproduces the model's prediction
print("cav_explainer accuracy: {}".format(np.count_nonzero(cav_matches_model) / len(cav_predictions)))

print("base_explainer accuracy: {}".format(np.count_nonzero(baseline_matches_model) / len(baseline_predictions)))
# Fraction on which the CAV explainer agrees with both the model and the baseline explainer
print(np.count_nonzero(np.logical_and(cav_matches_model, cav_matches_baseline)) / len(cav_predictions))

cav_explainer accuracy: 0.6517334533993696
base_explainer accuracy: 0.6314723097703737
0.5335434488968933


### Obtain predictions of linear classifier on validation set (16 way classifier with only frequent attributes)

In [10]:
# Hyperparameters
c = 0.2
solver, penalty = 'saga', 'l1'

# Features: frequent attributes. Targets: scene category predictions.
train_X, val_X = frequent_attributes['train'], frequent_attributes['val']
train_y, val_y = category_predictions['train'], category_predictions['val']

# Build the classifier, report its configuration, and fit it on the training split
logreg = LogisticRegression(C=c, solver=solver, penalty=penalty)
print("Logistic regression parameters: \n\tC: {} \tSolver: {} \tPenalty: {}".format(
    c, solver, penalty))
logreg.fit(train_X, train_y)

print("Fit logistic regression on training data")

# Predict on the validation split and check that predictions, targets and paths line up
logreg_category_predictions = logreg.predict(val_X)

n_samples = len(val_y)
assert len(logreg_category_predictions) == n_samples
assert len(val_paths) == n_samples

# Accuracy as one minus the fraction of mismatched predictions
n_mismatched = np.count_nonzero(logreg_category_predictions != val_y)
print("Accuracy: {}".format(1 - n_mismatched / n_samples))



Logistic regression parameters: 
	C: 0.2 	Solver: saga 	Penalty: l1
Fit logistic regression on training data
Accuracy: 0.5612336785231877


In [20]:
# incongruent_paths = []
# congruent_paths = []
# baseline_save_dir = os.path.join(save_dir, 'baseline_explainer')
# ensure_dir(baseline_save_dir)
# param_string = '16_class_freq_attr'
# incongruent_save_path = os.path.join(baseline_save_dir, 'incongruent_paths_{}_{}_{}_{}.txt'.format(param_string, solver, penalty, c))
# congruent_save_path = os.path.join(baseline_save_dir, 'congruent_paths_{}_{}_{}_{}.txt'.format(param_string, solver, penalty, c))

# for logreg_pred, model_pred, image_path in tqdm(zip(logreg_category_predictions, val_y, val_paths)):
#     if logreg_pred != model_pred:
#         incongruent_paths.append(image_path)
#     else:
#         congruent_paths.append(image_path)

# for save_path, paths_list in zip([incongruent_save_path, congruent_save_path], [incongruent_paths, congruent_paths]):
#     if os.path.exists(save_path):
#         print("{} already exists".format(save_path))
#     else:
#         write_lists(save_path, paths_list)
#         print("Saved {} paths to {}".format(len(paths_list), save_path))


#### Obtain the pre-normalized output and normalized probabilities from the logistic regressor on validation

In [11]:
# Raw decision-function scores of the fitted classifier on the validation features
outputs = logreg.decision_function(val_X)
# Class probabilities for the same validation features
output_probabilities = logreg.predict_proba(val_X)
# Show both for the first validation sample, plus the sum of its probabilities as a sanity check
print(outputs[0], output_probabilities[0], np.sum(output_probabilities[0]))

[ -6.21541755  -9.57385951 -11.01464339  -8.19257013  -8.35659567
  -2.3154842   -5.336674    -7.39402475  -2.75187164  -5.73843319
  -3.09532866   0.10664419  -1.61541948  -4.4823798   -1.89333494
  -2.06268626] [1.73180361e-03 6.03646460e-05 1.42916118e-05 2.40203346e-04
 2.03873998e-04 7.80187178e-02 4.15828632e-03 5.33624900e-04
 5.20836190e-02 2.78689906e-03 3.75992050e-02 4.57296199e-01
 1.44002514e-01 9.70804106e-03 1.13634363e-01 9.79279926e-02] 1.0


(4442,)
1103
4437
4434
[ 8 15 15  2  2] [209 112 125  52  45]
