In [8]:
import seaborn as sns
import itertools
import tensorflow as tf
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import cPickle as pickle
import os
import seaborn as sns
import sys
from IPython.display import Image, display
from sklearn import metrics

sys.path.insert(0, '/Users/angela/src/private/bias-in-datasets/active_learning/src')
from Plotter import format_plot, format_plot_2ys, write_file


sns.set_style("white")
sns.set_palette("husl")

class Config:
    MATCHER = ("^(.*)_(.*)_(.*)_(.*)_(\d+)_(\d*\.?\d*)_(\d*\.?\d*)(_trial\d+)?(_epoch\d+)?")

    def __init__(self, filename):
        groups = self.matches(filename)
        self.strategy = groups[0]
        self.dataset = groups[1]
        self.network = groups[2]
        self.top_k = float(groups[3])
        self.pool_size = int(groups[4])
        self.lr = float(groups[5])
        self.decay = float(groups[6])
        self.trial = None
        self.epoch = None
        if groups[7]:
            unparsed_trial = groups[7]
            self.trial = int(unparsed_trial.strip("_trial"))
        if groups[8]:
            unparsed_epoch = groups[8]
            self.epoch = int(unparsed_epoch.strip("_epoch"))


    def matches(self, filename):
        import re
        return (re.match(Config.MATCHER, filename)).groups()

    @property
    def label(self):
        if self.strategy == "topk":
            label = "{}, {}, top_{}/{}, lr={}".format(self.strategy,
                                                      self.network,
                                                      self.top_k,
                                                      self.pool_size,
                                                      self.lr)
        elif self.strategy == "sampling" or self.strategy == "deterministic":
            label = "{}, {}, {}, {}, lr={}".format(self.strategy,
                                                   self.network,
                                                   self.top_k,
                                                   self.pool_size,
                                                   self.lr)                                               
        if self.trial:
            label += ", trial-{}".format(self.trial)
        if self.epoch:
            label += ", epoch-{}".format(self.epoch)
        return label

def get_percentiles(d, num_percentiles):
    xs = np.arange(0, 100, 100./num_percentiles)
    ys = [np.percentile(d, x) for x in xs]
    return xs, ys
            

In [9]:
def write_percentiles(experiment_dir, data_dir, max_epoch=None, trials=None, num_percentiles=100):
pickles_dir = os.path.join(experiment_dir, "pickles")
target_confidences_dir = os.path.join(pickles_dir, "target_confidences")

print("In plot_confidence_distributions for {}".format(data_dir))

for filename in os.listdir(target_confidences_dir):
    config = Config(filename)
    if trials:
        if config.trial not in trials:
            continue
    pickles_file = os.path.join(target_confidences_dir, filename)

    subdata_dir = os.path.join(data_dir, "data")
    if not os.path.isdir(subdata_dir):
        os.mkdir(subdata_dir)

    with open(pickles_file, 'rb') as handle:
        print(pickles_file)
        d = pickle.load(handle)
        for epoch, data in d.iteritems():
            if max_epoch is not None:
                if epoch > max_epoch:
                    break

            probs = [int(c*100) for c in data["confidences"]]
            results = [int(c) for c in data["results"]]

            num_backpropped = data["num_backpropped"]


            data_filename = os.path.join(subdata_dir, "{}_epoch{}_{}backpropped.txt".format(filename,
                                                                   epoch,
                                                                   num_backpropped))

            xs, ys = get_percentiles(probs, num_percentiles)

            # Get percent correct for each percentile
            ys2 = []
            for confidence in ys:
                indices = [i for i, p in enumerate(probs) if p <= confidence]
                vals = [results[i] for i in indices]
                percent_correct = sum(vals) / float(len(vals)) * 100
                ys2.append(percent_correct)

            print("Dst File: {}".format(data_filename))
            with open(data_filename, "w+") as f:
                for x, y1, y2 in zip(xs, ys, ys2):
                    line = "{},{},{}\n".format(x, y1, y2)
                    f.write(line)

IndentationError: expected an indented block (<ipython-input-9-809477c707c9>, line 2)

In [5]:
def plot_files(plot_dir, files, labels):
    subplot_dir = os.path.join(plot_dir, "target_confidences")
    
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()

    for fname, label in zip(files, labels):
        xs = []
        ys = []
        ys2 = []
        with open(fname) as f:
            for line in f:
                vals = line.rstrip().split(",")
                x = float(vals[0])
                y = float(vals[1])
                y2 = float(vals[2])
                xs.append(x)
                ys.append(y)
                ys2.append(y2)
        
        label1 = label + ": Target Confidence"
        label2 = label + ": Percent Correct, AUC: {:.2f}".format(metrics.auc(xs, ys2))
        
        ax1.plot(xs, ys, '-', label=label1)
        ax2.plot(xs, ys2, '.', label=label2)
        

    format_plot_2ys(ax1, ax2, "Percentile", "Target confidence %", "% Correct", legend_scale = 1.1)
    
    write_file(subplot_dir)
    

In [None]:
# Write target confidence percentiles to output

plot_home_dir = "../plots/batch_stats"

experiment_batch_name = "190108_confidences"
plot_dir = "{}/{}".format(plot_home_dir, experiment_batch_name)
if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)
        
experiment_names = ["190108_confidences"]
for experiment_name in experiment_names:
    experiment_dir = "../data/output/cifar10/{}".format(experiment_name)
    
    write_percentiles(experiment_dir, plot_dir, trials=[1])

In [6]:
plot_home_dir = "../plots/batch_stats"

exp_name = "190108_confidences"
plot_dir = "{}/{}".format(plot_home_dir, exp_name)
if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

prefix = "../plots/batch_stats/190108_confidences/data/"
files = ["{}/{}".format(prefix, "deterministic_cifar10_mobilenet_0.1_128_0.0_0.0005_trial1_seed1337_target_confidences.pickle_epoch310_4694815backpropped.txt"),
         "{}/{}".format(prefix, "deterministic_cifar10_mobilenet_1_128_0.0_0.0005_trial1_seed1337_target_confidences.pickle_epoch100_5000000backpropped.txt")]
labels = ["SB: 4694815 backprops",
          "Baseline: 5000000 backprops"]
plot_files(plot_dir, files, labels)

[(0.0, 0.0), (1.0, 2.0), (2.0, 4.0), (3.0, 7.0), (4.0, 10.0), (5.0, 13.0), (6.0, 16.0), (7.0, 19.0), (8.0, 21.0), (9.0, 24.0), (10.0, 26.0), (11.0, 28.0), (12.0, 30.88), (13.0, 32.0), (14.0, 34.0), (15.0, 36.0), (16.0, 38.0), (17.0, 39.0), (18.0, 41.0), (19.0, 42.0), (20.0, 44.0), (21.0, 46.0), (22.0, 47.0), (23.0, 48.0), (24.0, 49.0), (25.0, 51.0), (26.0, 52.0), (27.0, 53.0), (28.0, 55.0), (29.0, 56.0), (30.0, 57.0), (31.0, 58.0), (32.0, 59.0), (33.0, 60.0), (34.0, 61.0), (35.0, 62.0), (36.0, 63.0), (37.0, 64.0), (38.0, 65.0), (39.0, 66.0), (40.0, 67.0), (41.0, 68.0), (42.0, 69.0), (43.0, 70.0), (44.0, 71.0), (45.0, 72.0), (46.0, 73.0), (47.0, 74.0), (48.0, 74.0), (49.0, 75.0), (50.0, 76.0), (51.0, 77.0), (52.0, 77.0), (53.0, 78.0), (54.0, 79.0), (55.0, 80.0), (56.0, 80.0), (57.0, 81.0), (58.0, 82.0), (59.0, 82.0), (60.0, 83.0), (61.0, 84.0), (62.0, 84.0), (63.0, 85.0), (64.0, 86.0), (65.0, 86.0), (66.0, 87.0), (67.0, 87.0), (68.0, 88.0), (69.0, 88.0), (70.0, 89.0), (71.0, 89.0), (72.

<Figure size 432x288 with 0 Axes>