In [1]:
# This notebook was used to download the output of runs 
# and then create a weighted ensemble using the  
# optimal weights.
from os.path import join

import boto3
from botocore.exceptions import ClientError

from rastervision.common.settings import results_path
from rastervision.common.utils import _makedirs

# File names of the per-run artifacts; used both as the S3 object name and as
# the local file name inside each run's results directory.
test_pred_fn = 'test_preds.csv'
scores_fn = 'scores.json'
# Name of the S3 bucket that s3_download reads from.
s3_bucket = 'raster-vision'
# Directory the final ensemble prediction CSV is saved into.
# NOTE(review): nothing visible in this notebook creates this directory -
# confirm it exists before running the last cell.
out_path = '/opt/data/results/jupyter_out/'

In [2]:
def s3_download(run_name, file_name, new_file_name=None):
    """Download one file belonging to a run from S3 to the local results dir.

    The object stored under the key ``results/<run_name>/<file_name>`` in the
    bucket named by the module-level ``s3_bucket`` is written to
    ``/opt/data/results/<run_name>/<new_file_name>``.

    Args:
        run_name: run identifier, used in both the S3 key and the local path.
        file_name: name of the file under the run's S3 prefix.
        new_file_name: name to save the file as locally; when None the
            original ``file_name`` is kept.
    """
    local_name = file_name if new_file_name is None else new_file_name
    remote_key = 'results/{}/{}'.format(run_name, file_name)
    local_path = join('/opt/data/results/', run_name, local_name)
    bucket = boto3.resource('s3').Bucket(s3_bucket)
    bucket.download_file(remote_key, local_path)
    
def download_run(run_name):
    """Download the prediction and score files of one run from S3.

    Creates the run's local directory under ``/opt/data/results/`` and then
    fetches the test predictions, the scores file and the validation/train
    predictions via ``s3_download``.

    Args:
        run_name: run identifier (also the path of the run below the results
            directory and the S3 ``results/`` prefix).

    Raises:
        ClientError: if the fallback test-prediction download or any of the
            remaining downloads is rejected by S3.
    """
    _makedirs(join('/opt/data/results/', run_name))
    try:
        s3_download(run_name, test_pred_fn)
    except ClientError:
        # Only an S3 client error (e.g. the key does not exist) triggers the
        # fallback to the alternative object name; the file is still saved
        # locally as test_pred_fn. Any other exception (KeyboardInterrupt,
        # programming errors, ...) now propagates instead of being swallowed.
        s3_download(run_name, 'test_predictions.csv', test_pred_fn)

    for file_name in (scores_fn, 'validation_preds.csv', 'train_preds.csv'):
        s3_download(run_name, file_name)

In [23]:
# Use 80% of dev set
# Five runs (numbered 0-4) for each of three run groups, in group order.
run_names = [
    '{}/{}'.format(group_prefix, run_ind)
    for group_prefix in (
        'tagging/7_5_17/ensemble',
        'tagging/7_18_17/80/resnet',
        'tagging/7_18_17/80/inception',
    )
    for run_ind in range(5)
]

tagging/7_5_17/ensemble/0
tagging/7_5_17/ensemble/1
tagging/7_5_17/ensemble/2
tagging/7_5_17/ensemble/3
tagging/7_5_17/ensemble/4
tagging/7_18_17/80/resnet/0
tagging/7_18_17/80/resnet/1
tagging/7_18_17/80/resnet/2
tagging/7_18_17/80/resnet/3
tagging/7_18_17/80/resnet/4
tagging/7_18_17/80/inception/0
tagging/7_18_17/80/inception/1
tagging/7_18_17/80/inception/2
tagging/7_18_17/80/inception/3
tagging/7_18_17/80/inception/4


In [None]:
# Fetch the prediction and score files of every listed run from S3 into the
# local results directory, printing each run name as a progress indicator.
for run_name in run_names:
    print(run_name)
    download_run(run_name)

In [24]:
from os.path import splitext, basename
import glob

def generate_file_inds(path, ext='jpg'):
    """List the base names (without extension) of the files in a directory.

    Args:
        path: directory to search (non-recursive).
        ext: extension to match, without the leading dot. Defaults to
            'jpg', which is what the original hard-coded pattern matched.

    Returns:
        A list of file names with the directory and extension stripped,
        ordered by the lexicographic sort of the matched paths. Empty when
        nothing matches.
    """
    file_paths = sorted(glob.glob(join(path, '*.{}'.format(ext))))
    # A distinct loop name avoids shadowing the `path` argument.
    return [splitext(basename(file_path))[0] for file_path in file_paths]

# Ids of all images in the training ("dev") folder, in sorted file-name order.
dev_file_inds = generate_file_inds('/opt/data/datasets/planet_kaggle/train-jpg')
# The first 80% of the sorted ids are treated as the train split and the
# remainder as the validation split.
# NOTE(review): presumably this mirrors the split the "80" runs were trained
# with - confirm against the training code.
nb_train_inds = int(len(dev_file_inds) * 0.8)
train_file_inds = dev_file_inds[0:nb_train_inds]
val_file_inds = dev_file_inds[nb_train_inds:]
# Ids of all images in the test folder.
test_file_inds = generate_file_inds('/opt/data/datasets/planet_kaggle/test-jpg')

In [25]:
from rastervision.tagging.data.planet_kaggle import TagStore

# NOTE: this rebinds `results_path`, replacing the value imported from
# rastervision.common.settings in the first cell.
results_path = '/opt/data/results/'
gt_csv_path = '/opt/data/datasets/planet_kaggle/train_v2.csv'

# One TagStore per run for its validation and its train predictions, kept in
# the same order as `run_names`.
val_tag_stores, train_tag_stores = [], []
for run_name in run_names:
    run_dir = join(results_path, run_name)
    val_tag_stores.append(TagStore(join(run_dir, 'validation_preds.csv')))
    train_tag_stores.append(TagStore(join(run_dir, 'train_preds.csv')))

# Ground-truth tags loaded from the labels CSV.
gt_tag_store = TagStore(gt_csv_path)

In [27]:
import numpy as np
from sklearn.metrics import fbeta_score

# Collect each run's tag array for the validation and train ids, then stack
# them along a new last axis so that axis 2 indexes the run.
run_val_arrays = []
run_train_arrays = []
for run_ind, run_name in enumerate(run_names):
    print(run_name)
    run_val_arrays.append(
        val_tag_stores[run_ind].get_tag_array(val_file_inds))
    run_train_arrays.append(
        train_tag_stores[run_ind].get_tag_array(train_file_inds))

val_preds = np.stack(run_val_arrays, axis=2)
print(val_preds.shape)

train_preds = np.stack(run_train_arrays, axis=2)
print(train_preds.shape)

tagging/7_5_17/ensemble/0
tagging/7_5_17/ensemble/1
tagging/7_5_17/ensemble/2
tagging/7_5_17/ensemble/3
tagging/7_5_17/ensemble/4
tagging/7_18_17/80/resnet/0
tagging/7_18_17/80/resnet/1
tagging/7_18_17/80/resnet/2
tagging/7_18_17/80/resnet/3
tagging/7_18_17/80/resnet/4
tagging/7_18_17/80/inception/0
tagging/7_18_17/80/inception/1
tagging/7_18_17/80/inception/2
tagging/7_18_17/80/inception/3
tagging/7_18_17/80/inception/4
(8096, 17, 15)
(32383, 17, 15)


In [29]:
import math

def get_f2(w, is_train=True):
    """Score the weighted-vote ensemble with the sample-averaged F2 metric.

    Every run's predictions are multiplied by that run's weight and summed
    over the run axis (axis 2). A tag counts as predicted when the sum is at
    least ``ceil(sum(w) / 2)``.

    Args:
        w: sequence of per-run weights, broadcast along axis 2 of the stacked
            prediction array.
        is_train: when True, score ``train_preds`` on ``train_file_inds``;
            otherwise score ``val_preds`` on ``val_file_inds``.

    Returns:
        The result of ``fbeta_score(..., beta=2, average='samples')`` between
        the ground-truth tags and the ensemble's thresholded predictions.
    """
    # Both splits are scored identically; only the inputs differ.
    if is_train:
        stacked_preds, file_inds = train_preds, train_file_inds
    else:
        stacked_preds, file_inds = val_preds, val_file_inds

    weighted_votes = np.sum(stacked_preds * [[w]], axis=2)
    vote_thresh = math.ceil(np.sum(w) / 2)
    ensemble_preds = weighted_votes >= vote_thresh
    gt_preds = gt_tag_store.get_tag_array(file_inds)
    return fbeta_score(
        gt_preds, ensemble_preds, beta=2, average='samples')

import itertools
# Exhaustive search over one integer weight (0-9) per group of five runs,
# scored on the validation split.
possible_weights = list(range(0, 10))
weights = []
f2_scores = []
for group_weights in itertools.product(possible_weights, repeat=3):
    # Expand the three group weights to one weight per run (five per group).
    w = [group_weight for group_weight in group_weights for _ in range(5)]
    f2_scores.append(get_f2(w, is_train=False))
    weights.append(w)

# Print every combination from lowest to highest score.
results = sorted(zip(f2_scores, weights), key=lambda pair: pair[0])
for score, run_weights in results:
    print('{:.5f} {}'.format(score, run_weights))

0.48360 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8]
0.93035 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9]
0.93035 [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6]
0.93035 [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7]
0.93035 [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8]
0.93035 [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9]
0.93035 [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6]
0.93035 [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7]
0.93035 [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8]
0.93035 [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9]
0.93105 [0, 0, 0, 0, 0, 1, 1

In [30]:
def make_test_preds(run_names, weights, out_path):
    """Save the weighted-vote ensemble's predictions for the test images.

    For each run, the test predictions are read from
    ``<results_path>/<run_name>/<test_pred_fn>`` for ``test_file_inds``. The
    per-run arrays are combined along a new last axis, multiplied by the
    weights and summed; a tag is kept when the sum is at least
    ``ceil(sum(weights) / 2)``. The resulting tags are added to a new
    TagStore, which is saved to ``out_path``.

    Args:
        run_names: runs whose test predictions are combined.
        weights: sequence of per-run weights, in the same order as
            ``run_names``.
        out_path: path handed to ``TagStore.save``.
    """
    per_run_preds = [
        np.expand_dims(
            TagStore(join(results_path, run_name, test_pred_fn))
            .get_tag_array(test_file_inds),
            axis=2)
        for run_name in run_names
    ]

    stacked_preds = np.concatenate(per_run_preds, axis=2)
    weighted_votes = np.sum(stacked_preds * [[weights]], axis=2)
    vote_thresh = math.ceil(np.sum(weights) / 2)
    our_test_preds = weighted_votes >= vote_thresh

    tag_store = TagStore()
    for row_ind, row_tags in enumerate(our_test_preds):
        tag_store.add_tags(test_file_inds[row_ind], row_tags)

    tag_store.save(out_path)

In [31]:
# One weight per run: 8 for the first five runs, 7 for the next five and 5
# for the last five.
# NOTE(review): the grid search above ordered its groups as ensemble, resnet,
# inception, whereas the list below is ordered inception, resnet, densenet.
# Confirm that best_weights is ordered to match the list below.
best_weights = [8] * 5 + [7] * 5 + [5] * 5
# Use models trained on 99% of dev set for final predictions
# NOTE: this rebinds `run_names`; re-running earlier cells afterwards will
# operate on these runs instead of the 80% runs.
run_names = [
    '{}/{}'.format(group_prefix, run_ind)
    for group_prefix in (
        'tagging/7_19_17/99/inception',
        'tagging/7_16_17/resnet',
        'tagging/7_16_17/densenet',
    )
    for run_ind in range(5)
]
ensemble_csv_path = join(out_path, 'weighted_5rn_5dn_5in_99.csv')
make_test_preds(run_names, best_weights, ensemble_csv_path)