In [1]:
# This notebook combines the predictions of a set of models
# by training a regression model on the outputs of those
# models.
# Runs whose saved outputs are combined; they share one prefix and differ
# only in the trailing index.
run_prefix = 'tagging/7_11_17/dense_ensemble2'
run_names = ['{}/{}'.format(run_prefix, run_ind) for run_ind in range(5)]

# Names of the per-run output files (probabilities and predictions)
# for the training and validation splits.
train_probs_fn, train_preds_fn = 'train_probs.npy', 'train_preds.csv'
val_probs_fn, val_preds_fn = 'validation_probs.npy', 'validation_preds.csv'

# S3 bucket the run results are downloaded from.
s3_bucket = 'raster-vision'

In [2]:
from os.path import join

import boto3
from botocore.exceptions import ClientError

from rastervision.common.settings import results_path
from rastervision.common.utils import _makedirs

def s3_download(run_name, file_name, new_file_name=None):
    """Download one result file of a run from S3 into the local results tree.

    The object is read from the key ``results/<run_name>/<file_name>`` in the
    bucket named by the notebook-level ``s3_bucket`` and written to
    ``/opt/data/results/<run_name>/<new_file_name>``.

    Args:
        run_name: run identifier, used both in the S3 key and the local path.
        file_name: name of the file under the run's S3 prefix.
        new_file_name: name to give the local copy; defaults to ``file_name``.
    """
    import os

    if new_file_name is None:
        new_file_name = file_name
    s3_key = 'results/{}/{}'.format(run_name, file_name)
    run_path = join('/opt/data/results/', run_name, new_file_name)

    # download_file does not create missing parent directories, so make sure
    # the run's local directory exists before writing into it.
    run_dir = os.path.dirname(run_path)
    if not os.path.isdir(run_dir):
        os.makedirs(run_dir)

    s3 = boto3.resource('s3')
    s3.Bucket(s3_bucket).download_file(s3_key, run_path)
    
def download_run(run_name):
    """Download the train and validation probability/prediction files of a run.

    Files are fetched in the order: train probs, train preds, validation
    probs, validation preds.
    """
    wanted_files = (train_probs_fn, train_preds_fn, val_probs_fn, val_preds_fn)
    for wanted_fn in wanted_files:
        s3_download(run_name, wanted_fn)

In [3]:
# Download the saved outputs of every run, printing each run name as its
# download starts.
for run_name in run_names:
    print(run_name)
    download_run(run_name)

tagging/7_11_17/dense_ensemble2/0
tagging/7_11_17/dense_ensemble2/1
tagging/7_11_17/dense_ensemble2/2
tagging/7_11_17/dense_ensemble2/3
tagging/7_11_17/dense_ensemble2/4


In [4]:
import numpy as np

# Load the train and validation probability arrays of every run from the
# local results directory, keeping the runs in `run_names` order.
train_probs, val_probs = [], []
for run_name in run_names:
    run_path = join('/opt/data/results/', run_name)
    train_probs.append(np.load(join(run_path, train_probs_fn)))
    val_probs.append(np.load(join(run_path, val_probs_fn)))

# Join the per-run arrays along axis 1, so each run contributes its own
# group of columns, then report the resulting shapes.
all_train_probs = np.concatenate(train_probs, axis=1)
all_val_probs = np.concatenate(val_probs, axis=1)
for stacked_probs in (all_train_probs, all_val_probs):
    print(stacked_probs.shape)

(32383, 85)
(8096, 85)


In [5]:
from os.path import splitext, basename
import glob


def generate_file_inds(path, file_extension='jpg'):
    """List the file indices (base names without extension) found in a directory.

    Args:
        path: directory to search (not recursive).
        file_extension: extension, without the leading dot, that files must
            have to be included. Defaults to 'jpg'.

    Returns:
        A list of base names with the extension stripped, ordered by the
        sorted full paths of the matching files. Empty if nothing matches.
    """
    pattern = join(path, '*.{}'.format(file_extension))
    # Use a distinct name for each match so the `path` argument is not
    # overwritten while iterating.
    return [
        splitext(basename(file_path))[0]
        for file_path in sorted(glob.glob(pattern))
    ]

# Split the sorted file indices: the first 80% form the training set and the
# remainder the validation set.
train_jpg_dir = '/opt/data/datasets/planet_kaggle/train-jpg'
dev_file_inds = generate_file_inds(train_jpg_dir)
nb_train_inds = int(len(dev_file_inds) * 0.8)
train_file_inds = dev_file_inds[:nb_train_inds]
val_file_inds = dev_file_inds[nb_train_inds:]

In [6]:
from rastervision.tagging.data.planet_kaggle import TagStore
from pandas import read_csv

# Ground-truth tags are read from the dataset's CSV through TagStore, then
# looked up separately for the train and validation file indices.
gt_csv_path = '/opt/data/datasets/planet_kaggle/train_v2.csv'
gt_tag_store = TagStore(gt_csv_path)

gt_train_preds, gt_val_preds = [
    gt_tag_store.get_tag_array(split_file_inds)
    for split_file_inds in (train_file_inds, val_file_inds)
]

In [7]:
import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

In [8]:
ntags = 17

# Fit one logistic regression per tag; each model sees the concatenated
# probabilities of all runs as features and that tag's ground-truth column
# as the target.
models = []
for tag_ind in range(ntags):
    tag_model = LogisticRegression(verbose=10)
    tag_model.fit(all_train_probs, gt_train_preds[:, tag_ind])
    # Alternative kept for reference (XGBoost with early stopping on the validation set):
    #tag_model = XGBClassifier()
    #tag_model.fit(all_train_probs, gt_train_preds[:, tag_ind], eval_set=[(all_val_probs, gt_val_preds[:, tag_ind])], eval_metric='error', early_stopping_rounds=10, verbose=True)
    models.append(tag_model)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [9]:
# For every tag, take column 1 of that tag's model's predict_proba output on
# the validation features, and place the columns side by side.
val_preds = np.column_stack([
    models[tag_ind].predict_proba(all_val_probs)[:, 1]
    for tag_ind in range(ntags)
])

In [10]:
# One threshold per entry, 17 in total.
# NOTE(review): presumably one per tag (matches ntags = 17) - confirm the
# intended tag order before using.
thresholds = [
    0.20, 0.66, 0.33, 0.39, 0.30, 0.20,
    0.09, 0.60, 0.24, 0.18, 0.20, 0.24,
    0.21, 0.20, 0.45, 0.18, 0.21,
]

In [11]:
from sklearn.metrics import fbeta_score

# Binarize the stacked-model probabilities with a single cutoff applied to
# every tag, then score against the ground truth with sample-averaged F2.
cutoff = 0.11
#cutoff = 0.15
our_val_preds = val_preds > cutoff
f2 = fbeta_score(gt_val_preds, our_val_preds, beta=2, average='samples')
print(f2)

0.93100344671
