In [1]:
import numpy as np
import pandas as pd
from __future__ import print_function
import time, os
from scipy.stats.mstats import gmean, hmean

def deca_load(filename):
    """Load ``data_raw/<filename>.csv`` as a DataFrame, with an HDF5 cache.

    If ``data_hdf/_<filename>.h5`` already exists it is read directly.
    Otherwise the CSV is parsed, returned, and also written to that HDF5
    path so the next call can skip CSV parsing.

    Parameters
    ----------
    filename : str
        Base name of the dataset, without directory or extension.

    Returns
    -------
    The object produced by ``pd.read_hdf`` (cache hit) or ``pd.read_csv``
    (cache miss).

    Raises
    ------
    OSError
        If the ``data_hdf`` cache directory cannot be created, e.g. because
        a regular file with that name is in the way.
    """
    tic = time.time()
    csv_path = 'data_raw/{}.csv'.format(filename)
    hdf_path = 'data_hdf/_{}.h5'.format(filename)

    if os.path.exists(hdf_path):
        print('loading {}... '.format(hdf_path), end='')
        result = pd.read_hdf(hdf_path)
        print("OK ({0:.1f}s)".format(time.time() - tic))
        return result

    # Create the cache directory up front. "Already exists" is the only
    # failure that is safe to ignore; anything else (permissions, a file
    # squatting on the name, ...) is re-raised instead of being swallowed.
    cache_dir = os.path.dirname(hdf_path)
    try:
        os.makedirs(cache_dir)
    except OSError:
        if not os.path.isdir(cache_dir):
            raise

    print('loading {}... '.format(csv_path), end='')
    result = pd.read_csv(csv_path)
    print("OK ({0:.1f}s)".format(time.time() - tic))
    # `key` is passed by keyword: newer pandas releases no longer accept it
    # positionally, while older ones accept both spellings.
    result.to_hdf(hdf_path, key='fixed', mode='w', complib='blosc', complevel=9)
    print('Saved {} for fast future loading.'.format(hdf_path))
    return result

In [2]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``; elementwise for array input."""
    denominator = 1. + np.exp(-x)
    return 1. / denominator

def inverse_sigmoid(x):
    """Logit ``log(x / (1 - x))``; elementwise for array input."""
    odds = x / (1. - x)
    return np.log(odds)

In [3]:
def calc_loss(y, yp, fn_weight=5., eps=1e-7):
    """Weighted binary log loss, normalised by the length of ``y``.

    The ``y * log(yp)`` term (the contribution of positive labels) is
    multiplied by ``fn_weight``; the ``(1 - y) * log(1 - yp)`` term has
    weight 1. With the defaults this is exactly the original
    ``sum(-(5*loss_fn + loss_fp)) / y.shape[0]``.

    Parameters
    ----------
    y : array-like
        Labels. Objects without a ``shape`` attribute (e.g. plain lists) are
        converted with ``np.asarray``; anything else is used unchanged.
    yp : array-like
        Predicted probabilities; clipped to ``[eps, 1 - eps]`` before the
        logarithms are taken so that 0 and 1 do not produce infinities.
    fn_weight : float, optional
        Multiplier applied to the positive-label term (default 5).
    eps : float, optional
        Clipping margin for ``yp`` (default 1e-7).

    Returns
    -------
    The summed weighted loss divided by ``y.shape[0]``.
    """
    # Only coerce inputs that cannot do array arithmetic themselves, so
    # ndarray (and other array-like) callers see unchanged behaviour.
    if not hasattr(y, 'shape'):
        y = np.asarray(y)
    yp = np.clip(yp, eps, 1 - eps)
    loss_fn = y * np.log(yp)
    loss_fp = (1 - y) * np.log(1 - yp)
    return np.sum(-(fn_weight * loss_fn + loss_fp)) / y.shape[0]

In [5]:
# PREDS_PATH = '/tmp/allsubs/'
# title_preds = filter(lambda x: x.endswith('txt'), os.listdir(PREDS_PATH))

In [8]:
# %%time
# all_preds = []
# for i, c in enumerate(title_preds):
#     if 'predsa' in c and len(c) == len('predsae.txt'):
#         os.remove(os.path.join(PREDS_PATH, c))
#         continue
#     if 'pred_one' in c:
#         os.remove(os.path.join(PREDS_PATH, c))
#         continue

CPU times: user 4 ms, sys: 236 ms, total: 240 ms
Wall time: 394 ms


# TEST

In [4]:
# Directory that is scanned for prediction files.
PREDS_PATH = '/tmp/allsubs/'
# Materialise the matches as a real list: on Python 3 `filter` returns a lazy,
# single-pass iterator, which breaks `len(title_preds)` in the next cell and
# any second pass over the names.
title_preds = list(filter(lambda x: x.endswith('txt'), os.listdir(PREDS_PATH)))

In [5]:
# Number of entries in PREDS_PATH whose name ends in 'txt' (before any skipping).
len(title_preds)

171

In [6]:
%%time
# Read every prediction file into `all_preds`, one row per file. Files whose
# name contains '2iter' are appended three times in total, so they carry
# triple weight in any later row-wise average.
all_preds = []
for i, c in enumerate(title_preds):
    # Skip names containing 'predsa' that are exactly as long as 'predsae.txt'.
    if 'predsa' in c and len(c) == len('predsae.txt'):
        print('skip', c)
        continue
    # Skip any name containing 'pred_one'.
    if 'pred_one' in c:
        print('skip', c)
        continue
    with open(os.path.join(PREDS_PATH, c)) as fi:
        # Parse eagerly while the file is open. A lazy `map(float, ...)` is
        # an iterator on Python 3: it cannot be stacked by numpy, and
        # appending the same iterator three times would not triple anything.
        preds = np.array([float(line) for line in fi], dtype=np.float64)
    all_preds.append(preds)
    if ('2iter' in c):
        all_preds.append(preds)
        all_preds.append(preds)
        print(i, c, 'TRIPLE WGT')
#     elif (('2epo' in c) and ('nn' not in c) and ('NN' not in c) and ('39_' not in c)):
#         all_preds.append(preds)
#         print(i, c, 'DOUBLE WGT')
    else:
        print(i, c)

0 preds_xg_ng2iter_NN_5.txt TRIPLE WGT
1 20_ultim_nine3.txt
2 ultim_one_IT1_k4_9.txt
3 60_ultim_fifteen5_1epo.txt
4 test_preds_xgb_7TWO.txt
5 18_ultim_nine1.txt
6 11298_ultim_thirtytwo4_2epo_nn.txt
7 64_ultim_fifteen5_2epo_nn.txt
8 25_ultim_SIX1.txt
skip predsag.txt
10 test_preds_xgb_4ONE.txt
skip pred_one_#predsam.txt
skip predsae.txt
13 39_ultim_twelve5_2epo.txt
14 99992_ultim_twentyone1_2epo_nn.txt
15 17_ultim_two2.txt
16 87998_ultim_twentytwo1_2epo_k4.txt
skip pred_one_#predsal.txt
18 ultim_one_IT_k4_.txt
19 16_ultim_two2.txt
skip pred_one_#predsad.txt
21 preds_xg_ng2iter_k4_4.txt TRIPLE WGT
22 37_ultim_twelve4_2epo.txt
23 preds_lg_ng1iter_k5_1.txt
24 63_ultim_fifteen5_2epo_nn.txt
25 preds_lg_ng1iter_k4_1.txt
26 19_ultim_nine1.txt
27 preds_xg_ng2iter_k4_2.txt TRIPLE WGT
28 66298_ultim_thirtytwo4_2epo_k5.txt
29 67997_ultim_twentyseven3_2epo_k4.txt
30 preds_xg_ng2iter_NN_4.txt TRIPLE WGT
31 83_ultim_nineteen6_2epo.txt
32 61_ultim_fifteen5_1epo.txt
33 49_ultim_fifteen5_2epo.txt
34 679

In [7]:
# Row count after weighting: one row per loaded file plus two extra copies
# for every '2iter' file.
len(all_preds)

190

In [11]:
# Geometric mean down the stacked rows (axis=0), i.e. one blended value per
# position across all rows of `all_preds`.
# NOTE(review): this cell's execution count (In [11]) is later than the cell
# that writes the output file (In [9]), and in a top-to-bottom run the
# arithmetic-mean cell that follows rebinds `ensemble_preds` -- confirm
# whether this cell is still needed.
ensemble_preds = gmean(np.vstack(all_preds), axis=0)

In [17]:
# Arithmetic mean down the stacked rows (axis=0). Rebinds `ensemble_preds`,
# replacing whatever an earlier cell assigned to it.
ensemble_preds = np.mean(np.vstack(all_preds), axis=0)

In [12]:
# Add a constant offset of .0176 to every value, then clamp to [1e-7, 1-1e-6].
# The cell reads and rebinds `ensemble_preds`, so it is not idempotent:
# re-running it applies the offset again.
# NOTE(review): the offset and the asymmetric bounds are unexplained magic
# numbers (the `m2` cell uses .0177) -- confirm the intended values.
ensemble_preds = np.clip(ensemble_preds+.0176, 1e-7, 1-1e-6)

In [8]:
# stacked = np.vstack(all_preds)
# Geometric mean down the stacked rows: one blended value per position.
m = gmean(np.vstack(all_preds), axis=0)
# mstd = np.std(stacked, axis=0)

# Work on a copy so the raw blend `m` is left untouched.
m2 = m.copy()
# Shift every value up by a constant offset.
m2+=.0177
# The next two adjustments cascade because each mask is evaluated on the
# current contents of m2: values above .91 first lose .010 and, still being
# above .81 afterwards, lose a further .020 (.030 in total); values in
# (.81, .91] lose only .020.
# NOTE(review): the thresholds and offsets are unexplained magic numbers --
# presumably hand-tuned; confirm.
m2[m2>.91]-=.010
m2[m2>.81]-=.020
# Clamp to [1e-7, 1-1e-6].
m2 = np.clip(m2, 1e-7, 1-1e-6)
# m2[np.argsort(mstd)[-200000:]] -=.10

In [9]:
# Write the adjusted blend to disk, one value per line, in the order of `m2`.
with open('/media/ramdisk/fini_last92_almost4_rem.txt', 'w') as fo:
    fo.writelines(str(value) + '\n' for value in m2)