In [16]:
import os
import numpy as np
import pickle as pkl
from random import random

# Per-class down-sampling caps used by get_processed_data: each class is kept
# with probability CAP / (class count), so a class larger than its cap is
# thinned to roughly CAP samples.
POS_CAP = 1e5  # positives
NEG_CAP = 1e6  # negatives

def get_processed_data(filename, with_all=False, pos_cap=None, neg_cap=None):
    """Load a pickled score file, normalise it and down-sample it.

    Two on-disk layouts are supported, selected by the path:

    * If ``"label-leak-one-source"`` is not in ``filename``, the pickle holds a
      pair whose second item is a 2-D array with columns
      ``(label, score, weight)``.
    * Otherwise ``filename`` holds only the scores; the labels are read from a
      sibling file ``labels_<region>`` in the same directory and all weights
      start at 1. Paths containing ``"labels"`` are label files themselves and
      are rejected.

    Processing steps:

    1. Scores whose minimum is within 1e-3 of zero are mapped with ``2*s - 1``;
       all scores are then divided by their largest absolute value.
    2. Labels whose minimum is within 1e-8 of zero are mapped with ``2*l - 1``.
    3. Every sample with a non-zero label is kept with probability
       ``rate_pos`` (label > 0) or ``rate_neg`` (label < 0). Kept samples have
       their weight divided by the rate whenever that rate is below 1.

    Args:
        filename: Path of the pickle file to read.
        with_all: If true, both classes are sampled at a fixed rate of 0.01
            instead of being capped.
        pos_cap: Cap for the positive class; defaults to the module-level
            ``POS_CAP``.
        neg_cap: Cap for the negative class; defaults to the module-level
            ``NEG_CAP``.

    Returns:
        ``None`` when ``filename`` is a label file of the one-source layout.
        Otherwise ``((n_pos, n_neg), table)`` where the counts describe the
        data *before* sampling (``n_neg`` is everything that is not positive)
        and ``table`` is an ``(n_kept, 3)`` float array of
        ``(label, score, weight)`` rows sorted lexicographically.
    """
    if pos_cap is None:
        pos_cap = POS_CAP
    if neg_cap is None:
        neg_cap = NEG_CAP

    # NOTE(security): pickle.load can execute arbitrary code; only call this
    # on files produced by a trusted pipeline.
    if "label-leak-one-source" not in filename:
        with open(filename, 'rb') as f:
            _, df = pkl.load(f)
        df = np.asarray(df)
        labels, scores, weights = df[:, 0], df[:, 1], df[:, 2]
    else:
        if "labels" in filename:
            print("None")
            return None
        region = filename.rsplit('_', 2)[1]
        # Short region tags in the score file names map to longer label names.
        region = {"geodas": "NOAA_geodas", "multi": "US_multi"}.get(region, region)
        labels_fn = os.path.join(os.path.dirname(filename),
                                 "labels_{}".format(region))
        with open(labels_fn, 'rb') as f:
            labels = pkl.load(f)
        with open(filename, 'rb') as f:
            scores = pkl.load(f)
        weights = np.ones(len(labels))

    # Coerce to arrays so list-valued pickles work, and take a private float
    # copy of the weights because they are rescaled in place below.
    labels = np.asarray(labels)
    scores = np.asarray(scores, dtype=float)
    weights = np.array(weights, dtype=float)

    # Map scores to [-1, 1]
    if abs(np.min(scores)) <= 1e-3:
        scores = scores * 2 - 1
    # Map labels to -1 or 1
    if abs(np.min(labels)) <= 1e-8:
        labels = labels * 2 - 1

    # The norm is never zero: it is at least 1 after the remap above and
    # larger than 1e-3 otherwise.
    norm = max(abs(np.max(scores)), abs(np.min(scores)))
    scores = scores / norm

    is_pos = labels > 0
    is_neg = labels < 0
    n_pos = int(np.count_nonzero(is_pos))
    n_neg = int(labels.shape[0] - n_pos)
    if with_all:
        rate_pos = rate_neg = 0.01
    else:
        # Guard both denominators so a single-class file does not divide by 0.
        rate_pos = pos_cap / max(1.0, n_pos)
        rate_neg = neg_cap / max(1.0, n_neg)

    # Down sample: one random() draw per non-zero label, in index order, so a
    # seeded `random` module gives reproducible results. Zero labels never
    # draw and are never kept (inf compares greater than any rate).
    eligible = is_pos | is_neg
    draws = np.full(labels.shape[0], np.inf)
    draws[eligible] = [random() for _ in range(int(np.count_nonzero(eligible)))]
    keep = eligible & (draws <= np.where(is_pos, rate_pos, rate_neg))

    # Re-weight the survivors so the sampled set still represents the whole.
    if rate_neg < 1.0:
        weights[keep & is_neg] /= rate_neg
    if rate_pos < 1.0:
        weights[keep & is_pos] /= rate_pos

    labels, scores, weights = labels[keep], scores[keep], weights[keep]
    # lexsort treats the LAST key as primary: sort by label, score, weight.
    order = np.lexsort((weights, scores, labels))
    table = np.column_stack((labels, scores, weights))[order]
    return ((n_pos, n_neg), table)


def write_data(data, filename):
    """Pickle ``data`` to the output location derived from ``filename``.

    The output path is ``filename`` with every ``"yf-talk-"`` replaced by
    ``"new-"``. Missing parent directories are created (including nested
    ones); a bare file name is written to the current directory.

    Args:
        data: Any picklable object.
        filename: Path of the *input* file the data was derived from.
    """
    target = filename.replace("yf-talk-", "new-")
    out_dir = os.path.dirname(target)
    # An empty dirname means "current directory": nothing to create.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(target, 'wb') as f:
        pkl.dump(data, f)

In [9]:
# Show the entries under "yf-talk-scores"; the processing loop further down
# treats each one as a category directory of score files.
os.listdir("yf-talk-scores")

['label-leak-all',
 'label-leak-one-source',
 'one-source',
 'unweighted-one-source',
 'with-all']

In [18]:
from os import listdir

base = "yf-talk-scores"
# Only "label-leak-all" is processed in this run; iterate over listdir(base)
# instead to regenerate every category.
for cat in ["label-leak-all"]:
    print("Now process {}".format(cat))
    for filename in listdir("/".join((base, cat))):
        print(filename)
        path = "/".join((base, cat, filename))
        # Categories whose name contains "all" use the with_all sampling mode.
        data = get_processed_data(path, with_all="all" in cat)
        if data is None:
            print("{} is not valid".format(path))
            continue
        write_data(data, path)
    print("{} is done.".format(cat))

Now process label-leak-all
testing_result_AGSO
testing_result_JAMSTEC
testing_result_NGA
testing_result_NGDC
testing_result_NOAA_geodas
testing_result_SIO
testing_result_US_multi
label-leak-all is done.
