In [1]:
import io
import os
import pickle
from os.path import split as split_path

import numpy as np
import pandas as pd

# Input directory: the pickles that the chunking step reads.
base_dir = "/Users/igpp-jalafate/workbox/bathymetry-analysis/train-logs/runtime_data"

# Output directory: chunk files are written here and later sorted into
# <region>/<train|valid|test>/ sub-folders.
write_dir = "/cryosat3/jalafate/bathymetry/data/chulks"

# Create chunks

In [16]:
# regions = ['AGSO', 'JAMSTEC', 'NGA', 'NGDC', 'NOAA_geodas', 'SIO', 'US_multi']
# Split each input pickle in base_dir into numbered part files in write_dir.
# Every input is expected to unpickle into a (features, labels, weights) triple;
# each part file holds the same triple restricted to one contiguous row range.
for filename in os.listdir(base_dir):
    # Build the path outside the try so the failure message can always use it.
    path = os.path.join(base_dir, filename)
    try:
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with open(path, 'rb') as f:
            features, labels, weights = pickle.load(f)
    except Exception:
        # Entries that cannot be opened or unpickled into a triple are reported
        # and skipped. Catching Exception instead of using a bare except keeps
        # KeyboardInterrupt / SystemExit able to stop the loop.
        print("failed,", path)
        continue

    basename = filename.rsplit(".", 2)[0]  # e.g., valid_US_multi_ew9904_ed

    num_line = len(labels)

    # The last feature column is read as an integer code (NaN counts as 0) and
    # tallied per code value.
    data_type = np.nan_to_num(np.array(features)[:, -1]).astype(int)
    bin_counts = np.bincount(data_type, minlength=5)

    # Files containing any rows with code 0 or 1 are cut into much larger chunks.
    chulk_size = 5000
    if bin_counts[0] + bin_counts[1] > 0:
        chulk_size = 100000

    cursor = 0
    part = 0
    while cursor < num_line:
        # Cut a full-size chunk only while at least two chunks' worth of rows
        # remain; otherwise the final part takes everything that is left. This
        # avoids a tiny trailing part and guarantees that no rows are dropped
        # (the final part holds fewer than 2 * chulk_size rows).
        if num_line - cursor >= 2 * chulk_size:
            end = cursor + chulk_size
        else:
            end = num_line
        part_filename = basename + ".part{}.pkl".format(part)
        write_filepath = os.path.join(write_dir, part_filename)
        with open(write_filepath, "wb") as f:
            pickle.dump((features[cursor:end], labels[cursor:end], weights[cursor:end]), f)
        cursor = end
        part += 1

    # NOTE(review): this stops after the first file that loads successfully, so
    # only one input file is chunked per run. It looks like a debugging
    # leftover -- confirm before removing.
    break

failed, /Users/igpp-jalafate/workbox/bathymetry-analysis/train-logs/runtime_data/inventory_valid.txt
failed, /Users/igpp-jalafate/workbox/bathymetry-analysis/train-logs/runtime_data/with_part.sh
failed, /Users/igpp-jalafate/workbox/bathymetry-analysis/train-logs/runtime_data/inventory_train.txt
failed, /Users/igpp-jalafate/workbox/bathymetry-analysis/train-logs/runtime_data/inventory_test.txt


# Train/valid/test split

In [30]:
from random import random

In [31]:
# regions = ['AGSO', 'JAMSTEC', 'NGA', 'NGDC', 'NOAA_geodas', 'SIO', 'US_multi']
# Move every chunk file sitting directly in write_dir into
# write_dir/<region>/<train|valid|test>/. The split is drawn independently per
# file: roughly 70% train, 15% valid, 15% test.
# NOTE(review): the draw is unseeded, so the assignment differs between runs.
for filename in os.listdir(write_dir):
    if not filename.endswith(".pkl"):
        continue
    path = os.path.join(write_dir, filename)

    basename = filename.rsplit(".", 2)[0]  # e.g., valid_US_multi_ew9904_ed
    basename_comp = basename.split('_')
    if len(basename_comp) < 2:
        # No region component in the name; report and skip rather than abort
        # the loop with some files already moved.
        print("failed,", path)
        continue

    # The region is the second '_'-separated token; two region names contain an
    # underscore themselves and are restored here.
    region = basename_comp[1]
    if region == "US":
        region = "US_multi"
    elif region == "NOAA":
        region = "NOAA_geodas"

    dtype = "train"
    rand = random()
    if 0.7 <= rand < 0.85:
        dtype = "valid"
    elif rand >= 0.85:
        dtype = "test"

    target_dir = os.path.join(write_dir, region, dtype)
    # os.rename does not create missing directories, so make sure the
    # destination exists before moving the file.
    os.makedirs(target_dir, exist_ok=True)
    new_path = os.path.join(target_dir, filename)
    os.rename(path, new_path)

# Count lines

In [32]:
# Parallel lists, one entry per chunk file that loads successfully.
dtypes = []
regions = []
num_lines = []
num_corrupts = []


# regions = ['AGSO', 'JAMSTEC', 'NGA', 'NGDC', 'NOAA_geodas', 'SIO', 'US_multi']
# Walk write_dir/<region>/<train|valid|test>/ and record, for each chunk file,
# its split, its region, its row count and the row count minus the label sum.
for region in os.listdir(write_dir):
    dir_name = os.path.join(write_dir, region)
    if not os.path.isdir(dir_name):
        continue

    for dtype in ["train", "valid", "test"]:
        subdir_name = os.path.join(dir_name, dtype)
        if not os.path.isdir(subdir_name):
            # A region may have no folder for one of the splits; skip it
            # instead of letting os.listdir abort the whole count.
            continue
        for filename in os.listdir(subdir_name):
            # Build the path outside the try so the failure message can use it.
            path = os.path.join(subdir_name, filename)
            try:
                # NOTE: pickle.load can execute arbitrary code; trusted files only.
                with open(path, 'rb') as f:
                    features, labels, weights = pickle.load(f)
            except Exception:
                # Report and skip unreadable files; unlike a bare except this
                # does not swallow KeyboardInterrupt / SystemExit.
                print("failed,", path)
                continue

            num_line = len(labels)
            # Presumably labels are 1 for good rows and 0 for corrupt ones, so
            # this is the number of corrupt rows -- TODO confirm.
            num_corrupt = num_line - np.sum(labels)

            dtypes.append(dtype)
            regions.append(region)
            num_lines.append(num_line)
            num_corrupts.append(num_corrupt)

In [33]:
# Collect the per-file counts into one table (one row per chunk file) and
# save it next to the notebook.
df = pd.DataFrame(
    data={
        "dtype": dtypes,
        "region": regions,
        "num_line": num_lines,
        "num_corrupt": num_corrupts,
    }
)
df.to_pickle("count-lines-chulks.pickle")