# 1. Validating and formatting

These are the scripts that validate and format the raw data files for the bathymetry project.

## Meaning of columns


```
LAT LONG DEPTH sigh sigd SID  pred id   d10   d20   d60    age   VGG   rate  sed    rough
C1  C2   C3    C4   C5   C6   C7   C8   C9    C10   C11    C12   C13   C14   C15    C16
 
G:T     kind   year
C17     C18    C19

NDP_2.5   NDP_5   NDP_10   NDP_30   STD_2.5m   STD_5m   STD_10m   STD_30m   MED_2.5m   MED_5m   MED_10m
C20       C21     C22      C23      C24        C25      C26       C27       C28        C29      C30

MED_30m
C31

(DEPTH-MED_2.5m)/STD_2.5m    (DEPTH-MED_5m)/STD_5m       (DEPTH-MED_10m)/STD_10m      (DEPTH-MED_30m)/STD_30m
C32                          C33                         C34                          C35


VGG = Vertical gravity gradient (= curvature of the sea surface) 
rate = seafloor spreading rate
rough = seafloor roughness
G:T = Gravity:Topography ratio 
NDP = Number of data points
MED = Median
STD = Standard deviation
```

# Parse

In [2]:
import os

# Base directory of the data set; every other path in this notebook's
# validation/formatting section is built relative to it.
DATA_BASE_DIR = "/geosat2/julaiti/tsv_all"
# Descriptor files for the training, testing and validation splits.
# They live directly under DATA_BASE_DIR.
TRAINING_FILES_DESC = os.path.join(DATA_BASE_DIR, "training_files_desc.txt")
TESTING_FILES_DESC = os.path.join(DATA_BASE_DIR, "testing_files_desc.txt")
VALIDATION_FILES_DESC = os.path.join(DATA_BASE_DIR, "validation_files_desc.txt")

In [6]:
# Resolve every region named in valid_regions.txt (one name per line) to its
# directory under DATA_BASE_DIR. Blank lines are skipped: joining an empty name
# would yield DATA_BASE_DIR itself, and the base directory would then be
# scanned as if it were a region.
with open(os.path.join(DATA_BASE_DIR, "valid_regions.txt")) as f:
    input_dir = [
        os.path.join(DATA_BASE_DIR, region.strip())
        for region in f
        if region.strip()
    ]
len(input_dir)

15

In [7]:
# Whitespace-separated column names, in the order they appear in each record
# of the raw data files. The first ratio column previously lacked its opening
# parenthesis, unlike the three ratio columns that follow it.
columns = """
LAT LONG DEPTH sigh sigd SID  pred id   d10   d20   d60    age   VGG   rate  sed    rough
G:T     kind   year
NDP_2.5   NDP_5   NDP_10   NDP_30   STD_2.5m   STD_5m   STD_10m   STD_30m   MED_2.5m   MED_5m   MED_10m
MED_30m
(DEPTH-MED_2.5m)/STD_2.5m    (DEPTH-MED_5m)/STD_5m       (DEPTH-MED_10m)/STD_10m      (DEPTH-MED_30m)/STD_30m
"""

# Turn the text into a list of (0-based index, name) pairs. str.split() with
# no argument already discards all surrounding whitespace, so no extra
# stripping is needed.
columns = list(enumerate(columns.split()))
for index, name in columns:
    print("{}\t{}".format(index, name))

0	LAT
1	LONG
2	DEPTH
3	sigh
4	sigd
5	SID
6	pred
7	id
8	d10
9	d20
10	d60
11	age
12	VGG
13	rate
14	sed
15	rough
16	G:T
17	kind
18	year
19	NDP_2.5
20	NDP_5
21	NDP_10
22	NDP_30
23	STD_2.5m
24	STD_5m
25	STD_10m
26	STD_30m
27	MED_2.5m
28	MED_5m
29	MED_10m
30	MED_30m
31	DEPTH-MED_2.5m)/STD_2.5m
32	(DEPTH-MED_5m)/STD_5m
33	(DEPTH-MED_10m)/STD_10m
34	(DEPTH-MED_30m)/STD_30m


## Count the number of cruises from each region

In [56]:
from random import shuffle
import io


def is_final_tsv_file(filepath):
    """Tell whether `filepath` names a file with the ".tsv_all" suffix.

    Only the end of the path is inspected, so a name such as
    "x.tsv_all_final" is rejected.
    """
    expected_suffix = ".tsv_all"
    has_suffix = filepath.endswith(expected_suffix)
    return has_suffix


def is_enough_cols(filepath):
    """Check the field count of the first line of `filepath`.

    Returns True when the first line, split on whitespace, has exactly as
    many fields as there are entries in the module-level `columns` list.
    Only the first line is read.
    """
    with io.open(filepath, 'r', newline='\n') as handle:
        first_line = handle.readline()
        field_count = len(first_line.split())
        return field_count == len(columns)

import random

# Share of each region's files held out for testing and for validation;
# whatever remains is used for training.
TEST_FRACTION = 0.15
VALIDATION_FRACTION = 0.15
# Regions with at most this many usable files contribute to testing only.
TEST_ONLY_MAX_FILES = 50
# Fixed seed so that re-running this cell reproduces the same split.
SPLIT_SEED = 0

split_rng = random.Random(SPLIT_SEED)

training_files = []
testing_files = []
validation_files = []
for root in input_dir:
    # os.listdir returns entries in arbitrary order; sort them so the seeded
    # shuffle below always starts from the same sequence.
    candidates = [os.path.join(root, name) for name in sorted(os.listdir(root))]
    # Keep files with the expected suffix whose first line has the expected
    # number of columns (the column check only runs on suffix matches).
    filepath = [path for path in candidates
                if is_final_tsv_file(path) and is_enough_cols(path)]
    split_rng.shuffle(filepath)

    if len(filepath) <= TEST_ONLY_MAX_FILES:
        print("There are {} records under '{}'. They will *only* be used for testing.".format(len(filepath), root))
        testing_files += filepath
    else:
        # The shuffled list is cut into [testing | validation | training].
        thr1 = int(len(filepath) * TEST_FRACTION)
        thr2 = thr1 + int(len(filepath) * VALIDATION_FRACTION)
        testing_files += filepath[:thr1]
        validation_files += filepath[thr1:thr2]
        training_files += filepath[thr2:]
        print("There are {} records under '{}'.".format(len(filepath), root))


# Persist each split as a text file with one file path per line.
with open(TRAINING_FILES_DESC, 'w') as f:
    f.write('\n'.join(training_files))
with open(TESTING_FILES_DESC, 'w') as f:
    f.write('\n'.join(testing_files))
with open(VALIDATION_FILES_DESC, 'w') as f:
    f.write('\n'.join(validation_files))

len(training_files), len(validation_files), len(testing_files)

There are 408 records under '/geosat2/julaiti/tsv_all/3DGBR'.
There are 103 records under '/geosat2/julaiti/tsv_all/AGSO'.
There are 16 records under '/geosat2/julaiti/tsv_all/DNC'. They will *only* be used for testing.
There are 3 records under '/geosat2/julaiti/tsv_all/IBCAO'. They will *only* be used for testing.
There are 94 records under '/geosat2/julaiti/tsv_all/IFREMER'.
There are 546 records under '/geosat2/julaiti/tsv_all/JAMSTEC'.
There are 0 records under '/geosat2/julaiti/tsv_all/JAMSTEC2'. They will *only* be used for testing.
There are 1375 records under '/geosat2/julaiti/tsv_all/NGA'.
There are 24 records under '/geosat2/julaiti/tsv_all/NGA2'. They will *only* be used for testing.
There are 14 records under '/geosat2/julaiti/tsv_all/NOAA'. They will *only* be used for testing.
There are 250 records under '/geosat2/julaiti/tsv_all/SIO'.
There are 618 records under '/geosat2/julaiti/tsv_all/US_multi'.
There are 4655 records under '/geosat2/julaiti/tsv_all/NOAA_geodas'.
The

(6374, 1360, 1467)

## Parse correct files

In [57]:
# Load the training and testing descriptors; every entry keeps its trailing
# newline, so entries are stripped before being used as paths.
with open(TRAINING_FILES_DESC) as desc:
    training_files = list(desc)
with open(TESTING_FILES_DESC) as desc:
    testing_files = list(desc)

# Print the first record of the first training file, one field per line,
# next to the index and name of the corresponding column.
filename = training_files[0].strip()
with io.open(filename, 'r', newline='\n') as f:
    first_record = f.readline().split()
    for (index, name), value in zip(columns, first_record):
        print("{0:02d} {1:35s}\t{2:s}".format(index, name, value))

00 LAT                                	151.46563
01 LONG                               	-9.89855
02 DEPTH                              	-876
03 sigh                               	0
04 sigd                               	-1
05 SID                                	54710
06 pred                               	-943
07 id                                 	1
08 d10                                	1
09 d20                                	1
10 d60                                	0.991437762579
11 age                                	39.3703424623
12 VGG                                	56.1061734039
13 rate                               	5154.39169575
14 sed                                	63.5567064532
15 rough                              	67.7707684862
16 G:T                                	0.744597200561
17 kind                               	M
18 year                               	2000
19 NDP_2.5                            	223.13974293
20 NDP_5                              	1129.34403636
2

## Example code for parsing the features and labels

In [4]:
import io

import numpy as np

# Set KIND_INDEX based on what is printed above
KIND_INDEX = 17

# Numeric code for each value of the "kind" column, so that the whole record
# can be converted to floats.
data_type = {
    "M": 1,  # - multibeam
    "G": 2,  # - grid
    "S": 3,  # - single beam
    "P": 4,  # - point measurement
}

with open(TRAINING_FILES_DESC) as f:
    training_files = f.readlines()
with open(TESTING_FILES_DESC) as f:
    testing_files = f.readlines()

# Column indices left out of the feature vector (sigh, sigd, SID and id in
# the column listing above).
removed_features = [3, 4, 5, 7]


def get_label(cols):
    """Return True when column 4 (sigd) of the record equals the string '9999'.

    NOTE(review): '9999' looks like a marker value for flagged records --
    confirm its meaning with the data provider.
    """
    return cols[4] == '9999'


# One entry per parsed file: a 2-D feature array and the matching label array.
# The labels used to be computed and then thrown away; they are now kept in
# `training_labels`, aligned index-by-index with `training_features`.
training_features = []
training_labels = []
for filename in training_files:
    filename = filename.strip()
    if not filename:
        continue
    features = []
    labels = []
    with io.open(filename, 'r', newline='\n') as fread:
        for line in fread:
            cols = line.strip().split()
            if not cols:
                continue
            # Replace the kind letter with its numeric code before the
            # float conversion below.
            cols[KIND_INDEX] = data_type[cols[KIND_INDEX]]
            labels.append(get_label(cols))
            features.append(np.array(
                [float(value) for i, value in enumerate(cols)
                 if i not in removed_features]
            ))
    training_features.append(np.array(features))
    training_labels.append(np.array(labels))
    # Example only: stop once 11 files have been loaded.
    if len(training_features) > 10:
        break

In [8]:
# Load the training and testing descriptors (entries keep their newline).
with open(TRAINING_FILES_DESC) as desc:
    training_files = list(desc)
with open(TESTING_FILES_DESC) as desc:
    testing_files = list(desc)

# Print the first record of the first training file again, this time leaving
# out the columns listed in `removed_features` and renumbering the remaining
# ones from zero.
filename = training_files[0].strip()
with io.open(filename, 'r', newline='\n') as f:
    first_record = f.readline().split()
    real_index = 0
    for (original_index, name), value in zip(columns, first_record):
        if original_index not in removed_features:
            print("{0:02d} {1:35s}\t{2:s}".format(real_index, name, value))
            real_index += 1

00 LAT                                	151.46563
01 LONG                               	-9.89855
02 DEPTH                              	-876
03 pred                               	-943
04 d10                                	1
05 d20                                	1
06 d60                                	0.991437762579
07 age                                	39.3703424623
08 VGG                                	56.1061734039
09 rate                               	5154.39169575
10 sed                                	63.5567064532
11 rough                              	67.7707684862
12 G:T                                	0.744597200561
13 kind                               	M
14 year                               	2000
15 NDP_2.5                            	223.13974293
16 NDP_5                              	1129.34403636
17 NDP_10                             	5682.76607046
18 NDP_30                             	29711.2701283
19 STD_2.5m                           	129.895390742
20 STD_5m  

# 2. Convert data to the LIBSVM format (no longer needed)

The rest of the code in this notebook is no longer required for pre-processing the bathymetry datasets.

# Collect

In [49]:
import os

# Walk the current directory tree and gather the path of every file whose
# name ends with "libsvm".
all_files = []
for root, subdirs, files in os.walk("./"):
    for filename in files:
        if filename.endswith("libsvm"):
            all_files.append(os.path.join(root, filename))

len(all_files)

9491

In [52]:
# Move each collected file from <dir>/<name> to <dir>_libsvm/<name>,
# creating the target directory the first time it is needed.
for source_path in all_files:
    pieces = source_path.rsplit("/", 1)
    target_dir = pieces[0] + "_libsvm/"
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    os.rename(source_path, target_dir + pieces[1])

# Merge

In [69]:
import os
import shlex

# Name of the merged file produced in each directory.
MERGED_NAME = "data.libsvm"

# Write merge-files.sh: one `cat` command per directory that concatenates all
# of its libsvm files into <directory>/data.libsvm.
with open("merge-files.sh", 'w') as f:
    for root, subdirs, files in os.walk("./"):
        # Leave out an already-merged data.libsvm: listing it would make it
        # both an input and the output of the same `cat`. Sorting keeps the
        # generated script stable between runs.
        parts = [os.path.join(root, filename)
                 for filename in sorted(files)
                 if filename.endswith("libsvm") and filename != MERGED_NAME]
        if parts:
            # Quote every path so names containing spaces or shell
            # metacharacters survive; plain names are written unchanged.
            command = "cat %s > %s" % (
                ' '.join(shlex.quote(path) for path in parts),
                shlex.quote(os.path.join(root, MERGED_NAME)),
            )
            f.write(command + '\n')

In [103]:
# Remove the individual files: delete every file whose name ends with
# "libsvm", except the ones named data.libsvm.

for root, subdirs, files in os.walk("./"):
    individual = [name for name in files
                  if name.endswith("libsvm") and name != "data.libsvm"]
    for name in individual:
        os.remove(os.path.join(root, name))

# Shuffle

In [1]:
import os

# Walk the current directory tree and gather the path of every file named
# exactly "data.libsvm".
all_files = []
for root, subdirs, files in os.walk("./"):
    for filename in files:
        if filename == "data.libsvm":
            all_files.append(os.path.join(root, filename))

len(all_files)

17

In [2]:
from random import randint
from random import shuffle

def shuffle_limited_memory(filename, ntest, nparts):
    """Shuffle the lines of `filename` into a training and a testing file.

    The input is never held in memory as a whole. Each line is first written
    to one of `nparts` temporary part files chosen uniformly at random; the
    parts are then visited in random order, shuffled in memory one at a time,
    and appended to "testing.libsvm" (the first `ntest` parts) or
    "training.libsvm" (the remaining parts). Both outputs are created in the
    directory of `filename`, and the part files are deleted as they are
    consumed.

    Args:
        filename: path of the text file to shuffle.
        ntest: number of parts whose lines go to the testing file; on average
            a fraction ntest / nparts of the lines end up there.
        nparts: number of temporary part files; must be greater than `ntest`.

    Raises:
        AssertionError: if `ntest` is not smaller than `nparts`.
    """
    assert ntest < nparts
    part_names = [filename + "_part%d" % i for i in range(nparts)]

    handlers = []
    try:
        for name in part_names:
            handlers.append(open(name, 'w'))
        with open(filename) as source:
            for line in source:
                # Terminate the last line if the file lacks a final newline;
                # otherwise shuffling its part could glue it to another line.
                if not line.endswith('\n'):
                    line += '\n'
                handlers[randint(0, nparts - 1)].write(line)
    finally:
        # Close every part file that was opened, even if an error occurred.
        for handler in handlers:
            handler.close()

    base = os.path.dirname(filename)
    shuffle(part_names)
    with open(os.path.join(base, "training.libsvm"), 'w') as training, \
            open(os.path.join(base, "testing.libsvm"), 'w') as testing:
        for i, name in enumerate(part_names):
            with open(name) as part:
                lines = part.readlines()
            os.remove(name)
            shuffle(lines)
            # Every line already ends with a newline, so an empty part adds
            # nothing to the output (no stray blank line).
            target = testing if i < ntest else training
            target.writelines(lines)

In [3]:
# Split every merged file into training/testing output, sending 10 of the
# 100 random parts to the testing file.
for filename in all_files:
    print(filename)
    shuffle_limited_memory(filename, ntest=10, nparts=100)

./NGA_libsvm/data.libsvm
./lakes_libsvm/data.libsvm
./GEOMAR_libsvm/data.libsvm
./NGA2_libsvm/data.libsvm
./JAMSTEC_libsvm/data.libsvm
./GEBCO_libsvm/data.libsvm
./NOAA_libsvm/data.libsvm
./CCOM_libsvm/data.libsvm
./US_multi_libsvm/data.libsvm
./SIO_libsvm/data.libsvm
./3DGBR_libsvm/data.libsvm
./NAVO_libsvm/data.libsvm
./IFREMER_libsvm/data.libsvm
./AGSO_libsvm/data.libsvm
./NOAA_geodas_libsvm/data.libsvm
./NGDC_libsvm/data.libsvm
./IBCAO_libsvm/data.libsvm


# Move files

In [5]:
# Move each file from <dir>/<name> to <dir>_data/<name>. os.mkdir raises if
# the target directory already exists.
for source_path in all_files:
    parent_dir, base_name = source_path.rsplit("/", 1)
    target_dir = parent_dir + "_data"
    os.mkdir(target_dir)
    os.rename(source_path, os.path.join(target_dir, base_name))

# Upload to S3

In [9]:
import os

# Write upload-s3.sh: one recursive `aws s3 cp` command for every directory
# under the current one whose path ends with "_libsvm".
with open("upload-s3.sh", 'w') as f:
    libsvm_dirs = (root for root, subdirs, files in os.walk("./")
                   if root.endswith("_libsvm"))
    for root in libsvm_dirs:
        # Drop the leading "./" to get the key prefix used in the bucket.
        dirname = root[2:]
        f.write("aws s3 cp {} s3://tmsn-data/bathymetry/{}/ --recursive\n".format(root, dirname))