# Parse

In [1]:
import os

# Root directory of the TSV data and the two descriptor files that list the
# paths selected for training and for testing.
DATA_BASE_DIR = "/geosat2/julaiti/tsv_all"
TRAINING_FILES_DESC = os.path.join(DATA_BASE_DIR, "training_files_desc.txt")
TESTING_FILES_DESC = os.path.join(DATA_BASE_DIR, "testing_files_desc.txt")

# Each non-empty line of valid_regions.txt is joined to DATA_BASE_DIR to form a
# region directory. Blank lines are skipped: joining an empty name would yield
# DATA_BASE_DIR itself and the whole base directory would be scanned as a region.
with open(os.path.join(DATA_BASE_DIR, "valid_regions.txt")) as f:
    input_dir = [
        os.path.join(DATA_BASE_DIR, line.strip())
        for line in f
        if line.strip()
    ]
len(input_dir)

15

In [7]:
# Numbered column names of a record, one per line, in file order.
columns = """
1. longitude 
2. latitude 
3. depth 
4. sigh 
5. sigd 
6. SID 
7. predicted_depth 
8. ID 
9. d10 
10. d20 
11. d60 
12. seafloor_age 
13. curvature(VGG)
14. spreading_rate 
15. sediment_thickness 
16. seafloor_roughness 
17. NDP_@2.5am 
18. NDP_@5am 
19. NDP_@10am 
20. NDP_@30am 
21. STD_@2.5am 
22. STD_@5am 
23. STD_@10am 
24. STD_@30am 
25. depth_SUB_median@2.5am 
26. depth_SUB_median@5am 
27. depth_SUB_median@10am 
28. depth_SUB_median@30am 
29. year 
30. data_type
"""

# Turn the text block into a list with one whitespace-trimmed entry per line.
columns = [entry.strip() for entry in columns.strip().split('\n')]
len(columns), columns

(30,
 ['1. longitude',
  '2. latitude',
  '3. depth',
  '4. sigh',
  '5. sigd',
  '6. SID',
  '7. predicted_depth',
  '8. ID',
  '9. d10',
  '10. d20',
  '11. d60',
  '12. seafloor_age',
  '13. curvature(VGG)',
  '14. spreading_rate',
  '15. sediment_thickness',
  '16. seafloor_roughness',
  '17. NDP_@2.5am',
  '18. NDP_@5am',
  '19. NDP_@10am',
  '20. NDP_@30am',
  '21. STD_@2.5am',
  '22. STD_@5am',
  '23. STD_@10am',
  '24. STD_@30am',
  '25. depth_SUB_median@2.5am',
  '26. depth_SUB_median@5am',
  '27. depth_SUB_median@10am',
  '28. depth_SUB_median@30am',
  '29. year',
  '30. data_type'])

In [62]:
from random import shuffle

def is_final_tsv_file(filepath):
    """Return True when `filepath` ends with the suffix expected for it.

    Paths that contain "SIO" must end with ".tsv_all_final"; every other path
    must end with ".tsv_all".
    """
    expected_suffix = ".tsv_all_final" if "SIO" in filepath else ".tsv_all"
    return filepath.endswith(expected_suffix)


from random import Random

# Fixed seed so that re-running this cell reproduces the same train/test split.
SPLIT_SEED = 0
split_rng = Random(SPLIT_SEED)

training_files = []
testing_files = []
for root in input_dir:
    # os.listdir returns names in arbitrary order, so sort them before the
    # seeded shuffle; otherwise the split could still change between runs.
    all_files = [os.path.join(root, name) for name in sorted(os.listdir(root))]
    filepath = [path for path in all_files if is_final_tsv_file(path)]
    split_rng.shuffle(filepath)
    if len(filepath) <= 50:
        # Regions with at most 50 matching files contribute to testing only.
        print("There are {} records under '{}'. They will *only* be used for testing.".format(len(filepath), root))
        testing_files += filepath
    else:
        # The first 75% of the shuffled files go to training, the rest to testing.
        thr = int(len(filepath) * 0.75)
        training_files += filepath[:thr]
        testing_files += filepath[thr:]

# Persist both lists, one path per line.
with open(TRAINING_FILES_DESC, 'w') as f:
    f.write('\n'.join(training_files))
with open(TESTING_FILES_DESC, 'w') as f:
    f.write('\n'.join(testing_files))
len(training_files), len(testing_files)

There are 16 records under '/geosat2/julaiti/tsv_all/DNC'. They will *only* be used for testing.
There are 3 records under '/geosat2/julaiti/tsv_all/IBCAO'. They will *only* be used for testing.
There are 24 records under '/geosat2/julaiti/tsv_all/NGA2'. They will *only* be used for testing.
There are 14 records under '/geosat2/julaiti/tsv_all/NOAA'. They will *only* be used for testing.
There are 50 records under '/geosat2/julaiti/tsv_all/CCOM'. They will *only* be used for testing.


(6942, 2426)

In [9]:
# Group every listed file by region and by the number of whitespace-separated
# columns on its first line: name_cols[region][num_cols] -> list of file paths.
name_cols = {}
for filename in training_files + testing_files:
    with open(filename) as f:
        first_line = f.readline()
    num_cols = len(first_line.split())
    # The region is the name of the directory that directly contains the file.
    region = filename.rsplit('/', 2)[1]
    name_cols.setdefault(region, {}).setdefault(num_cols, []).append(filename)

In [10]:
# Print each region name and collect every file whose first line does not
# have exactly 30 columns.
rerun = []
for region, files_by_col_count in name_cols.items():
    print(region)
    for col_count, paths in files_by_col_count.items():
        if col_count == 30:
            continue
        rerun.extend(paths)

3DGBR
AGSO
IFREMER
JAMSTEC
JAMSTEC2
NGA
SIO
US_multi
NOAA_geodas
NGDC
DNC
IBCAO
NGA2
NOAA
CCOM


## Remove the files that do not have exactly 30 columns

In [51]:
# Save the paths collected in `rerun`, one per line.
wrong_column_listing = '\n'.join(rerun)
with open("wrong-col-nums.txt", 'w') as f:
    f.write(wrong_column_listing)

In [81]:
# Drop every path listed in `rerun` from both splits, then rewrite the
# descriptor files. Paths are compared after stripping surrounding whitespace.
rerun = [filename.strip() for filename in rerun]
# A set gives constant-time membership tests; scanning the `rerun` list once
# per file would be quadratic in the number of files.
rerun_lookup = set(rerun)
training_files = [path for path in map(str.strip, training_files) if path not in rerun_lookup]
testing_files = [path for path in map(str.strip, testing_files) if path not in rerun_lookup]

with open(TRAINING_FILES_DESC, 'w') as f:
    f.write('\n'.join(training_files))
with open(TESTING_FILES_DESC, 'w') as f:
    f.write('\n'.join(testing_files))

## Parse correct files

In [22]:
# Show the fields of the first line of the first training file next to the
# column names; zip stops at the shorter of the two sequences.
with open(training_files[0]) as f:
    first_record = f.readline().split()
for column_name, value in zip(columns, first_record):
    print("{0:35s}\t{1}".format(column_name, value))

1. longitude                       	150.225
2. latitude                        	-9.9
3. depth                           	-1318
4. sigh                            	0
5. sigd                            	-1
6. SID                             	54683
7. predicted_depth                 	-1311
8. ID                              	1
9. d10                             	0.918137582674
10. d20                            	0.772374974666
11. d60                            	0.592828516968
12. seafloor_age                   	NaN
13. curvature(VGG)                 	-49.8216911544
14. spreading_rate                 	4901.890625
15. sediment_thickness             	236.818668
16. seafloor_roughness             	75.0722478379
17. NDP_@2.5am                     	171.559721966
18. NDP_@5am                       	464.000644
19. NDP_@10am                      	2232.07499261
20. NDP_@30am                      	10060.9879654
21. STD_@2.5am                     	20.904548
22. STD_@5am                       	40.779

In [63]:
# Reload both file lists from their descriptor files. Every entry keeps its
# line terminator, so consumers must strip it before using it as a path.
with open(TRAINING_FILES_DESC) as f:
    training_files = list(f)
with open(TESTING_FILES_DESC) as f:
    testing_files = list(f)


In [58]:
import numpy as np

# Numeric code for the single-letter flag stored in column index 29.
data_type = {
    "M": 1,  # - multibeam
    "G": 2,  # - grid
    "S": 3,  # - single beam
    "P": 4,  # - point measurement
}

# Both descriptor files are only read here. Opening the testing descriptor in
# 'w' mode would truncate it and make readlines() raise io.UnsupportedOperation.
with open(TRAINING_FILES_DESC) as f:
    training_files = f.readlines()
with open(TESTING_FILES_DESC) as f:
    testing_files = f.readlines()

# 0-based column indices that are left out of the feature vector.
removed_features = [0, 1, 3, 4, 5, 7]
# A record is labelled True when column index 4 holds the string '9999'.
get_label = lambda cols: cols[4] == '9999'
# One array per parsed file; training_labels[i] holds the labels for the
# rows of training_features[i].
training_features = []
training_labels = []
for filename in training_files:
    filename = filename.strip()
    if not filename:
        continue
    features = []
    labels = []
    with open(filename) as fread:
        for line in fread:
            cols = line.strip().split()
            if not cols:
                continue
            # Replace the data-type letter with its numeric code so that the
            # whole row can be converted to floats below.
            cols[29] = data_type[cols[29]]
            labels.append(get_label(cols))
            features.append(np.array(
                [float(value) for i, value in enumerate(cols) if i not in removed_features]
            ))
    training_features.append(np.array(features))
    training_labels.append(np.array(labels))
    # Stop once 11 files have been parsed.
    # NOTE(review): this looks like a debugging limit - confirm before a full run.
    if len(training_features) > 10:
        break

# No longer needed

The rest of the code in this notebook is no longer required for pre-processing the bathymetry datasets.

# Collect

In [49]:
import os

# Recursively gather every file below the current directory whose name ends
# with "libsvm".
all_files = []
for root, subdirs, files in os.walk("./"):
    for filename in files:
        if filename.endswith("libsvm"):
            all_files.append(os.path.join(root, filename))

len(all_files)

9491

In [52]:
# Move every collected file from "<dir>/<name>" to "<dir>_libsvm/<name>".
for filename in all_files:
    old_dir, basename = os.path.split(filename)
    if old_dir.endswith("_libsvm"):
        # Already relocated by an earlier run; moving it again would nest it
        # under "<dir>_libsvm_libsvm".
        continue
    new_dir = old_dir + "_libsvm"
    # exist_ok avoids the separate existence check and its race window.
    os.makedirs(new_dir, exist_ok=True)
    os.rename(filename, os.path.join(new_dir, basename))

# Merge

In [69]:
import os

import shlex

# Write a shell script with one `cat` command per directory that concatenates
# the directory's libsvm files into "<dir>/data.libsvm".
with open("merge-files.sh", 'w') as f:
    for root, subdirs, files in os.walk("./"):
        # Skip an existing data.libsvm: it is the redirect target, so listing it
        # as an input would make `cat` read the file it is writing. Sorting
        # keeps the generated script stable between runs.
        t = [os.path.join(root, filename)
             for filename in sorted(files)
             if filename.endswith("libsvm") and filename != "data.libsvm"]
        if t:
            # Quote every path so names with spaces or shell metacharacters
            # survive; ordinary paths are emitted unchanged.
            command = "cat %s > %s" % (
                ' '.join(shlex.quote(path) for path in t),
                shlex.quote("%s/data.libsvm" % root),
            )
            f.write(command + '\n')

In [103]:
# Remove the individual files: delete every file whose name ends with "libsvm",
# except files named "data.libsvm".
for root, subdirs, files in os.walk("./"):
    individual = [name for name in files
                  if name.endswith("libsvm") and name != "data.libsvm"]
    for name in individual:
        os.remove(os.path.join(root, name))

# Shuffle

In [1]:
import os

# Paths of all files named exactly "data.libsvm" below the current directory.
all_files = [
    os.path.join(root, filename)
    for root, subdirs, files in os.walk("./")
    for filename in files
    if filename == "data.libsvm"
]

len(all_files)

17

In [2]:
from random import randint
from random import shuffle

def shuffle_limited_memory(filename, ntest, nparts):
    """Shuffle the lines of `filename` and split them into two files.

    Every line is first written to one of `nparts` temporary part files chosen
    at random, so only one part is held in memory at a time afterwards. The
    parts are then visited in random order; each is read, deleted, shuffled
    and appended to "testing.libsvm" (the first `ntest` parts) or to
    "training.libsvm" (all remaining parts). Both output files are created in
    the directory that contains `filename`.

    Args:
        filename: path of the text file to shuffle; it is not modified.
        ntest: number of parts whose lines go to the testing file.
        nparts: number of temporary part files; must be greater than `ntest`.

    Raises:
        AssertionError: if `ntest` is not smaller than `nparts`.
    """
    assert ntest < nparts
    subfiles = [filename + "_part%d" % i for i in range(nparts)]
    handlers = []
    try:
        for name in subfiles:
            handlers.append(open(name, 'w'))
        with open(filename) as f:
            for line in f:
                # Terminate every record here. A final line without a newline
                # would otherwise be glued to whichever line follows it once
                # the part has been shuffled.
                if not line.endswith('\n'):
                    line += '\n'
                handlers[randint(0, nparts - 1)].write(line)
    finally:
        # Close the part files even if reading or writing failed.
        for handler in handlers:
            handler.close()

    base = os.path.dirname(filename)
    shuffle(subfiles)
    with open(os.path.join(base, "training.libsvm"), 'w') as training, \
            open(os.path.join(base, "testing.libsvm"), 'w') as testing:
        for i, name in enumerate(subfiles):
            with open(name) as f:
                lines = f.readlines()
            os.remove(name)
            shuffle(lines)
            # Every line already ends with a newline, so an empty part adds
            # nothing to the output (no spurious blank lines).
            target = testing if i < ntest else training
            target.writelines(lines)

In [3]:
# Shuffle each merged file using 100 temporary parts, 10 of which feed the
# testing file.
for filename in all_files:
    print(filename)
    shuffle_limited_memory(filename, ntest=10, nparts=100)

./NGA_libsvm/data.libsvm
./lakes_libsvm/data.libsvm
./GEOMAR_libsvm/data.libsvm
./NGA2_libsvm/data.libsvm
./JAMSTEC_libsvm/data.libsvm
./GEBCO_libsvm/data.libsvm
./NOAA_libsvm/data.libsvm
./CCOM_libsvm/data.libsvm
./US_multi_libsvm/data.libsvm
./SIO_libsvm/data.libsvm
./3DGBR_libsvm/data.libsvm
./NAVO_libsvm/data.libsvm
./IFREMER_libsvm/data.libsvm
./AGSO_libsvm/data.libsvm
./NOAA_geodas_libsvm/data.libsvm
./NGDC_libsvm/data.libsvm
./IBCAO_libsvm/data.libsvm


# Move files

In [5]:
# Move each collected file from "<dir>/<name>" into a newly created sibling
# directory "<dir>_data"; os.mkdir raises if that directory already exists.
for filename in all_files:
    source_dir, base_name = filename.rsplit("/", 1)
    target_dir = "{}_data".format(source_dir)
    os.mkdir(target_dir)
    target_path = os.path.join(target_dir, base_name)
    os.rename(filename, target_path)

# Upload to S3

In [9]:
import os

# Write a script with one recursive `aws s3 cp` command for every directory
# whose path ends with "_libsvm".
with open("upload-s3.sh", 'w') as f:
    for root, subdirs, files in os.walk("./"):
        if not root.endswith("_libsvm"):
            continue
        # Drop the first two characters of the path (the leading "./" that
        # walking "./" produces) to form the destination name.
        dirname = root[2:]
        command = "aws s3 cp {} s3://tmsn-data/bathymetry/{}/ --recursive\n".format(root, dirname)
        f.write(command)