# Parse

In [1]:
import os

def is_final_tsv_file(filename):
    """Tell whether ``filename`` ends with the characters ``tsv``.

    The check is a plain, case-sensitive suffix match; no dot is required
    in front of ``tsv``.
    """
    wanted_suffix = "tsv"
    return filename.endswith(wanted_suffix)


# Gather, for the whole tree below the current directory, the path of every
# file accepted by is_final_tsv_file.
all_files = [
    os.path.join(root, name)
    for root, _subdirs, files in os.walk("./")
    for name in files
    if is_final_tsv_file(name)
]
len(all_files)

0

In [6]:
# Numbered list of the input columns, one entry per line.
column_listing = """
1. longitude 
2. latitude 
3. depth 
4. sigh 
5. sigd 
6. SID 
7. predicted_depth 
8. ID 
9. d10 
10. d20 
11. d60 
12. seafloor_age 
13. curvature(VGG)
14. spreading_rate 
15. sediment_thickness 
16. seafloor_roughness 
17. NDP_@2.5am 
18. NDP_@5am 
19. NDP_@10am 
20. NDP_@30am 
21. STD_@2.5am 
22. STD_@5am 
23. STD_@10am 
24. STD_@30am 
25. depth_SUB_median@2.5am 
26. depth_SUB_median@5am 
27. depth_SUB_median@10am 
28. depth_SUB_median@30am 
29. year 
30. data_type
"""

# Trim the surrounding blank lines of the literal, then keep one list item
# per line; the numbering prefix and any trailing space stay untouched.
trimmed_listing = column_listing.strip()
columns = trimmed_listing.split('\n')
columns

['1. longitude ',
 '2. latitude ',
 '3. depth ',
 '4. sigh ',
 '5. sigd ',
 '6. SID ',
 '7. predicted_depth ',
 '8. ID ',
 '9. d10 ',
 '10. d20 ',
 '11. d60 ',
 '12. seafloor_age ',
 '13. curvature(VGG)',
 '14. spreading_rate ',
 '15. sediment_thickness ',
 '16. seafloor_roughness ',
 '17. NDP_@2.5am ',
 '18. NDP_@5am ',
 '19. NDP_@10am ',
 '20. NDP_@30am ',
 '21. STD_@2.5am ',
 '22. STD_@5am ',
 '23. STD_@10am ',
 '24. STD_@30am ',
 '25. depth_SUB_median@2.5am ',
 '26. depth_SUB_median@5am ',
 '27. depth_SUB_median@10am ',
 '28. depth_SUB_median@30am ',
 '29. year ',
 '30. data_type']

In [None]:
# Convert every path in all_files into a text file with the same name but a
# ".libsvm" extension, one "<label> <index>:<value> ..." row per input row.
for tsv_path in all_files:
    libsvm_path = tsv_path.rsplit(".", 1)[0] + ".libsvm"
    with open(tsv_path) as source, open(libsvm_path, 'w') as sink:
        for raw_line in source:
            fields = raw_line.strip().split()
            if len(fields) == 0:
                # Nothing but whitespace on this row.
                continue
            # The field at position 4 is turned into the label (1 when it is
            # exactly '9999', otherwise 0) and taken out of the features.
            is_flagged = fields[4] == '9999'
            features = fields[:4] + fields[5:]
            pairs = []
            for index, value in enumerate(features):
                # "nan" (any letter case) and the exact string "0" are left
                # out; indices are counted after the label field is removed.
                if value.lower() == "nan" or value == "0":
                    continue
                pairs.append("%d:%s" % (index, value))
            sink.write("%d %s\n" % (is_flagged, ' '.join(pairs)))

# Collect

In [49]:
import os

# Rebuild all_files with the path of every file below the current directory
# whose name ends with "libsvm".
all_files = []
for root, subdirs, files in os.walk("./"):
    all_files.extend(
        os.path.join(root, name) for name in files if name.endswith("libsvm")
    )

len(all_files)

9491

In [52]:
# Move each path in all_files into a sibling directory whose name is the
# original directory name followed by "_libsvm", creating it when missing.
for filename in all_files:
    pieces = filename.rsplit("/", 1)  # [directory, file name]
    target_dir = "%s_libsvm/" % pieces[0]
    already_there = os.path.exists(target_dir)
    if not already_there:
        os.makedirs(target_dir)
    destination = target_dir + pieces[1]
    os.rename(filename, destination)

# Merge

In [69]:
import os

# Write a shell script with one "cat" command per directory that holds files
# ending in "libsvm"; each command concatenates them into <dir>/data.libsvm.
# The script is only generated here, not executed.
with open("merge-files.sh", 'w') as script:
    for root, subdirs, files in os.walk("./"):
        inputs = []
        for name in files:
            if name.endswith("libsvm"):
                inputs.append(os.path.join(root, name))
        if len(inputs) == 0:
            # Nothing to merge in this directory.
            continue
        merged_inputs = ' '.join(inputs)
        script.write("cat %s > %s/data.libsvm" % (merged_inputs, root) + '\n')

In [103]:
# Remove the individual files: every file ending in "libsvm" is deleted,
# except the ones named exactly "data.libsvm".

for root, subdirs, files in os.walk("./"):
    leftovers = [name for name in files
                 if name.endswith("libsvm") and name != "data.libsvm"]
    for name in leftovers:
        os.remove(os.path.join(root, name))

# Shuffle

In [1]:
import os

# Rebuild all_files with the path of every file below the current directory
# that is named exactly "data.libsvm".
all_files = [
    os.path.join(root, name)
    for root, _subdirs, files in os.walk("./")
    for name in files
    if name == "data.libsvm"
]

len(all_files)

17

In [2]:
from random import randint
from random import shuffle

def shuffle_limited_memory(filename, ntest, nparts, seed=None):
    """Shuffle the lines of ``filename`` into a training and a testing file.

    The lines are first scattered uniformly at random over ``nparts``
    temporary files named ``<filename>_part<i>``, so that only one part has
    to be held in memory at a time. The parts are then visited in random
    order; each one is read, deleted, shuffled and appended to
    ``testing.libsvm`` (the first ``ntest`` parts visited) or to
    ``training.libsvm`` (all remaining parts). Both output files are written
    to the directory that contains ``filename`` and are overwritten if they
    already exist. ``filename`` itself is left untouched.

    Every line written ends with a newline, including an input line that
    lacked one, and empty parts contribute nothing to the outputs.

    Args:
        filename: Path of the text file to shuffle.
        ntest: Number of parts that go to the testing file.
        nparts: Total number of temporary parts; must be greater than
            ``ntest``.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives every random choice, so the result is reproducible for
            the same input. When ``None`` (the default) the module-level
            ``random`` functions are used.

    Raises:
        AssertionError: If ``ntest`` is not smaller than ``nparts``.
    """
    import os
    import random

    assert(ntest < nparts)
    # The ``random`` module exposes the same randint/shuffle API as a
    # ``random.Random`` instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    subfiles = [filename + "_part%d" % i for i in range(nparts)]
    handlers = []
    try:
        for name in subfiles:
            handlers.append(open(name, 'w'))
        with open(filename) as f:
            for line in f:
                # Terminate an unterminated last line now; otherwise it could
                # be shuffled in front of another record and fuse with it.
                if not line.endswith('\n'):
                    line += '\n'
                handlers[rng.randint(0, nparts - 1)].write(line)
    finally:
        # Close whatever was opened, even if opening or writing failed.
        for handler in handlers:
            handler.close()

    # dirname + join also works for a bare file name without any directory.
    base = os.path.dirname(filename)
    rng.shuffle(subfiles)
    with open(os.path.join(base, "training.libsvm"), 'w') as training, \
            open(os.path.join(base, "testing.libsvm"), 'w') as testing:
        for i, name in enumerate(subfiles):
            with open(name) as f:
                lines = f.readlines()
            os.remove(name)
            rng.shuffle(lines)
            target = testing if i < ntest else training
            # Every line already ends with '\n'; an empty part writes nothing.
            target.writelines(lines)

In [3]:
# Shuffle every path in all_files through 100 temporary parts, 10 of which
# feed the testing file; each path is printed before it is processed.
N_TEST_PARTS, N_PARTS = 10, 100
for filename in all_files:
    print(filename)
    shuffle_limited_memory(filename, N_TEST_PARTS, N_PARTS)

./NGA_libsvm/data.libsvm
./lakes_libsvm/data.libsvm
./GEOMAR_libsvm/data.libsvm
./NGA2_libsvm/data.libsvm
./JAMSTEC_libsvm/data.libsvm
./GEBCO_libsvm/data.libsvm
./NOAA_libsvm/data.libsvm
./CCOM_libsvm/data.libsvm
./US_multi_libsvm/data.libsvm
./SIO_libsvm/data.libsvm
./3DGBR_libsvm/data.libsvm
./NAVO_libsvm/data.libsvm
./IFREMER_libsvm/data.libsvm
./AGSO_libsvm/data.libsvm
./NOAA_geodas_libsvm/data.libsvm
./NGDC_libsvm/data.libsvm
./IBCAO_libsvm/data.libsvm


# Move files

In [5]:
# Move each path in all_files into a newly created sibling directory named
# after its current directory with "_data" appended.
for filename in all_files:
    source_dir, base_name = filename.rsplit("/", 1)
    target_dir = "%s_data" % source_dir
    os.mkdir(target_dir)  # raises if the directory already exists
    destination = os.path.join(target_dir, base_name)
    os.rename(filename, destination)

# Upload to S3

In [9]:
import os

# Write a shell script with one recursive "aws s3 cp" command for every
# directory below the current one whose path ends with "_libsvm".
# The script is only generated here, not executed.
with open("upload-s3.sh", 'w') as script:
    for root, subdirs, files in os.walk("./"):
        if not root.endswith("_libsvm"):
            continue
        # Drop the first two characters, i.e. the "./" the walk starts from.
        dirname = root[2:]
        upload_command = "aws s3 cp {} s3://tmsn-data/bathymetry/{}/ --recursive\n".format(root, dirname)
        script.write(upload_command)