In [1]:
!cat get_baltic_sea_level_data.sh

#!/bin/bash

# Base URL of the raw water-level download service; station name, date and
# "down.txt" are appended below.
url_base="http://pegelonline.wsv.de/webservices/files/Wasserstand+Rohdaten"

# Year to download; used both in the request date (DD.MM.YYYY) and in the
# name of the local file (YYYY-MM-DD.txt).
year=2018

# get all data
# Day numbers 01..31 are requested for every month, so dates that do not
# exist in the calendar are requested as well.
for station_name in OSTSEE/LT+KIEL OSTSEE/GREIFSWALD-WIECK OSTSEE/FLENSBURG; do
    dst_dir="data/baltic_sea_level/${station_name}"
    mkdir -p -- "${dst_dir}"
    for m in {01..02}; do
        for d in {01..31}; do
            src_date_str="${d}.${m}.${year}"
            dst_date_str="${year}-${m}-${d}"
            # All expansions are quoted so that a station name or path with
            # spaces or glob characters cannot be split or expanded.
            curl \
                "${url_base}/${station_name}/${src_date_str}/down.txt" \
                -o "${dst_dir}/${dst_date_str}.txt"
        done
    done
done

# remove missing files (they contain error-404 HTML files)
# find runs grep on each file and only runs rm when grep found "html".
# The file name is handed over as a separate argument instead of being pasted
# into a `bash -c` string, so names with spaces or quotes are handled safely
# (and the conflicting xargs options -n1 / -I are no longer needed).
find data/baltic_sea_level -type f -iname '*.txt' \
     -exec grep -q html {} \; \
     -exec rm -fv {} \;

# convert to unix line endings
# Only a trailing carriage return is removed.  (Stripping "any last
# character" would eat real data from lines that have no CR, e.g. when the
# conversion is applied to a file a second time.)
# `\r` in the pattern and `-i` without a suffix are GNU sed features.
find data/baltic_sea_level -type f -iname '*.txt' \
     -exec sed -i 's/\r$//' {} +

# convert to unicode
find data/baltic_sea_level -type f -iname \*.t

In [2]:
rm -rf ../data/baltic_sea_level/*

In [3]:
!bash get_baltic_sea_level_data.sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7231  100  7231    0     0  77052      0 --:--:-- --:--:-- --:--:-- 88182
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7231  100  7231    0     0  83811      0 --:--:-- --:--:-- --:--:-- 91531
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7231  100  7231    0     0  97525      0 --:--:-- --:--:-- --:--:--  105k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  7231  100  7231    0     0  93408      0 --:--:-- --:--:-- --:--:--   98k
  % Total    % Received % Xferd  Average Speed   Tim

In [4]:
!head -n15 ../data/baltic_sea_level/OSTSEE/LT+KIEL/2018-01-16.txt | nl

     1	16.01.2018
     2	WSA LÜBECK
     3	OSTSEE
     4	LT KIEL
     5	9610050
     6	W_O
     7	cm
     8	XXX,XXX
     9	XX.XX.XXXX
    10	XX XX
    11	PNP
    12	-4,98
    13	00 01 459
    14	00 02 459
    15	00 03 459


In [5]:
!head -n15 ../data/baltic_sea_level/OSTSEE/KIEL-HOLTENAU/2018-01-16.txt | nl

head: cannot open 'data/baltic_sea_level/OSTSEE/KIEL-HOLTENAU/2018-01-16.txt' for reading: No such file or directory


In [6]:
%matplotlib inline
import numpy as np
import datetime
from pathlib import Path
import matplotlib.pyplot as plt

In [7]:
def _get_date_str_from_filename(file_name):
    """Extract date string from file name."""
    file_name = Path(file_name)
    date_str = file_name.name[:10]
    return date_str

assert (_get_date_str_from_filename(
            "../data/baltic_sea_level/OSTSEE/LT+KIEL/2018-01-16.txt")
        == "2018-01-16")

def _get_station_name_from_path(path):
    station_name = path.split("/")[2:4]
    station_name = "_".join(station_name)
    return station_name

assert (_get_station_name_from_path(
            "../data/baltic_sea_level/OSTSEE/LT+KIEL/2018-01-16.txt")
        == "OSTSEE_LT+KIEL")
assert (_get_station_name_from_path(
            "../data/baltic_sea_level/OSTSEE/LT+KIEL/")
        == "OSTSEE_LT+KIEL")
assert (_get_station_name_from_path(
            "../data/baltic_sea_level/OSTSEE/LT+KIEL")
        == "OSTSEE_LT+KIEL")

def _load_data_file(file_name):
    """Load one daily raw-data txt file.

    The first 12 lines of the file are skipped as header.  Every remaining
    line is read as whitespace-separated numeric columns, of which the first
    is used as hour, the second as minute and the third as the measured
    value.  The calendar date is taken from the file name via
    `_get_date_str_from_filename`.

    Parameters
    ----------
    file_name : str or pathlib.Path
        Path of a file named ``YYYY-MM-DD.txt``.

    Returns
    -------
    time_vector : numpy.ndarray
        ``datetime64[m]`` time stamps (date + hour + minute), one per row.
    sea_level : numpy.ndarray
        Third column of the file as floats, unchanged.
    """
    raw_data = np.genfromtxt(file_name, skip_header=12)

    # genfromtxt returns a 1-D array both for a file with a single data row
    # and for a file without any data rows; normalise to 2-D so that the
    # column indexing below works in those cases as well.
    if raw_data.ndim < 2:
        if raw_data.size == 0:
            raw_data = raw_data.reshape(0, 3)
        else:
            # assumes a lone 1-D result is one row of columns
            raw_data = raw_data.reshape(1, -1)

    sea_level = raw_data[:, 2]
    hours = raw_data[:, 0]
    minutes = raw_data[:, 1]

    date_str = _get_date_str_from_filename(file_name)
    time_origin = np.datetime64(date_str)

    # The explicit dtype keeps an empty list a timedelta array (np.asarray
    # would otherwise create a float array that cannot be added to a date).
    offsets = np.asarray(
        [np.timedelta64(int(h), "h") + np.timedelta64(int(m), "m")
         for h, m in zip(hours, minutes)],
        dtype="timedelta64[m]")

    time_vector = time_origin + offsets

    return time_vector, sea_level

def _get_all_files_for_station(station_path):
    station_path = Path(station_path)
    all_files = sorted(station_path.glob("????-??-??.txt"))
    return all_files

def convert_to_clean_file(station_path):
    """Merge all daily files of one station into a single text file.

    All ``????-??-??.txt`` files in `station_path` are loaded in sorted
    order and concatenated.  The result is written to
    ``../data/baltic_sea_level/<station name>.dat`` with a comment line
    holding the station name, a column-label line, and one
    ``<time stamp> <value>`` line per measurement.

    Parameters
    ----------
    station_path : str
        Directory containing the daily files of one station.

    Returns
    -------
    str
        Name of the file that was written.

    Raises
    ------
    ValueError
        If `station_path` contains no daily data files.
    """
    all_files = _get_all_files_for_station(station_path)
    if not all_files:
        raise ValueError(
            "no ????-??-??.txt files found in {}".format(station_path))

    # Parse every daily file exactly once and split the (time, level) pairs
    # afterwards, instead of reading the whole directory once per column.
    loaded = [_load_data_file(file_name) for file_name in all_files]
    time_vector = np.concatenate([times for times, _ in loaded])
    sea_level = np.concatenate([levels for _, levels in loaded])

    station_name = _get_station_name_from_path(station_path)

    output_file_name = "../data/baltic_sea_level/{}.dat".format(station_name)

    with open(output_file_name, mode="w") as f:
        f.write("# {}\n".format(station_name))
        # NOTE(review): the values are written exactly as loaded.  The raw
        # file shown in this notebook carries "cm" in its header, while this
        # label says "m" -- confirm the intended unit.
        f.write("time-stamp sea-level-in-m\n")
        for t, sl in zip(time_vector, sea_level):
            f.write("{:s} {:f}\n".format(t, sl))

    return output_file_name

In [8]:
# Convert each downloaded station and print the name of the file written.
station_dirs = (
    "../data/baltic_sea_level/OSTSEE/LT+KIEL/",
    "../data/baltic_sea_level/OSTSEE/GREIFSWALD-WIECK/",
    "../data/baltic_sea_level/OSTSEE/FLENSBURG/",
)
for station_dir in station_dirs:
    print(convert_to_clean_file(station_dir))

data/baltic_sea_level/OSTSEE_LT+KIEL.dat
data/baltic_sea_level/OSTSEE_GREIFSWALD-WIECK.dat
data/baltic_sea_level/OSTSEE_FLENSBURG.dat
