In [1]:
%load_ext lab_black

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
import glob

%matplotlib inline

In [39]:
def load_data(number="01013500", data_root="basin_dataset_public_v1p2", huc="01"):
    """Read the forcing table and the streamflow record of a single basin.

    Parameters
    ----------
    number : str
        Basin identifier that prefixes both file names.
    data_root : str
        Directory containing the ``basin_mean_forcing`` and ``usgs_streamflow``
        trees. Defaults to the path the notebook has always used.
    huc : str
        Sub-directory (below ``daymet`` / ``usgs_streamflow``) holding the files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df``: the forcing table; its first column is ``"Time"`` (whole days
        elapsed since the first row) followed by the columns named in the
        file's header line.
        ``streamflow``: columns ``"Time"`` (whole days elapsed since the first
        row), ``"Flow"`` and ``"Status"``; the latter two are kept as strings.
    """
    forcing_path = (
        f"{data_root}/basin_mean_forcing/daymet/{huc}/"
        f"{number}_lump_cida_forcing_leap.txt"
    )
    streamflow_path = f"{data_root}/usgs_streamflow/{huc}/{number}_streamflow_qc.txt"

    # The column names live on the 4th line (index 3) and are space separated.
    # ``nrows=0`` parses only that line instead of scanning the whole file.
    names = pd.read_table(forcing_path, skiprows=3, nrows=0, sep=" ").columns
    # The first four header names are replaced by a single "Time" column: the
    # data rows are read tab-separated, so their leading field is one string
    # (presumably year/month/day/hour -- confirm against the data files).
    header = ["Time", *names[4:]]
    df = pd.read_table(forcing_path, names=header, skiprows=4)
    timestamps = pd.to_datetime(df["Time"])
    df["Time"] = (timestamps - timestamps.iloc[0]).dt.days

    # Columns by position: 0 is discarded, 1-3 are year/month/day, 4 is the
    # flow value and 5 the status flag.
    raw = pd.read_table(streamflow_path, header=None, sep=r"\s+")
    # Assemble the dates from their numeric components in one vectorised call
    # rather than concatenating strings row by row through chained indexing,
    # which is slow and does not write back under pandas copy-on-write.
    dates = pd.to_datetime(
        raw[[1, 2, 3]].rename(columns={1: "year", 2: "month", 3: "day"})
    )
    streamflow = pd.DataFrame(
        {
            "Time": (dates - dates.iloc[0]).dt.days,
            "Flow": raw[4].astype("str"),
            "Status": raw[5].astype("str"),
        }
    )
    return df, streamflow

In [139]:
class Data(torch.utils.data.Dataset):
    """Tensors of two forcing variables and streamflow for every usable basin.

    Attributes
    ----------
    basins : list of str
        Identifiers of the basins that were kept, in the order used along
        axis 1 of ``x`` and ``y``.
    x : torch.Tensor
        Shape ``(rows, basins, 2)``; the last axis holds ``"dayl(s)"`` and
        ``"prcp(mm/day)"``.
    y : torch.Tensor
        Shape ``(rows, basins, 1)``; the ``"Flow"`` column converted to float.

    Only rows whose ``"Status"`` differs from ``"M"`` in *every* kept basin
    are retained, so all basins share the same row axis.
    """

    def __init__(
        self,
        load_data,
        device,
        streamflow_dir="basin_dataset_public_v1p2/usgs_streamflow/01",
    ):
        """Discover basins on disk, load them and pack them into tensors.

        Parameters
        ----------
        load_data : callable
            Called as ``load_data(basin_id)``; must return ``(df, streamflow)``
            where ``df`` has ``"dayl(s)"`` and ``"prcp(mm/day)"`` columns and
            ``streamflow`` has ``"Flow"`` and ``"Status"`` columns.
        device : str or torch.device
            Device on which ``x`` and ``y`` are allocated.
        streamflow_dir : str
            Directory scanned for ``*_streamflow_qc.txt`` files; the part of
            each file name before that suffix is used as the basin id.

        Raises
        ------
        ValueError
            If no basin could be used, or if the kept basins do not all have
            the same number of rows.
        """
        suffix = "_streamflow_qc.txt"
        paths = glob.glob(f"{glob.escape(streamflow_dir)}/*{suffix}")
        # Sorted so the basin axis is reproducible; glob order is arbitrary.
        candidates = sorted(
            path.replace("\\", "/").rsplit("/", 1)[-1][: -len(suffix)]
            for path in paths
        )

        # Collect accepted basins instead of deleting rejected ones by index
        # from a second list: after one deletion those indices no longer line
        # up and a later rejection would remove the wrong basin.
        frames = {}
        for basin in candidates:
            df, streamflow = load_data(basin)
            if len(df) != len(streamflow):
                print(
                    f"Ignoring basin {basin} as streamflow has {len(streamflow)} points and inputs have {len(df)} points."
                )
                continue
            frames[basin] = (df, streamflow)
        if not frames:
            raise ValueError(f"No usable basin found in {streamflow_dir!r}.")
        basins = list(frames)

        # A row is kept only if no basin flags it with status "M".
        n_rows = len(frames[basins[0]][1])
        keep = np.ones(n_rows, dtype=bool)
        for basin in basins:
            status = frames[basin][1]["Status"]
            if len(status) != n_rows:
                raise ValueError(
                    f"Basin {basin} has {len(status)} rows, expected {n_rows}."
                )
            keep &= (status != "M").to_numpy(dtype=bool)
        n_keep = int(keep.sum())

        self.basins = basins
        self.x = torch.zeros((n_keep, len(basins), 2), device=device)
        self.y = torch.zeros((n_keep, len(basins), 1), device=device)
        for i, basin in enumerate(basins):
            df, streamflow = frames[basin]
            self.x[:, i, 0] = torch.tensor(
                df.loc[keep, "dayl(s)"].to_numpy(), dtype=torch.float32, device=device
            )
            self.x[:, i, 1] = torch.tensor(
                df.loc[keep, "prcp(mm/day)"].to_numpy(),
                dtype=torch.float32,
                device=device,
            )
            # "Flow" may arrive as strings, hence the explicit float cast.
            flow = streamflow.loc[keep, "Flow"].to_numpy().astype("float")
            self.y[:, i] = torch.tensor(
                flow, dtype=torch.float32, device=device
            ).view(-1, 1)

    def __len__(self):
        """Number of retained rows (size of axis 0 of ``x`` and ``y``)."""
        return self.x.shape[0]

    def __getitem__(self, index):
        """Return ``(x[index], y[index])``, indexing along the row axis."""
        return self.x[index], self.y[index]

In [46]:
class Net(nn.Module):
    """LSTM followed by a linear layer that maps each hidden state to one value.

    The LSTM is created without ``batch_first``, so it consumes input shaped
    ``(sequence, batch, inputs)``; the output has shape ``(sequence, batch, 1)``.

    Parameters
    ----------
    nodes : int
        Hidden size of the LSTM and input width of the linear layer.
    layers : int
        Number of stacked LSTM layers.
    inputs : int
        Number of features per time step.
    parallel : bool
        If true, wrap the LSTM in ``nn.DataParallel`` over ``cuda:0`` and
        ``cuda:1``.
    device : str or torch.device, optional
        Where the parameters are placed. ``None`` keeps the historical
        defaults: ``"cuda:0"`` when ``parallel`` else ``"cuda:1"``.
        NOTE(review): in parallel mode this should stay on ``cuda:0``, the
        first entry of ``device_ids`` -- confirm before overriding.
    """

    def __init__(self, nodes=51, layers=1, inputs=2, parallel=True, device=None):
        super().__init__()
        if device is None:
            device = "cuda:0" if parallel else "cuda:1"
        self.device = device

        lstm = nn.LSTM(input_size=inputs, hidden_size=nodes, num_layers=layers).to(
            self.device
        )
        if parallel:
            # Scatter/gather along dim 1 (the batch axis of a sequence-first
            # LSTM). The default dim=0 would cut the time axis into chunks,
            # each processed from a fresh hidden state.
            lstm = nn.DataParallel(
                lstm,
                device_ids=["cuda:0", "cuda:1"],
                output_device=self.device,
                dim=1,
            )
        self.lstm = lstm
        self.output_layer = nn.Linear(nodes, 1).to(self.device)

    def forward(self, x):
        """Run the LSTM over ``x`` and project every time step to one value."""
        # The final hidden/cell states are not needed.
        hidden_seq, _ = self.lstm(x)
        return self.output_layer(hidden_seq)

In [149]:
def train(model, data, epochs, lr, parallel=True, verbose=True):
    """Fit ``model`` to ``data`` with Adam on a mean-squared-error loss.

    Every epoch is a single optimisation step over the complete ``data.x`` /
    ``data.y`` tensors (no mini-batching).

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; it is switched to training mode.
    data : object
        Must expose tensors ``x`` (model input) and ``y`` (target).
    epochs : int
        Number of optimisation steps.
    lr : float
        Learning rate handed to ``torch.optim.Adam``.
    parallel : bool
        Accepted for backward compatibility; it has no effect here because the
        tensors and the model are already placed on their devices.
    verbose : bool
        If true, print the loss of every epoch (computed before the update).

    Returns
    -------
    None
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_func = nn.MSELoss()
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        # Call the module itself rather than ``.forward`` so registered hooks
        # are executed.
        y_pred = model(data.x)
        loss = loss_func(y_pred, data.y)
        if verbose:
            print(f"Epoch {epoch}: {loss.item()}")
        loss.backward()
        optimizer.step()

In [156]:
# Pick the device the dataset tensors are placed on: "cuda:0" when the model
# will run data-parallel, "cuda:1" otherwise.
parallel = True
device = "cuda:0" if parallel else "cuda:1"
data = Data(load_data, device)

Ignoring basin 01195100 as streamflow has 12111 points and inputs have 12784 points.


In [159]:
# Build a 100-unit network and run 100 full-batch epochs with per-epoch logging.
# NOTE(review): lr=5 is unusually large for Adam, and the logged loss hovers
# around 1.2e6 after the first few epochs -- confirm this value is intended.
model = Net(nodes=100, parallel=parallel)
train(model, data, epochs=100, lr=5, parallel=parallel, verbose=True)

Epoch 0: 1355400.375
Epoch 1: 1275901.125
Epoch 2: 1226000.125
Epoch 3: 1203959.875
Epoch 4: 1204329.0
Epoch 5: 1217177.75
Epoch 6: 1231055.375
Epoch 7: 1238342.875
Epoch 8: 1237275.875
Epoch 9: 1229991.375
Epoch 10: 1220003.625
Epoch 11: 1210638.0
Epoch 12: 1204250.125
Epoch 13: 1201872.25
Epoch 14: 1203168.0
Epoch 15: 1206720.875
Epoch 16: 1210640.75
Epoch 17: 1213287.75
Epoch 18: 1213795.375
Epoch 19: 1212202.625
Epoch 20: 1209242.5
Epoch 21: 1205963.125
Epoch 22: 1203342.0
Epoch 23: 1201994.875
Epoch 24: 1202034.5
Epoch 25: 1203101.875
Epoch 26: 1204551.125
Epoch 27: 1205714.375
Epoch 28: 1206142.625
Epoch 29: 1205730.875
Epoch 30: 1204698.25
Epoch 31: 1203455.5
Epoch 32: 1202428.375
Epoch 33: 1201901.75
Epoch 34: 1201937.375
Epoch 35: 1202381.75
Epoch 36: 1202954.25
Epoch 37: 1203373.75
Epoch 38: 1203467.125
Epoch 39: 1203220.25
Epoch 40: 1202759.0
Epoch 41: 1202276.75
Epoch 42: 1201946.625
Epoch 43: 1201854.625
Epoch 44: 1201977.75
Epoch 45: 1202209.25
Epoch 46: 1202415.5
Epoch 4