In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
import os

# Prepare Data

In [11]:
data_dir = "../data/Oct0123_Dec3123/"
data_files = [file for file in os.listdir(data_dir) if file.endswith(".csv")]
data = []
for file in data_files:
    df = pd.read_csv(data_dir + file, index_col=0)
    df.loc[df["pm25"] < 0, "pm25"] = 0
    if df["pm25"].max() > 500:
        print("One outlier dropped")
        continue
    
    # decompose timestamp
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["hour"] = df["timestamp"].dt.hour
    df["day"] = df["timestamp"].dt.day
    df["month"] = df["timestamp"].dt.month
    df["year"] = df["timestamp"].dt.year
    df = df.loc[:, ["year", "month", "day", "hour", "pm25", "longitude", "latitude"]]
    df = df.groupby(["year", "month", "day", "hour"]).median().reset_index(drop=False)
    if len(df) < ((31 + 30 + 31) * 24):
        continue
    else:
        data.append(df.loc[:, ["pm25", "longitude", "latitude"]].to_numpy())
data = np.array(data)
print(data.shape)

One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
One outlier dropped
(36, 2208, 3)


In [17]:
class Geo_LSTM_Dataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
            
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.Y[index]
    
def construct_dataloader(data, seed=42, window_size=24, batch_size=256):
    np.random.seed(seed)

    n_stations = data.shape[0]
    permutation = np.random.permutation(n_stations)
    data_train = data[permutation[:int(n_stations * 0.8)]]
    data_test = data[permutation[int(n_stations * 0.8):]]
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []

    # construct training data
    for label_index in range(len(data_train)):
        X = np.concatenate([data_train[:label_index], data_train[label_index + 1:]], axis=0)
        Y = data_train[label_index: label_index+1]
        RLat = torch.from_numpy(Y[0, 0, 1] - X[:, 0, 1])
        RLon = torch.from_numpy(Y[0, 0, 2] - X[:, 0, 2])
        for t in range(window_size-1, X.shape[1]):
            history_readings = torch.from_numpy(X[:, t-window_size+1:t+1, 0])
            target_reading = Y[0, t, 0]
            X_train.append((history_readings, RLat, RLon))
            Y_train.append(target_reading)
    Y_train = torch.tensor(Y_train)
    train_dataest = Geo_LSTM_Dataset(X_train, Y_train)
    train_loader = DataLoader(train_dataest, batch_size=batch_size, shuffle=True)

    # construct testing data
    for label_index in range(len(data_test)):
        RLat = torch.from_numpy(data_test[label_index, 0, 1] - data_train[:, 0, 1])
        RLon = torch.from_numpy(data_test[label_index, 0, 2] - data_train[:, 0, 2])
        for t in range(window_size-1, data_test.shape[1]):
            history_readings = torch.from_numpy(data_train[:, t-window_size+1:t+1, 0])
            target_reading = data_test[label_index, t, 0]
            X_test.append((history_readings, RLat, RLon))
            Y_test.append(target_reading)
    Y_test = torch.tensor(Y_test)
    test_dataest = Geo_LSTM_Dataset(X_test, Y_test)
    test_loader = DataLoader(test_dataest, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader


In [18]:
train_loader, test_loader = construct_dataloader(data)
print(len(train_loader), len(test_loader))

239 69


# Construct Model

In [19]:
class Geo_Layer(nn.Module):
    def __init__(self, K=4):
        super(Geo_Layer, self).__init__()
        self.K = K

    def forward(self, X):
        # history_readings: (batch_size, n_stations, window_size)
        # RLat: (batch_size, n_stations)
        # RLon: (batch_size, n_stations)
        history_readings, RLat, RLon = X
        batch_size, n_stations, window_size = history_readings.shape

        # RDist, Rank, R_A: (batch_size, n_stations)
        RDist = torch.sqrt(RLat**2 + RLon**2)
        indice = torch.argsort(RDist)[:, :self.K]   # (batch_size, K)
        nearby_readings = history_readings[torch.arange(batch_size)[:, None], indice]

        return nearby_readings
    
class Geo_LSTM(nn.Module):
    def __init__(self, K=4, num_layers=4, hidden_size=128, fc_hidden=1024):
        super(Geo_LSTM, self).__init__()
        self.geo_layer = Geo_Layer(K)
        self.lstm = nn.LSTM(input_size=K, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Sequential(*[
            nn.Linear(hidden_size, fc_hidden),
            nn.ReLU(),
            nn.Linear(fc_hidden, 1)
        ])

    def forward(self, X):
        # nearby_readings: (batch_size, window_size, K)
        nearby_readings = self.geo_layer(X).permute(0, 2, 1).float()
        # output: (batch_size, window_size, hidden_size) -> (batch_size, 1)
        output, _ = self.lstm(nearby_readings)
        output = self.fc(output[:, -1, :]).squeeze()
        return output


# Train

In [40]:
batch_size = 64
epochs = 100
lr = 1e-3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
geo_lstm = Geo_LSTM().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(geo_lstm.parameters(), lr=lr)
train_loader, test_loader = construct_dataloader(data, batch_size=batch_size)
best_loss = 1e10
save_path = f"./model_weights2/geolstm.pt"
for epoch in range(epochs):
    for X, Y in train_loader:
        X = (X[0].to(device), X[1].to(device), X[2].to(device))
        Y = Y.to(device)
        optimizer.zero_grad()
        Y_pred = geo_lstm(X)
        loss = criterion(Y_pred, Y.float())
        loss.backward()
        optimizer.step()

    # Evaluate on train and test set
    with torch.no_grad():
        train_loss = 0
        for X, Y in train_loader:
            X = (X[0].to(device), X[1].to(device), X[2].to(device))
            Y = Y.to(device)
            Y_pred = geo_lstm(X)
            train_loss += criterion(Y_pred, Y).item()
        train_loss /= len(train_loader)

    with torch.no_grad():
        test_loss = 0
        for X, Y in test_loader:
            X = (X[0].to(device), X[1].to(device), X[2].to(device))
            Y = Y.to(device)
            Y_pred = geo_lstm(X)
            test_loss += criterion(Y_pred, Y).item()
        test_loss /= len(test_loader)
    
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

    # save model
    if test_loss < best_loss:
        best_loss = test_loss
        torch.save(geo_lstm.state_dict(), save_path)
        print(f"Model saved at {save_path}")

Epoch 1/100, Train Loss: 16.5414, Test Loss: 31.2485
Model saved at ./model_weights2/geolstm.pt
Epoch 2/100, Train Loss: 15.7367, Test Loss: 28.1042
Model saved at ./model_weights2/geolstm.pt
Epoch 3/100, Train Loss: 13.6701, Test Loss: 30.0912
Epoch 4/100, Train Loss: 13.7107, Test Loss: 35.6252
Epoch 5/100, Train Loss: 14.5203, Test Loss: 30.9043
Epoch 6/100, Train Loss: 12.3737, Test Loss: 29.8547
Epoch 7/100, Train Loss: 12.1259, Test Loss: 32.4155
Epoch 8/100, Train Loss: 11.4377, Test Loss: 27.5355
Model saved at ./model_weights2/geolstm.pt
Epoch 9/100, Train Loss: 10.4718, Test Loss: 29.6916
Epoch 10/100, Train Loss: 10.0157, Test Loss: 35.6552
Epoch 11/100, Train Loss: 9.4000, Test Loss: 31.4157
Epoch 12/100, Train Loss: 11.3364, Test Loss: 27.0907
Model saved at ./model_weights2/geolstm.pt
Epoch 13/100, Train Loss: 9.0546, Test Loss: 36.3665
Epoch 14/100, Train Loss: 8.1821, Test Loss: 38.3247
Epoch 15/100, Train Loss: 8.0330, Test Loss: 30.9375
Epoch 16/100, Train Loss: 7.501

In [57]:
Y_pred_all = []
Y_true_all = []
with torch.no_grad():
    for X, Y in test_loader:
        X = (X[0].to(device), X[1].to(device), X[2].to(device))
        Y = Y.to(device)
        Y_pred = geo_lstm(X)
        Y_pred_all.append(Y_pred.cpu().numpy())
        Y_true_all.append(Y.cpu().numpy())
Y_pred_all = np.concatenate(Y_pred_all)
Y_true_all = np.concatenate(Y_true_all)
RMSE = np.sqrt(np.mean((Y_pred_all - Y_true_all)**2))
CVRMSE = RMSE / Y_true_all.mean()
MAE = np.mean(np.abs(Y_pred_all - Y_true_all))
R2 = 1 - np.sum((Y_pred_all - Y_true_all)**2) / np.sum((Y_true_all - Y_true_all.mean())**2)
print("RMSE: ", RMSE)
print("CVRMSE: ", CVRMSE)
print("MAE: ", MAE)
print("R2: ", R2)

RMSE:  6.037914115878682
CVRMSE:  0.4725213670375175
MAE:  4.016839099237248
R2:  0.5572499000244238


In [59]:
Y_pred_all.min()

0.04552248

In [41]:
for batch in test_loader:
    X, Y = batch
    break

In [51]:
X[0][0, :, -1].mean()

tensor(6.8877, dtype=torch.float64)

In [48]:
Y[0]

tensor(5.8650, dtype=torch.float64)

In [49]:
df

Unnamed: 0,year,month,day,hour,pm25,longitude,latitude
0,2023,10,1,0,5.910,-119.73364,36.67353
1,2023,10,1,1,6.165,-119.73364,36.67353
2,2023,10,1,2,6.665,-119.73364,36.67353
3,2023,10,1,3,7.245,-119.73364,36.67353
4,2023,10,1,4,8.355,-119.73364,36.67353
...,...,...,...,...,...,...,...
2203,2023,12,31,19,5.250,-119.73364,36.67353
2204,2023,12,31,20,5.255,-119.73364,36.67353
2205,2023,12,31,21,5.790,-119.73364,36.67353
2206,2023,12,31,22,5.520,-119.73364,36.67353


In [52]:
torch.mean(X[0], dim=(1,2))

tensor([ 8.6459,  8.7520,  8.8508,  8.9446,  9.0471,  9.1492,  9.2306,  9.2463,
         9.2213,  9.2203,  9.1843,  9.1588,  9.1363,  9.0978,  9.0500,  9.0122,
         8.9825,  9.0739,  9.2106,  9.2890,  9.2582,  9.1897,  9.1461,  9.1211,
         9.1175,  9.1025,  9.0897,  9.0827,  9.0527,  9.0140,  8.9652,  8.9483,
         8.9429,  8.9328,  8.9823,  9.0630,  9.1803,  9.3503,  9.5614,  9.7992,
        10.0150, 10.0885, 10.1169, 10.1212, 10.1045, 10.1421, 10.1959, 10.2541,
        10.3001, 10.3498, 10.4064, 10.5052, 10.6040, 10.7168, 10.8347, 10.9386,
        11.0125, 11.0694, 11.1202, 11.1275, 11.0932, 11.0208, 10.9223, 10.8031],
       dtype=torch.float64)

In [56]:
Y_pred_all = []
Y_true_all = []
for X, Y in test_loader:
    history_readings = X[0]
    Y_pred = torch.mean(history_readings[:, :, -1], dim=1)
    Y_pred_all.append(Y_pred)
    Y_true_all.append(Y)
Y_pred_all = torch.cat(Y_pred_all)
Y_true_all = torch.cat(Y_true_all)
RMSE = torch.sqrt(torch.mean((Y_pred_all - Y_true_all)**2))
CVRMSE = RMSE / Y_true_all.mean()
MAE = torch.mean(torch.abs(Y_pred_all - Y_true_all))
print(f"RMSE: {RMSE:.4f}, CVRMSE: {CVRMSE:.4f}, MAE: {MAE:.4f}")

RMSE: 5.5349, CVRMSE: 0.4332, MAE: 3.9084
