In [2]:
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
import torch.utils.data as data
import torch.autograd as autograd
import torch.nn.functional as F
import pytorch_lightning as pl

# SciKit
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc

# other
import seaborn as sns
from pylab import rcParams
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
rcParams['figure.figsize'] = 12, 8

tqdm.pandas()

## Dataset

In [4]:
df = pd.read_csv('./station/466920taipei_train.csv')
df = df[["Temperature", "Humidity", "DewPoint", "StationAirPressure"]]

# features_df = df[["Humidity", "DewPoint", "StationAirPressure"]]
# targets_df = df[["Temperature"]]

In [5]:
train_size = int(len(df) * 0.9)
train_size

86724

In [6]:
train_df, test_df = df[:train_size], df[train_size + 1:]
train_df.shape, test_df.shape

((86724, 4), (9635, 4))

In [7]:
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(df)

In [8]:
train_df = pd.DataFrame(scaler.transform(train_df), columns=train_df.columns, index=train_df.index)
test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns, index=test_df.index)

print(train_df.head())
print(test_df.head())

   Temperature  Humidity  DewPoint  StationAirPressure
0    -0.454023  0.993858  0.656923            0.997897
1    -0.448276  0.993858  0.658462            0.997825
2    -0.442529  0.993858  0.660000            0.997643
3    -0.442529  0.993858  0.660000            0.997553
4    -0.436782  0.993858  0.661538            0.997498
       Temperature  Humidity  DewPoint  StationAirPressure
86725     0.293103      -1.0      -1.0            0.995994
86726     0.304598      -1.0      -1.0            0.995921
86727     0.235632      -1.0      -1.0            0.995939
86728     0.172414      -1.0      -1.0            0.996012
86729     0.137931      -1.0      -1.0            0.996121


In [9]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):
    sequences = []
    data_size = len(input_data)

    for i in tqdm(range(data_size - sequence_length)):
        sequence = input_data[i : i+sequence_length]
        
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]

        sequences.append((sequence, label))
        
    return sequences

In [10]:
SEQUENCE_LENGTH = 120

train_sequences = create_sequences(train_df, 'Temperature', SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, 'Temperature', SEQUENCE_LENGTH)

print(train_sequences[0][0].shape)

100%|██████████| 86604/86604 [00:04<00:00, 17660.91it/s]
100%|██████████| 9515/9515 [00:00<00:00, 27144.43it/s]

(120, 4)





In [11]:
len(train_sequences), len(test_sequences)

(86604, 9515)

In [12]:
class TemperatureDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        sequences, label = self.sequences[idx]
        return dict(
            sequences=torch.Tensor(sequences.to_numpy()),
            label=torch.tensor(label).float()
        )

In [13]:
class DataModule(pl.LightningDataModule):
    def __init__(
            self, train_sequences, test_sequences, batch_size=8
    ):
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = TemperatureDataset(self.train_sequences)
        self.test_dataset = TemperatureDataset(self.test_sequences)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=2
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=1
        )
    
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=1
        )

In [14]:
N_EPOCHS = 8
BATCH_SIZE = 64

data_module = DataModule(train_sequences, test_sequences, batch_size=BATCH_SIZE)
data_module.setup()

In [15]:
train_dataset = TemperatureDataset(train_sequences)

In [16]:
for item in train_dataset:
    print(item["sequences"].shape)
    print(item["label"].shape)
    print(item["label"])
    break

torch.Size([120, 4])
torch.Size([])
tensor(-0.4195)


## Model

In [17]:
class LSTM(nn.Module):
    def __init__(self, n_features, n_hidden = 128, n_layers=2):
        
        super(LSTM, self).__init__()

        self.n_hidden = n_hidden

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            batch_first=True,
            num_layers=n_layers,
            dropout=0.2
        )

        self.linear = nn.Linear(n_hidden, 1)

    def forward(self, x):

        self.lstm.flatten_parameters()

        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]

        return self.linear(out)

In [18]:
class Predictor(pl.LightningModule):

    def __init__(self, n_features: int):
        super().__init__()
        self.model = LSTM(n_features)
        self.criterion = nn.MSELoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))
        return loss, output
    
    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]

        loss, outputs = self(sequences, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]

        loss, outputs = self(sequences, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]

        loss, outputs = self(sequences, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=0.0001)

In [19]:
model = Predictor(n_features=train_df.shape[1])

In [20]:
# %load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir ./lightning_logs

Reusing TensorBoard on port 6007 (pid 26688), started 0:08:02 ago. (Use '!kill 26688' to kill it.)

In [21]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = pl.loggers.TensorBoardLogger("lightning_logs", name="temperature-forecast")

early_stopping_callback = pl.callbacks.EarlyStopping(monitor="val_loss", patience=2)

trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    # checkpoint_callback=checkpoint_callback,
    # callbacks=[early_stopping_callback],
    max_epochs=N_EPOCHS
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [23]:
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision





LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params
--------------------------------------
0 | model     | LSTM    | 200 K 
1 | criterion | MSELoss | 0     
--------------------------------------
200 K     Trainable params
0         Non-trainable params
200 K     Total params
0.803     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


In [23]:
trained_model = Predictor.load_from_checkpoint("./checkpoints/best-checkpoint.ckpt")

FileNotFoundError: [Errno 2] No such file or directory: 'd:/WeatherAI/seriously/checkpoints/best-checkpoint.ckpt'