Weather Forecast

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile
import os
import numpy as np
from torchvision.datasets.utils import download_url
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn


Download Weather data

In [None]:
dataset_url = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"
download_url(dataset_url, './data')
zip_file = ZipFile('./data/jena_climate_2009_2016.csv.zip')
zip_file.extractall()
df = pd.read_csv('./data/jena_climate_2009_2016.csv.zip')
df.head()

Downloading https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip to ./data/jena_climate_2009_2016.csv.zip


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,Date Time,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
0,01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
1,01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1
2,01.01.2009 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6
3,01.01.2009 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0
4,01.01.2009 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3


Preparing dataset


In [None]:
titles = [
    "Pressure",
    "Temperature",
    "Temperature in Kelvin",
    "Temperature (dew point)",
    "Relative Humidity",
    "Saturation vapor pressure",
    "Vapor pressure",
    "Vapor pressure deficit",
    "Specific humidity",
    "Water vapor concentration",
    "Airtight",
    "Wind speed",
    "Maximum wind speed",
    "Wind direction in degrees",
]

feature_keys = [
    "p (mbar)",
    "T (degC)",
    "Tpot (K)",
    "Tdew (degC)",
    "rh (%)",
    "VPmax (mbar)",
    "VPact (mbar)",
    "VPdef (mbar)",
    "sh (g/kg)",
    "H2OC (mmol/mol)",
    "rho (g/m**3)",
    "wv (m/s)",
    "max. wv (m/s)",
    "wd (deg)",
]

date_time_key = "Date Time"

In [None]:
print(
    "The selected parameters are:",
    ", ".join([titles[i] for i in [0, 1, 5, 7, 8, 10, 11]]),
)
selected_features = [feature_keys[i] for i in [0, 1, 5, 7, 8, 10, 11]]
features = df[selected_features]
features.index = df[date_time_key]
features.head()

The selected parameters are: Pressure, Temperature, Saturation vapor pressure, Vapor pressure deficit, Specific humidity, Airtight, Wind speed


Unnamed: 0_level_0,p (mbar),T (degC),VPmax (mbar),VPdef (mbar),sh (g/kg),rho (g/m**3),wv (m/s)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01.01.2009 00:10:00,996.52,-8.02,3.33,0.22,1.94,1307.75,1.03
01.01.2009 00:20:00,996.57,-8.41,3.23,0.21,1.89,1309.8,0.72
01.01.2009 00:30:00,996.53,-8.51,3.21,0.2,1.88,1310.24,0.19
01.01.2009 00:40:00,996.51,-8.31,3.26,0.19,1.92,1309.19,0.34
01.01.2009 00:50:00,996.51,-8.27,3.27,0.19,1.92,1309.0,0.32


Scale data using minmaxscaler

In [None]:
scaler = MinMaxScaler()
features = scaler.fit_transform(features.values)
features

array([[0.81493857, 0.24863161, 0.03788602, ..., 0.08167896, 0.74321291,
        0.99726153],
       [0.81542998, 0.24216288, 0.03629417, ..., 0.07884288, 0.74934898,
        0.99723061],
       [0.81503686, 0.24050423, 0.0359758 , ..., 0.07827567, 0.75066599,
        0.99717776],
       ...,
       [0.84737101, 0.329242  , 0.06192295, ..., 0.08791832, 0.68526445,
        0.99726651],
       [0.84727273, 0.31149444, 0.05587393, ..., 0.08451503, 0.70073932,
        0.9973074 ],
       [0.84737101, 0.30170841, 0.05284941, ..., 0.08564946, 0.70918016,
        0.99728147]])

In [None]:
split_fraction = 0.815
train_split = int(split_fraction * len(features))
train_split

342749

In [None]:
train_data = features[0 : train_split - 1]
val_data = features[train_split:]
print(train_data.shape)
print(val_data.shape)

(342748, 7)
(77802, 7)


In [None]:
x_train = np.delete(train_data, 1, 1)
y_train = train_data[:, [1]]
print(train_data)
print("%%%%%%%%%%%%%%%")
print(y_train)
print("%%%%%%%%%%%%%%%")
print(x_train)
print("<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>>>")
x_val = np.delete(val_data, 1, 1)
y_val = val_data[:, [1]]
print(val_data)
print("%%%%%%%%%%%%%%%")
print(y_val)
print("%%%%%%%%%%%%%%%")
print(x_val)

[[0.81493857 0.24863161 0.03788602 ... 0.08167896 0.74321291 0.99726153]
 [0.81542998 0.24216288 0.03629417 ... 0.07884288 0.74934898 0.99723061]
 [0.81503686 0.24050423 0.0359758  ... 0.07827567 0.75066599 0.99717776]
 ...
 [0.68167076 0.91524299 0.75007959 ... 0.58082813 0.16390793 0.99760857]
 [0.68068796 0.91457953 0.74832856 ... 0.5757232  0.16420725 0.99746198]
 [0.67882064 0.91640405 0.75310411 ... 0.54849688 0.16330929 0.99765644]]
%%%%%%%%%%%%%%%
[[0.24863161]
 [0.24216288]
 [0.24050423]
 ...
 [0.91524299]
 [0.91457953]
 [0.91640405]]
%%%%%%%%%%%%%%%
[[0.81493857 0.03788602 0.00478157 0.08167896 0.74321291 0.99726153]
 [0.81542998 0.03629417 0.00456423 0.07884288 0.74934898 0.99723061]
 [0.81503686 0.0359758  0.00434688 0.07827567 0.75066599 0.99717776]
 ...
 [0.68167076 0.75007959 0.67811345 0.58082813 0.16390793 0.99760857]
 [0.68068796 0.74832856 0.67898283 0.5757232  0.16420725 0.99746198]
 [0.67882064 0.75310411 0.70180396 0.54849688 0.16330929 0.99765644]]
<<<<<<<<<<<<<<

In [None]:
print(x_train.dtype)

float64


create data loaders

we need custom dataset for LSTM which uses seq of readings too


In [None]:
class TimeseriesDataset(torch.utils.data.Dataset):   
    def __init__(self, X, y, seq_len=1):
        self.X = X
        self.y = y
        self.seq_len = seq_len

    def __len__(self):
        return self.X.__len__() - (self.seq_len-1)

    def __getitem__(self, index):
        return ((self.X[index:index+self.seq_len]), (self.y[index+self.seq_len-1]))

In [None]:
sequence_length = 120

train_dataset = TimeseriesDataset(x_train, y_train, 120)
val_dataset = TimeseriesDataset(x_val, y_val, 120)

batch_size = 256

train_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
#test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [None]:
dataiter = iter(train_loader)
dataiterVal = iter(val_loader)
sample_x, sample_y = dataiter.next()
sample_x_val, sample_y_val = dataiter.next()
print(sample_x.shape, sample_y.shape)
print(sample_x_val.shape, sample_y_val.shape)

torch.Size([256, 120, 6]) torch.Size([256, 1])
torch.Size([256, 120, 6]) torch.Size([256, 1])


Creating Model

In [None]:
class WeatherNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, drop_prob=0.5):
        super(WeatherNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.input_size = input_size
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_size)
        
        
    def forward(self, x, hidden):
        x = x.float() 
        batch_size = x.size(0)
        lstm_out, hidden = self.lstm(x, hidden)
        lstm_out = (lstm_out[:, sequence_length-1, :])
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden


initialize model

In [None]:
input_size = 6
output_size = 1
hidden_dim = 64
n_layers = 1
model = WeatherNet(input_size, output_size, hidden_dim, n_layers)
print(model)

WeatherNet(
  (lstm): LSTM(6, 64, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [None]:
lr=0.005
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
print(model.parameters())


<generator object Module.parameters at 0x7f32638f6fc0>


Start training

In [None]:
epochs = 5
counter = 0
print_every = 1000
clip = 5
valid_loss_min = np.Inf

model.train()
history = []
for i in range(epochs):
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        counter += 1
        h = tuple([e.data for e in h])
        optimizer.zero_grad()
        output, h = model(sample_x, h)
        loss = criterion(output.squeeze(), sample_y.float().squeeze())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        

        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inp, lab in val_loader:
                val_h = tuple([each.data for each in val_h])
                if inp.shape[0] == 256:
                    out, val_h = model(inp, val_h)
                    val_loss = criterion(out.squeeze(), lab.float().squeeze())
                    val_losses.append(val_loss.item())
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

        

Epoch: 1/5... Step: 1000... Loss: 0.000084... Val Loss: 0.020911
Validation loss decreased (inf --> 0.020911).  Saving model ...
Epoch: 2/5... Step: 2000... Loss: 0.000014... Val Loss: 0.014789
Validation loss decreased (0.020911 --> 0.014789).  Saving model ...
Epoch: 3/5... Step: 3000... Loss: 0.000011... Val Loss: 0.013289
Validation loss decreased (0.014789 --> 0.013289).  Saving model ...
Epoch: 3/5... Step: 4000... Loss: 0.000010... Val Loss: 0.010764
Validation loss decreased (0.013289 --> 0.010764).  Saving model ...
Epoch: 4/5... Step: 5000... Loss: 0.000011... Val Loss: 0.008502
Validation loss decreased (0.010764 --> 0.008502).  Saving model ...
Epoch: 5/5... Step: 6000... Loss: 0.000006... Val Loss: 0.007319
Validation loss decreased (0.008502 --> 0.007319).  Saving model ...


In [None]:
8l