# S3 TS practical session: (1h30)

# Using Pytorch : TimeSeries and Neural Nets `torch.nn` :


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import logging

logging.basicConfig(level=logging.INFO)


We'll keep it simple and only consider 1-dimensional convolutions

# `torch.nn` interesting parts for series

Before doing any specific task, let's review some of the interesting building blocks that PyTorch offers for time series.

# (A) Dealing with variable length input with 1dConv & RNN's

Let's create two dummy series of two different lengths

In [None]:
# Conv1d expects 3-D input laid out as (batch, channels, sequence_length).
x_long = torch.rand(1,1,33)  ## 1 series, 1 channel, 33 time steps
x_small = torch.rand(1,1,10)  ## 1 series, 1 channel, 10 time steps

print(x_long)
print(x_small)




## 1-Dimensional Convolutions:

1D convolutions are like a sliding window over the series:

A `channel` groups the observations that belong together at each time step: multivariate series have multiple channels (for example, RGB images have 3 channels)

Convolutions have multiple parameters:

[`torch.nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')`](https://pytorch.org/docs/stable/nn.html#conv1d)


- `stride` controls the stride for the cross-correlation, a single number or a one-element tuple.

- `padding` controls the amount of implicit zero-paddings on both sides for padding number of points.

- `dilation` controls the spacing between the kernel points; also known as the à trous algorithm. It is harder to describe, but this link has a nice visualization of what dilation does.

- `groups` controls the connections between inputs and outputs. in_channels and out_channels must both be divisible by groups


## [>> Have a look at this visualization to understand how they behave ! <<](https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md)


## Knowing the output size:

This formula should give you the output size per channel

$$ \left\lfloor \frac{W - K + 2P}{S} \right\rfloor + 1 $$

- W is the input length (number of time steps)
- K is the Kernel size
- P is the padding
- S is the stride

## basic 1-d => one sliding window

With one input channel and one output channel

In [None]:
# One input channel, one output channel, kernel of width 4, stride 1.
conv_module1 = torch.nn.Conv1d(1,1,4,stride=1)
output_long = conv_module1(x_long)
output_small = conv_module1(x_small)

print(output_long.size()) # 33-4+1 => 30 values (a 4-wide window slid over the 33 points of x_long)
print(output_long)


print(output_small.size()) # 10-4+1 => 7 values (the same window slid over the 10 points of x_small)
print(output_small)


### => The sizes are not the same :(



## basic 1-d => multiple sliding windows + max-pooling along channels

With one input channel and MORE output channel

In [None]:
# One input channel, four output channels, kernel of width 4.
conv_module1 = torch.nn.Conv1d(1,4,4)

# torch.max over dim=-1 (the sequence axis) keeps one value per output channel,
# so the result size no longer depends on the input length.
output_long,_ =  torch.max(conv_module1(x_long), dim=-1)
output_small,_ = torch.max(conv_module1(x_small), dim=-1)



print(output_long.size()) 
print(output_long)

print("")

print(output_small.size()) 
print(output_small)



### => Now, they have the same size ! :)

## RNNs


RNNs are neural network cells that process the time series sequentially (time step after time step) instead of in parallel like CNNs. This is because each output at time $t$ is conditioned on the state at time $t-1$

In [None]:
# NOTE: torch.nn recurrent layers only read the (batch, seq, features) layout used here
# when built with batch_first=True; their default layout is (seq, batch, features).
x_long_rnn = torch.rand(1,33,6)  ## Data needs to be 3d for RNN ! (batch,seq,serie_size)
x_small_rnn = torch.rand(1,12,6) 

### [`torch.nn` rnn'](https://pytorch.org/docs/stable/nn.html#recurrent-layers)


You have two kinds:
    - (RNN/GRU/LSTM) => These layers process the whole sequence in one call: X(SEQ) => OUT 
    - (RNN/GRU/LSTM)Cell => These cells process one time step per call, so a `for` loop is needed to go through a whole sequence: X(s)=>X(e)=>X(q) => OUT
    
 Both can take variable-length inputs. With the former, sequences that are batched together must be padded with 0's so that they all have the same length. The latter (XCell) processes inputs via a `for` loop, so padding can be avoided.
 

In [None]:
# Input size 5, hidden size 5.
# batch_first=True makes the layers read their input as (batch, seq, features),
# which is the layout every tensor in this notebook is built with; the PyTorch
# default is (seq, batch, features), under which indexing dimension 1 would
# select a batch element instead of a time step.
small_rnn = nn.RNN(5,5,batch_first=True)
small_gru = nn.GRU(5,5,batch_first=True)
small_lstm = nn.LSTM(5,5,batch_first=True)

## Pytorch's RNN  all behave in the same way:

### (TODO) => Try the different values for the "CELL" variable

In [None]:
# Exercise placeholder: choose one of the layers built in the previous cell.
CELL = #To complete #small_rnn # small_gru # small_lstm


# The (1,4,5) input is read as 1 series of 4 timesteps of 5 values only when CELL
# was built with batch_first=True; with PyTorch's default layout it is read as
# (seq=1, batch=4, features=5).
output,hidden = CELL(torch.rand(1,4,5))

print("output:")
print("------")
print(output)
print(output.size())


full_seq_rnn = output[:,-1,:] # We select the last index along dimension 1.
full_seq_max,_ = torch.max(output,1) # we aggregate (max) along dimension 1

print("\n\n")

print("\"aggregation\"")
print("---")
print(full_seq_rnn)
print(full_seq_max)

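The output has the same length as the input sequence because there is one output per timestep (dimension #1).
To get a single fixed-size vector per sequence, aggregate over that dimension, e.g. keep the last timestep or take the max, as done above.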


## (TODO) process `x_long_rnn` and `x_small_rnn` with a `rnn_cell` so they have the same size


In [None]:
# Exercise placeholder: build a recurrent layer whose input size matches the
# last dimension (6) of x_long_rnn / x_small_rnn.
rnn_cell = nn.RNN( ## To complete

out_long,_ = rnn_cell(x_long_rnn)
out_small,_ = rnn_cell(x_small_rnn)

# Exercise placeholder: reduce each output so that it no longer depends on the sequence length.
out_long = out_long # to complete
out_small = out_small # to complete

print(out_small.size())
print(out_long.size())


print("Outputs are of equal sizes:", out_long.size() == out_small.size())


### => Take a moment to ponder on what output size is

## Pytorch's Cells also all *nearly behave in the same way:

*The LSTMCell output is a tuple of tensors instead of just one tensor (it returns cell state)


In [None]:
# Step-wise cells with input size 5 and hidden size 5: each call handles one time step.
small_rnnC = nn.RNNCell(5,5)
small_gruC = nn.GRUCell(5,5)
small_lstmC = nn.LSTMCell(5,5)

small_rnnC(torch.rand(1,5)) # Processes one example of size 5 (no hidden state given)

In [None]:
# Exercise placeholder: choose one of the step-wise cells built in the previous cell.
CELL = #To complete #small_rnnC # small_gruC # small_lstmC


input_seq = torch.rand(1,4,5)  # (batch=1, seq=4, features=5)

outputs = []
state = None  # None lets the cell start from its default (zero) state
for input_vec in input_seq.unbind(dim=1):  # one (batch, features) slice per timestep

    # Feed the previous state back in: without it every step would restart
    # from a zero state and nothing would be carried across timesteps.
    state = CELL(input_vec, state)

    # LSTMCell returns a (hidden, cell) tuple; RNNCell and GRUCell return the hidden state only.
    output = state[0] if isinstance(state, tuple) else state

    outputs.append(output)


print("outputs:")
print("------")
print(outputs)
print("")
print("outputs.size()")
print("------")
print([x.size() for x in outputs])


print("\n"*2)

print("aggregation")
print("------")

# Stack the per-timestep outputs along dim 0, then take the element-wise max over timesteps.
concatenated = torch.cat(outputs,dim=0)
maxed,_ = torch.max(concatenated,dim=0)


print(outputs[-1])  # we can just select the last one
print(maxed)  # or do a max
print()



## (TODO) process `x_long_rnn` and `x_small_rnn` with a `*(RNN)Cell` so they have the same size

In [None]:
# Exercise placeholder: build a step-wise cell whose input size matches the
# last dimension (6) of x_long_rnn / x_small_rnn.
CELL_C = nn.RNNCell( ## To complete

# Exercise placeholder: loop over the timesteps of both series and define
# out_long / out_small so that their sizes do not depend on the sequence length.
# To complete
    
print(out_small.size())
print(out_long.size())


print("Outputs are of equal sizes:", out_long.size() == out_small.size())

# (B) Using those parts for classification
## Here, we are in the supervised learning framework: Signal classification

###  We propose to reuse our simple timeseries classification task to tryout those convolutions/rnn's



Given this dataset, we'll do the same task as before: prediction of the day of the week.

### (a) Loading data/create dataset

In [None]:
# Let's load the data and only consider the count as a series.
df = pd.read_csv("https://raw.githubusercontent.com/cedias/csvdata/master/train.csv",parse_dates=["datetime"],usecols=['datetime','count'])
df.head()

# Calendar fields derived from the timestamp; "y" (the weekday) is the classification label.
df["y"] = df.datetime.dt.weekday
df["day"] = df.datetime.dt.day
df["hour"] = df.datetime.dt.hour
df["month"] = df.datetime.dt.month
df["time"] = df.datetime.dt.time
df["year"] = df.datetime.dt.year

hour_index = list(range(24))

def paddedlist(df):
    """Return one day's hourly counts, padded to 24 values, with its weekday label.

    Args:
        df: rows of a single day with at least the columns "hour", "count" and "y".

    Returns:
        A ``(counts, weekday)`` tuple: ``counts`` is the list of the day's counts
        indexed by hour (missing hours are filled with 0) and ``weekday`` is the
        "y" value of the day's first row.
    """
    # Read the label BEFORE padding: rows created by reindex() are filled with 0,
    # so a day whose hour 0 is missing would otherwise be labelled 0.
    weekday = df["y"].iloc[0]

    ndf = df.set_index("hour")

    if len(ndf.index) < 24:
        ndf = ndf.reindex(hour_index).fillna(0)

    # Here: I fill missing data with a 0, I could have used other strategies:
    #pad / ffill: propagate last valid observation forward to next valid
    #backfill / bfill: use next valid observation to fill gap
    #nearest: use nearest valid observations to fill gap

    counts = ndf["count"].tolist()

    return  (counts,weekday)


# One group per calendar day -> one (counts, weekday) pair per day, shuffled.
# The columns are selected with a list: selecting several columns of a groupby
# with a bare tuple is rejected by recent pandas versions.
X,Y = zip(*(df.groupby(["day","month","year"])[["hour","count","y"]]
            .apply(paddedlist)
            .reset_index(drop=True)
            .sample(frac=1)
            .tolist()
           ))

X = np.array(X)
Y = np.array(Y)


# Plot the first five series, labelled with their class.
for x,y in zip(X[:5],Y[:5]):
    plt.plot(x,label=int(y))
plt.legend(title="signal class")  # drawn once, after every curve has been added

# The last 42 (shuffled) days are held out for testing.
X_train = X[:-42]
Y_train = Y[:-42]

X_test = X[-42:]
Y_test = Y[-42:]

## First - two really simple convolutionnal Neural Nets:

**Note**: We will build non-batched implementations for simplicity, but in practice you'll work with batches of data.

### (TODO): Complete the following networks by calculating all the convolutions output size

In [None]:

class EasyNet(nn.Module):
    """Classifier built on six parallel single-channel 1-d convolutions.

    The outputs of the convolutions (kernel sizes 1, 2, 4, 8, 12 and 24) are
    concatenated along the last axis and passed through two linear layers.
    """
    def __init__(self,num_classes):
        super(EasyNet, self).__init__()
        self.conv1 = torch.nn.Conv1d(1,1,1) # => yields 24 values
        self.conv2 = torch.nn.Conv1d(1,1,2) # => yields 23 values
        self.conv4 = torch.nn.Conv1d(1,1,4) # => yields 21 values 
        self.conv8 = torch.nn.Conv1d(1,1,8) # => ...
        self.conv12 = torch.nn.Conv1d(1,1,12)
        self.conv24 = torch.nn.Conv1d(1,1,24)
        
        # Exercise placeholder: total number of values produced by the six convolutions.
        size_all_convs =  # To complete
        
        # NOTE: these two dropout modules are created but not used in forward().
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        
        print(size_all_convs//2)
        
        self.t1 = nn.Linear(size_all_convs, 24)
        self.t2 = nn.Linear(24, num_classes)

    def forward(self, x):
        """Return the class scores (no softmax applied) computed from x."""
        
        # Concatenate the six convolution outputs along the last axis.
        all_convs = torch.cat([self.conv1(x),self.conv2(x),self.conv4(x),self.conv8(x),self.conv12(x),self.conv24(x)],dim=-1)
        first_transform = torch.tanh(self.t1(all_convs))
        second_transform = self.t2(first_transform)
        
        output = second_transform 
        
        return output


## A small test case to test if the network works

In [None]:
net = EasyNet(7) # We do 7 way classification

# Add two leading axes of size 1 in front of the series values.
data_point = torch.Tensor(X[0]).unsqueeze(0).unsqueeze(0)

print(net(data_point))

### (TODO) Optimizing a model, quickly:
 => Complete this rather simple model optimization routine:

In [None]:
BATCH_SIZE = 16

model = net
optim = torch.optim.Adam(model.parameters())
loss = torch.nn.CrossEntropyLoss()

for epoch in range(15): # doing 15 epochs
    logging.info("Iteration %d", epoch)

    sum_loss = 0
    optim.zero_grad() # we reset gradients
    for i,(x,y) in enumerate(zip(X_train,Y_train)): 
        
        x = torch.Tensor(x).unsqueeze(0).unsqueeze(0)
        y = torch.LongTensor([y])
                
        

        yhat = ## to complete
        
        yhat = yhat.squeeze(0)
        
        
        ex_loss =  ## to complete
        ex_loss.backward()


        sum_loss += ex_loss.item()
        
        if i% BATCH_SIZE ==0:
            optim.step()
    
    print("Train loss :", sum_loss/len(X_train))
    
    sum_pred = 0
    
    for x,y in zip(X_test,Y_test): 
        
        x = torch.Tensor(x).unsqueeze(0).unsqueeze(0)
        y = torch.LongTensor([y])
                
        yhat = model(x)
        
        yhat = yhat.squeeze(0)
        _,inds = torch.max(yhat,dim=-1)

        if inds == y:
            sum_pred +=1
        
    
    print("Test accuracy : ",sum_pred/len(X_test) *100) 
    
    



## (TODO) Now let's try max pooling on "channels"

=> You have to take a `torch.max` over the sequence (last) dimension, keeping one value per channel, so that the convolution yields tensors of the same size whatever the input length

In [None]:

class EasierNet(nn.Module):
    """Classifier built on one 128-channel convolution followed by two linear layers."""
    def __init__(self,num_classes):
        super(EasierNet, self).__init__()
        self.conv1 = torch.nn.Conv1d(1,128,8) # 1 input channel, 128 output channels, kernel of width 8
    
        self.t1 = nn.Linear(128, 64) # expects one value per convolution channel
        self.t2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return the class scores (no softmax applied) computed from x."""
        
        # Exercise placeholder: reduce the convolution output to 128 values (one per channel).
        all_convs,_ = ##to complete
        # Dropout is only active while the module is in training mode.
        first_transform = torch.dropout(torch.relu(self.t1(all_convs)),p=0.1,train=self.training)
        second_transform = self.t2(first_transform)
        
        output = second_transform 
        
        return output


### Test case

In [None]:
net2 = EasierNet(7) # We do 7 way classification

# Add two leading axes of size 1 in front of the series values.
data_point = torch.Tensor(X[0]).unsqueeze(0).unsqueeze(0)

print(net2(data_point))

## (Todo) Simple optimizing scheme : SGD

### Complete the following cell. Also, the implementation here is not batched: try to add gradient batching


In [None]:
import torch.optim

BATCH_SIZE = 16


model = net2
optim = torch.optim.Adam(model.parameters(),lr=0.0001)
loss = torch.nn.CrossEntropyLoss()

for epoch in range(15): # doing 15 epochs
    logging.info("Iteration %d", epoch)

    sum_loss = 0
    
    optim.zero_grad() # we reset gradients
    model.train() #we set model in train mode
    for i,(x,y) in enumerate(zip(X_train,Y_train)): 
        
        x = torch.Tensor(x).unsqueeze(0).unsqueeze(0)
        y = torch.LongTensor([y])
                

        yhat = # To complete
        yhat = yhat
       
        
        l = loss(yhat,y)

        sum_loss+=l.item()
        l.backward()
        
        optim.step()
    
    
    print("Training loss:", sum_loss/len(X_train))
    
    sum_pred = 0
    model.eval()
    
    for x,y in zip(X_test,Y_test): 
        
        x = torch.Tensor(x).unsqueeze(0).unsqueeze(0)
        y = torch.LongTensor([y])
                
        yhat = model(x)
        
        yhat = yhat.squeeze(0)
        _,inds = torch.max(yhat,dim=-1)

        if inds == y:
            sum_pred +=1
        
    
    print("Test Accuracy:", sum_pred/len(X_test) *100) 
    
    



## Second:  Two different RNN's : One with max pooling, one without

### (TODO) RNN: Max-pooling of all timestep

 => Here we want a model which does max pooling on all rnn's outputs to concatenate them in a single one

In [None]:
class EasyRecNet(nn.Module):
    def __init__(self,num_classes,rnn_cell=nn.RNN):
        super(EasyRecNet, self).__init__()
        
        self.rnn = rnn_cell(1,num_classes*2)    
        self.t1 = nn.Linear(num_classes*2, num_classes)

    def forward(self, x):
        
        seq,_ = self.rnn(x)
        
        pooled,_ = ## To complete
        output = self.t1(pooled) 
        
        return output

##  Test case:

In [None]:
net3 = EasyRecNet(7,nn.RNN) # We do 7 way classification with a classic RNN

# Add one leading and one trailing axis of size 1 around the series values.
data_point = torch.Tensor(X[0]).unsqueeze(0).unsqueeze(-1)


print(net3(data_point))

## (TODO) Experiment with multiple rnn cells 
#### Optimizing again:

In [None]:
BATCH_SIZE = 16  # read by the training loop below to decide when to call optim.step()

# Exercise placeholder: recurrent layer class to plug into the network (e.g. nn.RNN, nn.GRU, nn.LSTM).
CELL_RNN = #TO complete 

net3 = EasyRecNet(7,CELL_RNN) # We do 7 way classification with the chosen recurrent layer


model = net3
optim = torch.optim.Adam(model.parameters())
loss = torch.nn.CrossEntropyLoss()

for epoch in range(15): # doing 15 epochs
    logging.info("Iteration %d", epoch)

    sum_loss = 0

    optim.zero_grad() # we start the epoch with clean gradients
    model.train() #we set model in train mode
    for i,(x,y) in enumerate(zip(X_train,Y_train), start=1):

        x = torch.Tensor(x).unsqueeze(0).unsqueeze(-1)
        y = torch.LongTensor([y])


        yhat = model(x)

        l = loss(yhat,y)

        sum_loss+=l.item()
        l.backward() # gradients add up until the next optimizer step

        # Update once every BATCH_SIZE examples (and on the last, possibly partial,
        # batch), then clear the gradients so that batches do not leak into each other.
        if i % BATCH_SIZE == 0 or i == len(X_train):
            optim.step()
            optim.zero_grad()


    print("Training loss:", sum_loss/len(X_train))

    sum_pred = 0
    model.eval()

    with torch.no_grad(): # no gradient bookkeeping needed while evaluating
        for x,y in zip(X_test,Y_test):

            x = torch.Tensor(x).unsqueeze(0).unsqueeze(-1)
            y = torch.LongTensor([y])

            yhat = model(x)

            yhat = yhat.squeeze(0)
            _,inds = torch.max(yhat,dim=-1) # index of the highest score = predicted class

            if inds == y:
                sum_pred +=1


    print("Test Accuracy:", sum_pred/len(X_test) *100)
    

### (TODO) : RNN: Taking the final output as a sequence aggregate

 => Try to Select the right output

In [None]:
class EasierRecNet(nn.Module):
    def __init__(self,num_classes,rnn_cell=nn.RNN):
        super(EasierRecNet, self).__init__()
        
        self.rnn = rnn_cell(1,num_classes*2)    
        self.t1 = nn.Linear(num_classes*2, num_classes)

    def forward(self, x):
        
        seq,_ = self.rnn(x)
        
        final_rnn_output = # to complete
        output = self.t1(final_rnn_output) 
        
        return output

### A Test case:

In [None]:
net4 = EasierRecNet(7,nn.RNN) # We do 7 way classification with a classic RNN

# Add one leading and one trailing axis of size 1 around the series values.
data_point = torch.Tensor(X[0]).unsqueeze(0).unsqueeze(-1)


print(net4(data_point))

### (TODO) The optimization routine

=> once again choose whatever rnn you'd like

In [None]:
BATCH_SIZE = 16  # read by the training loop below to decide when to call optim.step()

# Exercise placeholder: recurrent layer class to plug into the network (e.g. nn.RNN, nn.GRU, nn.LSTM).
CELL_RNN =  #TO complete

net4 = EasierRecNet(7,CELL_RNN) # We do 7 way classification with the chosen recurrent layer


model = net4
optim = torch.optim.Adam(model.parameters())
loss = torch.nn.CrossEntropyLoss()

for epoch in range(15): # doing 15 epochs
    logging.info("Iteration %d", epoch)

    sum_loss = 0

    optim.zero_grad() # we start the epoch with clean gradients
    model.train() #we set model in train mode
    for i,(x,y) in enumerate(zip(X_train,Y_train), start=1):

        x = torch.Tensor(x).unsqueeze(0).unsqueeze(-1)
        y = torch.LongTensor([y])


        yhat = model(x)

        l = loss(yhat,y)

        sum_loss+=l.item()
        l.backward() # gradients add up until the next optimizer step

        # Update once every BATCH_SIZE examples (and on the last, possibly partial,
        # batch), then clear the gradients so that batches do not leak into each other.
        if i % BATCH_SIZE == 0 or i == len(X_train):
            optim.step()
            optim.zero_grad()


    print("Training loss:", sum_loss/len(X_train))

    sum_pred = 0
    model.eval()

    with torch.no_grad(): # no gradient bookkeeping needed while evaluating
        for x,y in zip(X_test,Y_test):

            x = torch.Tensor(x).unsqueeze(0).unsqueeze(-1)
            y = torch.LongTensor([y])

            yhat = model(x)

            yhat = yhat.squeeze(0)
            _,inds = torch.max(yhat,dim=-1) # index of the highest score = predicted class

            if inds == y:
                sum_pred +=1


    print("Test Accuracy:", sum_pred/len(X_test) *100)
    

## => Take some time to ponder on how dimensions interacts and how different RNNs work

# (C) RNNs forecasting : temperature prediction

### Now, we propose to do a little temperature forecast exercise, using rnn's.

### Our task is the following: given a series of $t$ temperatures, the goal is to predict the next temperatures $t+1, t+2, t+...$

In [None]:
import torch
import torch.nn as nn
import torch.optim
from torch.utils.data import Dataset,DataLoader
import csv
import pandas as pd
import numpy as np

import time
import unicodedata
import string
from itertools import chain


## A quick glance at the dataset

In [None]:
# Download the temperature table; read_temps() below uses every column but the first.
TEMP_DATA = pd.read_csv("https://raw.githubusercontent.com/cedias/csvdata/master/tempAMAL_train.csv")
TEMP_DATA.head()

# Helper functions

In [None]:
def fill_na(mat):
    """Fill the NaN entries of a 2-d float array in place, column by column.

    A missing value is replaced by the mean of the values just above and just
    below it. When the value below is missing too (or there is no row below),
    the value above is copied. A gap at the very top of a column, which has no
    value above, is filled with the first valid value found further down.
    A column made only of NaN is left untouched.

    Args:
        mat: 2-d float ``numpy`` array (rows are time steps); modified in place.

    Returns:
        The same array object ``mat``.
    """
    n_rows = mat.shape[0]
    # np.where lists positions row by row, so the row above a gap has always
    # been filled by the time the gap itself is processed.
    rows, cols = np.where(np.isnan(mat))
    for i, j in zip(rows, cols):
        if i == 0 or np.isnan(mat[i-1, j]):
            # Nothing usable above (top of the column): take the next valid value.
            below = mat[i+1:, j]
            valid = below[~np.isnan(below)]
            if valid.size:
                mat[i, j] = valid[0]
            continue

        if i + 1 >= n_rows or np.isnan(mat[i+1, j]):
            # Nothing usable below: propagate the value above.
            mat[i, j] = mat[i-1, j]
        else:
            mat[i, j] = (mat[i-1, j] + mat[i+1, j]) / 2.
    return mat


def read_temps():
    """
    Build a float tensor from every column of TEMP_DATA except the first,
    with the missing values filled in by fill_na.
    """
    raw_values = np.array(TEMP_DATA.iloc[:,1:])
    filled_values = fill_na(raw_values)
    return torch.tensor(filled_values,dtype=torch.float)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## The Forecast Dataset

We create a dataset class to iterate on temperature data

In [None]:
class ForecastTempDataset(Dataset):
    """Windows of a multivariate series paired with the same window shifted by one step.

    In training mode (``test=False``) every item is a window drawn at a random
    position and the dataset reports ``nb`` items. In test mode the series is
    cut into consecutive, non-overlapping windows of ``length`` steps; a
    trailing remainder shorter than ``length`` is dropped.

    Args:
        x: 2-d tensor of shape (time steps, series).
        length: number of time steps in a window.
        nb: number of items reported in training mode.
        test: use the deterministic consecutive windows instead of random ones.
    """

    # Bounds used to min-max scale the raw values.
    MAX,MIN = 330.,230.

    def __init__(self, x,length=20,nb=10000,test=False):
        self.data = (x-self.MIN)/(self.MAX-self.MIN)
        self.length, self.nb = length, nb
        self.size, self.classes = x.shape
        # Keep the mode explicitly: inferring it from nb_samples would turn a
        # test set too short for a single window into a random training set.
        self.test = bool(test)
        self.indexes = [0]
        self.nb_samples = 0
        if self.test:
            # Window boundaries: window i spans indexes[i] .. indexes[i+1].
            self.indexes = np.arange(0,self.size,self.length)
            self.nb_samples = len(self.indexes)-1


    def __len__(self):
        return self.nb_samples if self.test else self.nb

    def __getitem__(self,i):
        """Return ``(inputs, targets)``, each of shape (length - 1) x dim.

        ``targets`` is ``inputs`` shifted one step forward in time.
        """
        if self.test:
            start, stop = self.indexes[i], self.indexes[i+1]
        else:
            # Random window; the index ``i`` is ignored in training mode.
            start = np.random.randint(self.size-self.length)
            stop = start+self.length

        window = self.data[start:stop]
        return window[:-1], window[1:]

## The actual recurrent model (Vanilla version of a `nn.RNNCell`)

Recall that recurrent neural networks (RNNs) are neural nets that can deal with sequences of variable length (unlike feedforward nets). They are able to do this by defining a recurrence relation over timesteps, which typically follows this formula: 

### $$ S_{k} = f(S_{k-1} \cdot W_{rec} + X_k \cdot W_x) $$

Where $S_k$ is the state at time k, $X_k$ an exogenous input at time k, and $W_{rec}$ and $W_x$ are parameters, like the weights in feedforward nets. Note that the RNN can be viewed as a state model with a feedback loop. The state evolves over time due to the recurrence relation, and the feedback is fed back into the state with a delay of one timestep. This delayed feedback loop gives the model memory, because it can remember information between timesteps in the states.
The final output of the network $Y_k$ at a certain timestep k is typically computed from one or more states $S_{k−i}...S_{k+j}$.

**Note that we can either compute the current state $S_k$ from the current input $X_k$ and previous state $S_{k−1}$, or predict the next state $S_{k+1}$ from the current state $S_k$ and current input $X_k$. The difference in notation has little effect on our model and depends on the task at hand.**

In [None]:
class RNN(nn.Module):
    """Vanilla recurrent layer: h_t = tanh(encoder(x_t) + latent(h_{t-1}))."""

    def __init__(self, inputdim,latentdim):
        super().__init__()

        self.inputdim = inputdim
        self.latentdim = latentdim
        # Input-to-state and state-to-state linear maps.
        self.encoder = nn.Linear(inputdim,latentdim)
        self.latent = nn.Linear(latentdim,latentdim)

    def forward(self,x,h=None):
        """Run the whole sequence and return every state.

        x: length x batch x dim. h: optional initial state (zeros when omitted).
        Returns the stacked states, length x batch x latentdim.
        """
        state = self.hzero(x.shape[1]).to(x.device) if h is None else h

        states = []
        for step_input in x:  # iterate over the first (time) axis
            state = self.one_step(step_input,state)
            states.append(state)

        return torch.stack(states)

    def one_step(self,x,h):
        """Compute the next state from one input slice and the current state."""
        pre_activation = self.encode(x) + self.latent(h)
        return torch.tanh(pre_activation)

    def encode(self,x):
        """Project an input, flattened to rows of inputdim values, into the latent space."""
        flat = x.view(-1,self.inputdim)
        return self.encoder(flat)

    def hzero(self,batch_size):
        """Return an all-zero initial state of shape batch_size x latentdim."""
        return torch.zeros((batch_size,self.latentdim))


## The forecasting function

In [None]:
def forecast(rnn,decoder,x,h=None,length=10):
    """Forecast ``length`` future steps after the observed sequence ``x``.

    The observed sequence is first consumed by ``rnn``; the decoded prediction
    is then fed back as the next input, one step at a time.

    Args:
        rnn: recurrent model exposing ``hzero``, ``forward`` and ``one_step``.
        decoder: module mapping a state to a prediction in input space.
        x: observed sequence, length x batch x dim.
        h: optional initial state (``rnn.hzero`` is used when omitted).
        length: number of steps to forecast.

    Returns:
        The stacked predictions, length x batch x dim.
    """
    with torch.no_grad():
        if h is None:
            h = rnn.hzero(x.shape[1]).to(x.device)

        # State after the last observed step, and the first prediction.
        h = rnn.forward(x,h)[-1]
        x = decoder.forward(h)
        yhat = [x]

        for _ in range(length-1):
            # Advance the state with the prediction just made. The state must be
            # updated at every step, otherwise all later predictions would be
            # computed from the state of the last observed step.
            h = rnn.one_step(x,h)
            x = decoder.forward(h)
            yhat.append(x)

    return torch.stack(yhat)

###  Here we wrap the optimization function so you can experiment with parameters

- EPOCHS : Number of epochs
- BATCH_SIZE : Batch size 
- LATENT : Size of the latent space learnt by the RNN
- LENGTH : Size of the forecasting set while training
- LENGTH_FC : Size of the test forecast

In [None]:
def training(EPOCHS = 100, BATCH_SIZE = 32,LATENT = 10,LENGTH= 100,LENGTH_FC = 30):
    """Train the RNN and its linear decoder on next-step temperature prediction.

    After every epoch the test loss is logged twice: once for next-step
    prediction and once for an autoregressive forecast of the last
    ``LENGTH_FC`` steps of each test window.

    Args:
        EPOCHS: number of epochs.
        BATCH_SIZE: batch size.
        LATENT: size of the latent space learnt by the RNN.
        LENGTH: length of the windows used for training and testing.
        LENGTH_FC: number of steps forecast at test time.

    Returns:
        The trained RNN (the decoder is not returned).
    """
    data_temp = read_temps()
    n_series = data_temp.shape[1]

    # First 80% of the time steps for training, the rest for testing.
    id_split = int(data_temp.shape[0]*0.8)

    data_train = DataLoader(ForecastTempDataset(data_temp[:id_split,:],length=LENGTH),batch_size=BATCH_SIZE,shuffle=True)
    data_test = DataLoader(ForecastTempDataset(data_temp[id_split:,:],test=True,length=LENGTH),batch_size=BATCH_SIZE,shuffle=False)

    rnn = RNN(n_series,LATENT).to(device)
    decoder = nn.Linear(LATENT,n_series).to(device)

    loss = torch.nn.MSELoss()

    optim = torch.optim.Adam(chain(rnn.parameters(),decoder.parameters()),lr=0.0001)

    for epoch in range(EPOCHS):

        logging.info("Iteration %d", epoch)
        suml = 0.

        for x,y in data_train:
            # Batches come as (batch, length, dim); the RNN wants (length, batch, dim).
            # Inputs AND targets must live on the same device as the model.
            x = x.to(device).transpose(0,1)
            y = y.to(device).transpose(0,1)

            optim.zero_grad()

            h = rnn.forward(x)
            yhat = decoder.forward(h)  # the linear decoder acts on the last axis

            l = loss(yhat,y)
            l.backward()
            optim.step()

            # .item() keeps the running mean as a plain float, so the autograd
            # graph of every batch is not kept alive.
            suml += l.item()/len(data_train)

        logging.info("loss train : %f",suml)

        with torch.no_grad():

            l = 0.
            lf = 0.

            for x,y in data_test:
                x = x.to(device).transpose(0,1)
                y = y.to(device).transpose(0,1)

                yhat = decoder.forward(rnn.forward(x))
                l += loss(yhat,y).item()/len(data_test)

                ## ALL THE FORECAST happens here
                # The first forecast step predicts the target aligned with the
                # last context input, so the context must end LENGTH_FC - 1
                # steps before the end for the outputs to line up with y[-LENGTH_FC:].
                context = x[:x.size(0)-LENGTH_FC+1]
                yhat = forecast(rnn,decoder,context,length=LENGTH_FC)

                lf += loss(yhat,y[-LENGTH_FC:]).item()/len(data_test)

            logging.info("loss test : %f",l)
            logging.info("loss test forecast : %f",lf)

    return rnn


## (TODO) experiment with different variables

- What happens when you learn with fewer time steps than you predict?
- On the contrary, what happens if you learn with more?
- Does a larger latent size really improve performance?

In [None]:
# Baseline run with the default settings; change the arguments to experiment.
training(EPOCHS = 100, BATCH_SIZE = 32,LATENT = 10,LENGTH= 100,LENGTH_FC = 30)