In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import random
import math
import time

In [2]:
!nvidia-smi

Mon Jun 22 06:43:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [4]:
%cd /gdrive/My\ Drive/E4/Projet\ Seq2seq

/gdrive/.shortcut-targets-by-id/1L0GWY2fCntIpHodTYUvTCzdpBSUKQOnq/Projet Seq2seq


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class CSVDataset(Dataset):

    def __init__(self, file_path, n_x, n_y, n_rows, skip_rows):
        super(CSVDataset,self).__init__()
        self.n_x = n_x
        self.n_y = n_y
        temp_data = pd.read_csv(file_path, nrows=n_rows, skiprows=skip_rows)
        temp_titles = temp_data[temp_data.columns[:1]].values
        temp_data = temp_data[temp_data.columns[1:]].values
        
        
        truth_table = np.isnan(temp_data)
        number_nan_row = 0
        nan_row_id = []
        for index, row in enumerate(truth_table):
          if (row[0] == True):
            number_nan_row += 1
            nan_row_id.append(index)

        self.data = []
        self.titles = []
        for index, row in enumerate(temp_data):
          if index in nan_row_id:
            continue
          else:
            self.data.append(row)
            self.titles.append(temp_titles[index])
        
        nan_index = np.argwhere(np.isnan(self.data))
        for x, y in nan_index:
          self.data[x][y] = np.nanmean(self.data, dtype=np.float32)

        self.data = np.array(self.data)

        print(self.data.shape)

    def __len__(self):
        return (self.data.shape[1]-self.n_x-self.n_y)*self.data.shape[0]

    def __getitem__(self, index):
        line = index //  (self.data.shape[1]-self.n_x-self.n_y)
        i = self.n_x + self.n_y + (index % (self.data.shape[1]-self.n_x-self.n_y))

        """ 
        data = [] 
        for row, title in enumerate(self.titles):
          title_split = title[0].split('_')
          for col, value in enumerate(self.data[row]):
            data[row][col] = [value, title_split[-3].split('.')[0], title_split[-2], title_split[-1]]
        """

        item = (torch.FloatTensor(self.data[line,i-self.n_y-self.n_x:i-self.n_y]),
                torch.FloatTensor(self.data[line,i-self.n_y:i]))
        return item

In [7]:
n_x = 50 
n_y = 2
train_dataset = CSVDataset("web-traffic-time-series-forecasting/train_1.csv", n_x, n_y, 600, 0)
valid_dataset = CSVDataset("web-traffic-time-series-forecasting/train_1.csv", n_x, n_y, 200, 600)
test_dataset = CSVDataset("web-traffic-time-series-forecasting/train_1.csv", n_x, n_y, 200, 800)

(535, 550)
(200, 550)
(200, 550)


In [8]:
train_loader = DataLoader(train_dataset, batch_size=256, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=256, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=256, drop_last=True)

In [9]:
len(train_loader)

1040

In [10]:
for data, label in train_loader:
    print("--------- DATA ---------")
    print(data)
    print(" --------- LABEL ---------")
    print(label)
    break

--------- DATA ---------
tensor([[18., 11.,  5.,  ..., 15., 25.,  9.],
        [11.,  5., 13.,  ..., 25.,  9.,  5.],
        [ 5., 13., 14.,  ...,  9.,  5.,  6.],
        ...,
        [18.,  8.,  9.,  ..., 40., 19., 15.],
        [ 8.,  9., 17.,  ..., 19., 15., 15.],
        [ 9., 17.,  9.,  ..., 15., 15., 29.]])
 --------- LABEL ---------
tensor([[  5.,   6.],
        [  6.,  20.],
        [ 20.,   3.],
        [  3.,  14.],
        [ 14.,  46.],
        [ 46.,   5.],
        [  5.,   5.],
        [  5.,  13.],
        [ 13.,   4.],
        [  4.,   9.],
        [  9.,  10.],
        [ 10.,   9.],
        [  9.,  11.],
        [ 11.,  11.],
        [ 11.,  11.],
        [ 11.,   9.],
        [  9.,  15.],
        [ 15.,   5.],
        [  5.,  10.],
        [ 10.,   7.],
        [  7.,   4.],
        [  4.,   8.],
        [  8.,   9.],
        [  9.,  10.],
        [ 10.,   6.],
        [  6.,  13.],
        [ 13.,  16.],
        [ 16.,   6.],
        [  6.,  24.],
        [ 24.,   9.]

In [11]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, dropout=0):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.lstm = nn.LSTM(input_dim, hid_dim, n_layers, dropout = dropout)
        self.softplus = nn.Softplus()

        self.hidden = torch.zeros(self.n_layers, 1, self.hid_dim, device=device)
        self.cell = torch.zeros(self.n_layers, 1, self.hid_dim, device=device)

    def forward(self, src):
        src = src.unsqueeze(1).to(device)
        output, (hidden, cell) = self.lstm(src.view(len(src) ,1, -1), (self.hidden, self.cell))
        hidden = self.softplus(hidden)
        return hidden, cell

In [12]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, dropout=0):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(output_dim, hid_dim, n_layers, dropout = dropout)
        self.softplus = nn.Softplus()
        self.linear = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, hidden, cell):
        src = src.unsqueeze(1).to(device)
        output, (hidden, cell) = self.lstm(src, (hidden, cell))
        prediction = self.softplus(output.view(len(src), -1))
        prediction = self.linear(prediction)
        return prediction, hidden, cell

In [13]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg):
       
        hidden, cell = self.encoder(src)
        output, hidden, cell = self.decoder(trg, hidden, cell)
        return output

In [14]:
INPUT_DIM = n_x
OUTPUT_DIM = n_y
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
print(model)

Seq2Seq(
  (encoder): Encoder(
    (lstm): LSTM(50, 512, num_layers=2, dropout=0.5)
    (softplus): Softplus(beta=1, threshold=20)
  )
  (decoder): Decoder(
    (lstm): LSTM(2, 512, num_layers=2, dropout=0.5)
    (softplus): Softplus(beta=1, threshold=20)
    (linear): Linear(in_features=512, out_features=2, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [15]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        if(i%100 == 0):
          print("In train, batch number: {}".format(i))
        src = torch.tensor(batch[0]).to(torch.float32).to(device)
        trg = torch.tensor(batch[1]).to(torch.float32).to(device)
        
        optimizer.zero_grad()
        output = model(src, trg)
        
        loss = criterion(output, trg)
        with torch.autograd.set_detect_anomaly(True):
          loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [16]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            if(i%100 == 0):
              print("In evaluate, batch number: {}".format(i))

            src = torch.tensor(batch[0]).to(torch.float32).to(device)
            trg = torch.tensor(batch[1]).to(torch.float32).to(device)

            output = model(src, trg)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Seq2seq/model/seq2seq-model-1.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    #print("Train loss: {}, T".format(i))
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

In train, batch number: 0


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In train, batch number: 100
In train, batch number: 200
In train, batch number: 300
In train, batch number: 400
In train, batch number: 500
In train, batch number: 600
In train, batch number: 700
In train, batch number: 800
In train, batch number: 900
In train, batch number: 1000
In evaluate, batch number: 0


  
  from ipykernel import kernelapp as app


In evaluate, batch number: 100
In evaluate, batch number: 200
In evaluate, batch number: 300
Epoch: 01 | Time: 3m 30s
	Train Loss: 150891.277
	 Val. Loss: 998649.467
In train, batch number: 0
In train, batch number: 100


KeyboardInterrupt: ignored

In [None]:
data = pd.read_csv("web-traffic-time-series-forecasting/train_1.csv", nrows=100)

In [None]:
data.head

In [None]:
temp_data = data[data.columns[:1]].values

In [None]:
temp_data[0][0].split('_')

In [None]:
for elem in temp_data:
  lst = elem[0].split('_')
  print(lst[-3].split('.')[0])