<a href="https://colab.research.google.com/github/ammarisme/covid-19/blob/master/CV19_result_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Install torch-geometric and its compiled companion packages from the wheel
# index built for torch 1.5.0 (the `+cu101` suffix presumably selects the
# CUDA 10.1 builds -- TODO confirm against the runtime's CUDA version).
# NOTE(review): torch-geometric itself is not version-pinned here.
!pip install torch-geometric \
  torch-sparse==latest+cu101 \
  torch-scatter==latest+cu101 \
  torch-cluster==latest+cu101 \
  -f https://pytorch-geometric.com/whl/torch-1.5.0.html

In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
from functools import reduce
import os

import torch
from torch_geometric.data import Data, DataLoader, InMemoryDataset

import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

from os import listdir
from os.path import isfile, join

# Choose the compute device once: CUDA when torch can see a GPU, CPU otherwise,
# and report the choice.
_device_name, _device_label = ("cuda", "GPU") if torch.cuda.is_available() else ("cpu", "CPU")
device = torch.device(_device_name)
print('running on ' + _device_label)

In [0]:
# Colab-only: mount the user's Google Drive so files under it can be read.
from google.colab import drive
drive.mount('/content/drive')
# Base folder for this project; the other paths in this notebook are built from it.
PATH = '/content/drive/My Drive/covid'

In [0]:
def loss_file_search(mypath):
  """Plot the loss files found in ``mypath`` and recurse into its sub-directories.

  Every entry of ``mypath`` whose name contains ``"losses"`` is loaded with
  ``np.load(..., allow_pickle=True)``; the object stored under its ``'losses'``
  key is turned into an array and the first row of its transpose is plotted
  (presumably the training loss -- confirm against the code that writes
  these files).

  Returns:
    A list holding one ``(plot_result, mypath)`` tuple per loss file in this
    directory, followed by one *nested list* per sub-directory (the result
    of the recursive call is appended, not flattened).
  """
  entries = listdir(mypath)

  ax_paths = []
  for name in entries:
    if "losses" not in name:
      continue
    stored = np.load(join(mypath, name), allow_pickle=True)
    loss_table = np.array(stored.tolist()['losses'])
    ax_paths.append((plt.plot(loss_table.T[0]), mypath))

  # Each sub-directory contributes its own list as a single element.
  for name in entries:
    if os.path.isdir(join(mypath, name)):
      ax_paths.append(loss_file_search(mypath+'/'+name))

  return ax_paths
  

In [0]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import ylim

import matplotlib.patches as mpatches


# Folder under the Drive project directory; a checkpoint from this same folder
# is loaded further down in the notebook.
root = PATH+'/models__rnn/4'
#ylim(top=5.08, bottom=5.07)#, ylim_bottom=5.0
#ax_paths = loss_file_search(root)
# The string literal below holds disabled legend-building code; its contents are
# not executed. It indexes the nested result of loss_file_search
# (ax_path[0][0] / ax_path[0][1][-1]) and maps the last character of each path
# to a colour.
"""axes = [ax_path[0][0] for ax_path in ax_paths]
paths = [ax_path[0][1][-1] for ax_path in ax_paths]
colors = {
    0 :"red",
    3 : "green",
    4 : "blue",
    6: "orange"
}
plt.legend(handles=[
                    mpatches.Patch(color=colors[int(path)], label=str(path)) for path in paths])
plt.show()
"""

In [0]:
class CovidDataSet(InMemoryDataset):
    """In-memory dataset of (input window, target window) pairs built from CSV files.

    ``process`` reads every CSV in ``raw_dir``, slices the ``cases`` and
    ``deaths`` columns of each (synthesis_seq, country) group into windows and
    saves the collated result to ``processed_paths[0]``; ``__init__`` loads
    that file.

    The window lengths are read from the module-level ``configuration`` dict and
    the processed folder name from the module-level ``PROCESSED_DIR``; both must
    be defined before the class is instantiated.
    """
    def __init__(self, root, input_sequence, output_sequence, transform=None, pre_transform=None):
        """Initialise the base dataset and load the processed tensors.

        Args:
            root: Dataset root folder; ``raw_dir`` and ``processed_dir`` are derived from it.
            input_sequence: Accepted but not used inside this class
                (``configuration['input_sequence_len']`` is read instead).
            output_sequence: Accepted but not used inside this class
                (``configuration['output_sequence_len']`` is read instead).
            transform: Forwarded to the ``InMemoryDataset`` constructor.
            pre_transform: Forwarded to the ``InMemoryDataset`` constructor.
        """
        super(CovidDataSet, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_dir(self):
      """Return ``root + '/cleaned'``, the folder the raw CSV files are read from.

      Side effect: creates ``root + PROCESSED_DIR`` when it does not exist yet.
      """
      if os.path.exists(self.root+PROCESSED_DIR):
        return self.root+'/cleaned'
      else:
        os.mkdir(self.root+PROCESSED_DIR)
        return self.root+'/cleaned'
        
    @property
    def processed_dir(self):
      """Return ``root + PROCESSED_DIR``, creating the folder first if it is missing."""
      if os.path.exists(self.root+PROCESSED_DIR):
        return self.root+PROCESSED_DIR
      else:
        os.mkdir(self.root+PROCESSED_DIR)
        return self.root+PROCESSED_DIR

    @property
    def raw_file_names(self):
      """Return the names of all regular files currently present in ``raw_dir``."""
      mypath = self.raw_dir
      filenames = [f for f in listdir(mypath) if isfile(join(mypath, f))]
      return filenames

    @property
    def processed_file_names(self):
        """Return the single processed file name, ``'processed.dt'``."""
        return ['processed.dt']

    def download(self):
        """No-op: nothing is downloaded by this dataset."""
        pass
    
    def process(self):
        """Build windowed ``Data`` objects from the raw CSVs and save them.

        For every file in ``self.raw_paths`` (presumably supplied by the base
        class from ``raw_dir`` / ``raw_file_names``), every ``synthesis_seq``
        value and every ``countryterritoryCode`` value, the ``cases`` and
        ``deaths`` columns are cut into input windows of
        ``configuration['input_sequence_len']`` rows and target windows of
        ``configuration['output_sequence_len']`` rows. All resulting ``Data``
        objects are collated and written to ``self.processed_paths[0]``.
        """
        
        data_list = []

        for raw_path in self.raw_paths:
          df = pd.read_csv(raw_path)
          for synthetic_seq in df['synthesis_seq'].unique():
            synthetic_data = df[df['synthesis_seq']==synthetic_seq]

            for country in synthetic_data['countryterritoryCode'].unique():
              country_data = synthetic_data[synthetic_data['countryterritoryCode'] == country]
              # Read but not attached to the samples below (see the commented hint
              # above the Data construction).
              popData2018 = country_data['popData2018'].values[0]
              _country_code = country_data['_country_code'].values[0]
              
              # Inputs: every row except the last `output_sequence_len` ones.
              country_data_i = country_data[:-configuration['output_sequence_len']]
              # Targets: every row after the first `input_sequence_len` ones.
              country_data_o = country_data[configuration['input_sequence_len']:]
              
              
              # One row per feature: cases, deaths.
              country_data_array = np.array([country_data_i['cases'].to_numpy(),
                                             country_data_i['deaths'].to_numpy()
                                             ])
              """
              country_data_i['_country_code'].to_numpy(),
                                             country_data_i['countriesAndTerritories'].to_numpy(),
                                             country_data_i['geoId'].to_numpy(),
                                             country_data_i['countryterritoryCode'].to_numpy(),
                                             country_data_i['continentExp'].to_numpy()
              """
              feature_length = len(country_data_array)
              country_data_array = country_data_array.reshape(feature_length,len(country_data_i))

              country_data_array_y = np.array([country_data_o['cases'].to_numpy(), country_data_o['deaths'].to_numpy()])
              country_data_array_y = country_data_array_y.reshape(2,len(country_data_o))

              # Transpose so that rows are time steps and columns are features.
              x = country_data_array[:feature_length].T
              y = country_data_array_y[:2].T

              # `sets` is reassigned inside the loops; `x_list` is not used afterwards.
              sets =0
              x_list = []
              dict_x = dict()
              # For each start offset i, cut x[i:] into consecutive windows of
              # `input_sequence_len` rows, dropping the incomplete tail.
              for i in range(configuration['input_sequence_len']):
                # End index such that (array_len - i) is a multiple of the window length.
                array_len = ((len(x) -i) - ((len(x)-i)%configuration['input_sequence_len']))+i
                if array_len <= 0:
                  continue
                sets = int( array_len/ configuration['input_sequence_len'])
                if sets <= 0:
                  continue
                #print('input seq : ', i , ' ', array_len , ' ',array_len-i , ' number of sets : ', sets)
                # Final shape after both reshapes: (feature_length, sets, input_sequence_len).
                x_temp = x[i:array_len].T.reshape(sets,feature_length,configuration['input_sequence_len'])
                x_temp = x_temp.reshape(feature_length,sets,configuration['input_sequence_len'])
                # Keys i, i+L, i+2L, ... with L = input_sequence_len.
                # NOTE(review): keys are generated for k in range(input_sequence_len)
                # while `arrays_split` has `sets` entries; zip() stops at the shorter
                # of the two -- confirm this truncation is intended.
                uniq_keys = np.array([i+(configuration['input_sequence_len']*k) for k in range(configuration['input_sequence_len'])])
                
                # Split along axis 1 into `sets` pieces, one per window.
                arrays_split = np.hsplit(x_temp,sets)
                dict_x.update(dict(zip(uniq_keys, arrays_split)))
              
              dict_y = dict()
              # `y_list` is not used afterwards.
              y_list = []
              # Same windowing for the targets, using `output_sequence_len`.
              for i in range(configuration['output_sequence_len']):
                array_len_y = (len(y)-i) - ((len(y)  - i)%configuration['output_sequence_len'])+i
                if array_len_y <= 0:
                  continue
                sets = int(array_len_y / configuration['output_sequence_len'])
                if sets <= 0:
                  continue
                
                #print('output seq : ', i , ' ', array_len_y , ' ',array_len_y-(i) , ' number of sets : ', sets)
                y_temp = y[i:array_len_y].T.reshape(sets, 2, configuration['output_sequence_len'])
                # NOTE(review): same key/split length mismatch as for the inputs above.
                uniq_keys = np.array([i+(configuration['output_sequence_len']*k) for k in range(configuration['output_sequence_len'])])
                y_temp = y_temp.reshape(2,sets,configuration['output_sequence_len'])
                arrays_split = np.hsplit(y_temp,sets)
                dict_y.update(dict(zip(uniq_keys, arrays_split)))
              

              # Windows ordered by key, each transposed back before being wrapped.
              temp_x_list  = [dict_x[i].T for i in sorted(dict_x.keys())]
              temp_y_list  = [dict_y[i].T for i in sorted(dict_y.keys())]

              #_country_code,popData2018
              xy_list = [Data(x = torch.from_numpy(features).type(torch.FloatTensor).squeeze()) for features in temp_x_list]

              # NOTE(review): the lists are indexed with the dict *key* i, not with a
              # position; this only lines up if the keys of dict_y are 0..n-1 and
              # xy_list is at least that long -- verify.
              for i in sorted(dict_y.keys()):
                xy_list[i].y = torch.from_numpy(temp_y_list[i]).squeeze()

              data_list += xy_list
          print('processed : '+ raw_path)
        # Collate all samples into one big Data object plus slice indices and persist them.
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [0]:
# Window lengths, batch sizes and tensor widths used throughout the notebook.
configuration = {
    'input_sequence_len' : 10,
    'output_sequence_len' : 10,
    'training_batch_size' : 1024,
    'training_dataset_length' :32768,
    'validation_batch_size' : 1024,
    'yhat_size' : 2,
    'feature_len' : 2,
    'output_size' : 2,
}

_in_len = configuration['input_sequence_len']
_out_len = configuration['output_sequence_len']

# Input folder on Drive and the name of the processed-data sub-folder, which
# encodes the input/output window lengths.
INPUT_ROOT = PATH+'/input_test'
DATA_TAG = "seq2seq_" + str(_in_len) + '_' + str(_out_len)
PROCESSED_DIR = '/processed_' + DATA_TAG

# Shuffle the dataset, skip the first 3000 shuffled samples and batch the rest.
validation_dataset = CovidDataSet(INPUT_ROOT, _in_len, _out_len).shuffle()[3000:]
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=configuration['validation_batch_size'],
)

print('batches validation :', len(validation_dataloader))
print('dataset length validation :', len(validation_dataset))
print('sample data :', validation_dataset[100].x)

In [0]:
class RNNModel(nn.Module):
    """Feed-forward / GRU stack applied to one time step at a time.

    Layout: ``fc1`` -> ``repeats`` x (Linear -> GRU -> Linear) -> ``fc2``, with a
    ReLU after every layer (including the GRU hidden state and the final output).

    Args:
        input_size: Width of the input features.
        parameter_sizes: ``[repeater_input_size, hidden_size]`` -- the width
            between blocks and the GRU width inside each block.
        repeats: Number of Linear -> GRU -> Linear blocks.
        output_size: Width of the model output.
    """
    def __init__(self, input_size, parameter_sizes, repeats ,output_size):
        super(RNNModel, self).__init__()
        self.input_size = input_size
        self.repeater_input_size = parameter_sizes[0]
        self.hidden_size = parameter_sizes[1]
        self.repeats = repeats
        self.output_size = output_size

        self.fc1 = nn.Linear(input_size, self.repeater_input_size)
        self.relu_activation = nn.ReLU()

        # Block r owns the keys fc_{3r}, gru_{3r+1}, fc_{3r+2}. These names (and
        # their insertion order) are part of the state-dict layout, so they must
        # not change or previously saved checkpoints stop loading.
        self.layers = dict()
        for block in range(repeats):
            base = 3 * block
            self.layers['fc_'+str(base)] = nn.Linear(self.repeater_input_size, self.hidden_size)
            self.layers['gru_'+str(base+1)] = nn.GRU(self.hidden_size, self.hidden_size)
            self.layers['fc_'+str(base+2)] = nn.Linear(self.hidden_size, self.repeater_input_size)

        # Registering the dict as a ModuleDict is what exposes the block
        # parameters to .parameters(), .to() and state_dict().
        self.module_list = nn.ModuleDict(self.layers)

        self.fc2 = nn.Linear(self.repeater_input_size, output_size)

    def forward(self, input, hidden):
        """Run one step through the network.

        Args:
            input: Tensor whose last dimension is ``input_size``; it is fed to the
                GRUs unchanged in rank, so it is expected to be
                ``(seq_len, batch, input_size)``.
            hidden: List with one GRU hidden state per block (at least
                ``repeats`` entries). The list is updated in place.

        Returns:
            ``(output, hidden)`` where ``hidden`` is the same list object that
            was passed in, holding the new (ReLU-activated) states.
        """
        output = self.relu_activation(self.fc1(input))

        for block in range(self.repeats):
            base = 3 * block
            output = self.relu_activation(self.module_list['fc_'+str(base)](output))

            output, state = self.module_list['gru_'+str(base+1)](output, hidden[block])
            output = self.relu_activation(output)
            hidden[block] = self.relu_activation(state)

            output = self.relu_activation(self.module_list['fc_'+str(base+2)](output))

        output = self.relu_activation(self.fc2(output))

        return output, hidden

    def initHidden(self, batch_size):
        """Return a zero hidden state of shape ``(1, batch_size, hidden_size)``.

        The tensor is allocated on the device the model's own parameters live
        on, so it always matches the weights it will be combined with (instead
        of depending on a notebook-global ``device`` variable).
        """
        model_device = next(self.parameters()).device
        return torch.zeros(1, batch_size, self.hidden_size, device=model_device)

In [0]:
# Quick check that the saved-models folder is reachable on the mounted Drive.
os.path.exists(PATH+'/models__rnn')

In [0]:
def predict(input_tensor, rnn_model):
  """Feed one input window through ``rnn_model`` step by step.

  The window is viewed as ``(input_sequence_len, 1, feature_len)`` (both sizes
  come from the module-level ``configuration``) and every time step is passed
  to the model together with the hidden states carried over from the previous
  step.

  Args:
    input_tensor: Tensor holding ``input_sequence_len * feature_len`` values.
    rnn_model: Model called as ``rnn_model(step, hidden_list)`` and exposing
      ``initHidden(batch_size)``. Its ``repeats`` attribute gives the number
      of hidden states to create (5 when the attribute is absent).

  Returns:
    A list with one numpy array per time step, each being the model output for
    that step.
  """
  seq_len = configuration['input_sequence_len']
  feature_len = configuration['feature_len']

  # Run on the device the model's weights live on. Inputs, hidden states and
  # weights must agree, and the model in this notebook is not necessarily on the
  # globally selected device.
  first_param = next(rnn_model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  with torch.no_grad():
    steps = input_tensor.to(device=model_device, dtype=torch.float32)
    steps = steps.view(seq_len, 1, feature_len)

    # One hidden state per recurrent block of the model.
    hidden_count = getattr(rnn_model, 'repeats', 5)
    rnn_model_hidden = [rnn_model.initHidden(1).to(model_device) for _ in range(hidden_count)]

    outputs = []
    for ei in range(seq_len):
      step_input = steps[ei].view(1, 1, feature_len)
      rnn_model_output, rnn_model_hidden = rnn_model(step_input, rnn_model_hidden)
      # .cpu() makes the numpy conversion valid when the model runs on a GPU.
      outputs.append(rnn_model_output.detach().cpu().numpy())

  return outputs
# Rebuild the architecture the checkpoint was trained with and load its weights
# onto the CPU.
model = RNNModel(
    input_size=2,
    parameter_sizes=[256, 128],
    repeats=5,
    output_size=2,
)
model_statedict = torch.load(PATH+'/models__rnn/4/model_manual_save_0008_7.pt', map_location=torch.device('cpu'))
model.load_state_dict(model_statedict)
model.eval()

# Sanity check: a model that produces exactly the same output for two different
# samples has collapsed. The outputs are compared element-wise; comparing the
# sum of the differences with 0 would also pass when differences cancel out.
# NOTE(review): two samples with identical inputs will also be reported as
# 'broken' -- confirm that is acceptable for this data.
for i in range(1000):
  # .clone().detach() copies the stored tensor without the copy-construct
  # warning that torch.tensor(tensor) raises.
  out1 = np.array(predict(validation_dataset[i+20].x.clone().detach(), model)).squeeze()
  out2 = np.array(predict(validation_dataset[i].x.clone().detach(), model)).squeeze()
  if np.array_equal(out1, out2):
    print('Model is broken', np.sum(out1) , np.sum(out2))
  else:
    print('Model is fine', np.sum(out1) , np.sum(out2))

In [0]:
# Display the bias parameter of the model's first linear layer.
model.fc1.bias

In [0]:
# Print every direct child module of the model followed by its parameters.
for layer in model.children():
    print(layer)
    weights = [parameter for parameter in layer.parameters()]
    print(weights)

In [0]:
# Replace `model` with a freshly initialised network of the same architecture
# (no checkpoint is loaded here).
model = RNNModel(
    input_size=2,
    parameter_sizes=[256, 128],
    repeats=5,
    output_size=2,
)
#model_statedict = torch.load(root+'/model_manual_save_0628_5.pt', map_location=torch.device('cpu'))
#model.load_state_dict(model_statedict)
#model.train()

In [0]:
# Compare the outputs for two validation samples. Element-wise equality is
# used because a zero *sum* of differences can also arise from differences
# that cancel each other out.
# .clone().detach() copies the stored tensor without the copy-construct warning
# that torch.tensor(tensor) raises.
out1 = np.array(predict(validation_dataset[100].x.clone().detach(), model)).squeeze()
out2 = np.array(predict(validation_dataset[101].x.clone().detach(), model)).squeeze()
if np.array_equal(out1, out2):
  print('Model is broken')

In [0]:
def get_all_directories(mypath):
  """Recursively collect every sub-directory below ``mypath``.

  Args:
    mypath: Directory to start from.

  Returns:
    A list with the path of each directory found under ``mypath`` (built with
    ``os.path.join``), each parent listed before its own children. Empty when
    ``mypath`` contains no directories.
  """
  found = []
  for entry in listdir(mypath):
    full_path = join(mypath, entry)
    if os.path.isdir(full_path):
      found.append(full_path)
      # Recurse with the joined path; the bare entry name would be resolved
      # against the wrong location.
      found.extend(get_all_directories(full_path))
      #loss_file_search(mypath+'/'+directory)
  return found

# Traverse the project folder on the mounted Drive.
get_all_directories(PATH)

In [0]:
def loss_analysis(losses):
 """Placeholder for the loss analysis step; currently it only prints a banner.

 ``losses`` is accepted but not used.
 """
 banner = 'loss analysis'
 print(banner)