In [2]:
import sys
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
import statsmodels.api as sm
from math import sqrt as math_sqrt
from sklearn.metrics import mean_squared_error
import pickle
import math
import random

#Import custom modules
sys.path.append('../')
from utils.utils import *

In [3]:
# Load the training data, the testing data and the testing index.
# load_data is not defined in this notebook; presumably it is provided by the
# star-import from utils.utils - TODO confirm.
training_data, testing_data, testing_index = load_data()

In [4]:
# Unpack the dataset metadata returned by load_data_information (presumably from
# utils.utils - TODO confirm). Of these, nu_features is the one used below: it is
# read as a notebook-level global by change_format and passed to create_testing_mask.
countries, nu_features, feature_names, years, months, weekdays, hours = load_data_information()

In [5]:
# Extracts features, year, etc. from whole data
def change_format(input_data, n_features=None):
    """Split a stacked data array into flattened, per-(time step, feature) inputs.

    ``input_data`` is indexed as ``[sample, time step, column]``. The columns
    are read as: 0 year, 1 weekday, 2 hour, 3 country, 4 month (not returned),
    followed by ``n_features`` feature values. The ``n_features`` columns that
    end six columns before the last hold the missing-value indicators and the
    final six columns hold the positional encoding.

    Parameters
    ----------
    input_data : array-like of rank 3
        Stacked data as described above.
    n_features : int, optional
        Number of feature columns. When omitted, the notebook-level
        ``nu_features`` is used, which is the original behaviour.

    Returns
    -------
    tuple of 8 arrays
        ``(features, miss_vals, pos_enc, country, year, weekday, hour,
        feature_nr)`` where, with ``N`` samples and ``T`` time steps:

        * ``features`` and ``miss_vals`` have shape ``(N, T * n_features, 1)``,
        * ``pos_enc`` has shape ``(N, T * n_features, 6)``,
        * ``country`` has shape ``(N, 1)`` (taken from the first time step),
        * ``year``, ``weekday``, ``hour`` and ``feature_nr`` have shape
          ``(N, T * n_features)``.

        The flattened axis is ordered time step first, feature second, i.e.
        position ``t * n_features + f`` belongs to time step ``t``, feature ``f``.
    """
    if n_features is None:
        # Fall back to the notebook-level feature count.
        n_features = nu_features

    n_samples, n_steps = input_data.shape[0], input_data.shape[1]

    # Slice the individual column groups out of the stacked array.
    year_w = input_data[:, :, 0:1]
    weekday_w = input_data[:, :, 1:2]
    hour_w = input_data[:, :, 2:3]
    country_w = input_data[:, 0:1, 3]
    features_w = input_data[:, :, 5:5 + n_features]
    miss_vals_w = input_data[:, :, -n_features - 6:-6]
    pos_enc_w = input_data[:, :, -6:]
    # Column 4 (month) is deliberately not extracted: callers unpack exactly
    # eight return values and month was never one of them.

    # One row per (time step, feature) cell.
    features_tf = np.reshape(features_w, [n_samples, -1, 1])
    miss_vals_tf = np.reshape(miss_vals_w, [n_samples, -1, 1])

    # Every feature of a time step shares that step's positional encoding, so
    # each time-step row is repeated n_features times along the flattened axis.
    pos_enc_tf = np.repeat(pos_enc_w, n_features, axis=1)

    # Feature index 0..n_features-1, cycling once per time step.
    feature_nr_tf = np.tile(np.arange(n_features), (n_samples, n_steps))

    def _per_cell(column):
        # Broadcast a per-time-step column to every feature of that time step.
        return np.reshape(np.repeat(column, n_features, axis=2), [n_samples, -1])

    hour_tf = _per_cell(hour_w)
    year_tf = _per_cell(year_w)
    weekday_tf = _per_cell(weekday_w)

    return features_tf, miss_vals_tf, pos_enc_tf, country_w, year_tf, weekday_tf, hour_tf, feature_nr_tf

In [6]:
def create_testing_mask(miss_vals, nu_features, p, day=2, hours_per_day=24, seed=None):
    """Build a random evaluation mask restricted to one day of every sample.

    For each sample the number of masked cells is drawn from
    ``Binomial(hours_per_day * nu_features, p)`` and that many distinct
    (time step, feature) cells inside day ``day`` are set to 1. Cells that
    are flagged in ``miss_vals`` are reset to 0 afterwards, so the mask never
    covers values that were missing in the first place.

    Parameters
    ----------
    miss_vals : np.ndarray
        Missing-value indicators (1 = missing). The first axis is the sample
        axis; the remaining axes are treated row-major, so the intended
        layout is ``[sample, time step, feature]``. The mask takes its size
        from this array rather than from a notebook global.
    nu_features : int
        Number of features per time step.
    p : float
        Probability that a cell of the selected day is masked.
    day : int, optional
        Zero-based index of the day to mask. The default (2, the third day)
        is the window this function always used.
    hours_per_day : int, optional
        Number of time steps that make up one day.
    seed : int, optional
        If given, a private ``np.random.RandomState(seed)`` is used so the
        mask is reproducible. If omitted, numpy's global random state is
        used, which is the original behaviour.

    Returns
    -------
    np.ndarray
        Float array with the shape of ``miss_vals``; 1 marks a masked cell.

    Raises
    ------
    ValueError
        If the requested day does not fit into the time axis of ``miss_vals``.
    """
    n_samples = miss_vals.shape[0]
    cells_per_sample = int(np.prod(miss_vals.shape[1:]))
    cells_per_day = hours_per_day * nu_features

    if day < 0 or (day + 1) * cells_per_day > cells_per_sample:
        raise ValueError(
            "day {} with {} steps per day does not fit into samples of {} cells".format(
                day, hours_per_day, cells_per_sample))

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Flat positions of the selected day; cell (t, f) lives at t * nu_features + f.
    day_cells = np.arange(day * cells_per_day, (day + 1) * cells_per_day)

    testing_mask = np.zeros(shape=(n_samples, cells_per_sample))
    for sample in range(n_samples):
        # Draw how many cells to mask, then which ones (without repetition).
        number_miss_vals = rng.binomial(n=cells_per_day, p=p)
        index = rng.choice(day_cells, size=number_miss_vals, replace=False)
        testing_mask[sample, index] = 1

    # Back to the layout of miss_vals.
    testing_mask = testing_mask.reshape(miss_vals.shape)

    # Never mask a value that is already missing.
    testing_mask[miss_vals == 1] = 0
    return testing_mask

In [7]:
#Create one random testing mask per masking probability and save it
perc = np.arange(0.1,1,0.1)

# Missing-value indicators of the test set (second value returned by change_format),
# folded back from the flattened axis to [sample, time step, feature]
_,miss_vals,_,_,_,_,_,_=change_format(testing_data)
miss_vals=np.reshape(miss_vals, [miss_vals.shape[0], -1, nu_features])

# np.round strips the floating-point noise of np.arange so the file names read 0.1, 0.2, ...
for p in np.round(perc,1):
    testing_mask = create_testing_mask(miss_vals,nu_features,p)
    np.save("../evaluation/testing_mask_{}".format(p),testing_mask)

    #Print the achieved percentage of masked cells within one 24-step day.
    #Kept in its own name so the probability grid `perc` is not overwritten mid-loop.
    total = testing_mask.shape[0]*testing_mask.shape[2]*24
    masked_pct = ((testing_mask.sum()/total)*100)
    print(masked_pct)


9.725878497630237
19.44735707061693
29.123039244848993
38.794259523667876
48.56488091696908
58.24700805124239
68.03541505542665
77.67081622940091
87.39464969163345


In [7]:
def create_testing_mask(miss_vals, nu_features, p, day=3, hours_per_day=24, seed=None):
    """Build a random evaluation mask restricted to one day of every sample.

    This redefinition replaces the ``create_testing_mask`` from the earlier
    cell once it is executed; by default it masks the fourth day (``day=3``)
    instead of the third.

    For each sample the number of masked cells is drawn from
    ``Binomial(hours_per_day * nu_features, p)`` and that many distinct
    (time step, feature) cells inside day ``day`` are set to 1. Cells that
    are flagged in ``miss_vals`` are reset to 0 afterwards.

    Parameters
    ----------
    miss_vals : np.ndarray
        Missing-value indicators (1 = missing). The first axis is the sample
        axis; the remaining axes are treated row-major, so the intended
        layout is ``[sample, time step, feature]``. The mask takes its size
        from this array rather than from a notebook global.
    nu_features : int
        Number of features per time step.
    p : float
        Probability that a cell of the selected day is masked.
    day : int, optional
        Zero-based index of the day to mask (default 3, the fourth day).
    hours_per_day : int, optional
        Number of time steps that make up one day.
    seed : int, optional
        If given, a private ``np.random.RandomState(seed)`` is used so the
        mask is reproducible. If omitted, numpy's global random state is
        used, which is the original behaviour.

    Returns
    -------
    np.ndarray
        Float array with the shape of ``miss_vals``; 1 marks a masked cell.

    Raises
    ------
    ValueError
        If the requested day does not fit into the time axis of ``miss_vals``.
    """
    n_samples = miss_vals.shape[0]
    cells_per_sample = int(np.prod(miss_vals.shape[1:]))
    cells_per_day = hours_per_day * nu_features

    if day < 0 or (day + 1) * cells_per_day > cells_per_sample:
        raise ValueError(
            "day {} with {} steps per day does not fit into samples of {} cells".format(
                day, hours_per_day, cells_per_sample))

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Flat positions of the selected day; cell (t, f) lives at t * nu_features + f.
    first_cell = day * cells_per_day
    candidate_cells = np.arange(first_cell, first_cell + cells_per_day)

    testing_mask = np.zeros(shape=(n_samples, cells_per_sample))
    for sample in range(n_samples):
        # Draw how many cells to mask, then which ones (without repetition).
        number_miss_vals = rng.binomial(n=cells_per_day, p=p)
        index = rng.choice(candidate_cells, size=number_miss_vals, replace=False)
        testing_mask[sample, index] = 1

    # Back to the layout of miss_vals.
    testing_mask = testing_mask.reshape(miss_vals.shape)

    # Never mask a value that is already missing.
    testing_mask[miss_vals == 1] = 0
    return testing_mask

# Old method: one contiguous gap per selected feature

In [8]:
def create_testing_mask(miss_vals, nu_features, p, day=2, hours_per_day=24, seed=None):
    """Build an evaluation mask with one contiguous gap per selected feature.

    For every sample, ``int(np.round(nu_features * p))`` features are chosen
    at random. Each chosen feature receives a single gap inside day ``day``
    whose length is drawn from ``Binomial(hours_per_day, 0.75)`` and whose
    start is uniform over all positions that keep the gap inside the day.
    Cells flagged in ``miss_vals`` are reset to 0 afterwards.

    Parameters
    ----------
    miss_vals : np.ndarray
        Missing-value indicators (1 = missing) with layout
        ``[sample, time step, feature]``. The mask takes its size from this
        array rather than from a notebook global.
    nu_features : int
        Number of features per time step.
    p : float
        Fraction of features per sample that receive a gap.
    day : int, optional
        Zero-based index of the day to mask (default 2, the third day).
    hours_per_day : int, optional
        Number of time steps that make up one day.
    seed : int, optional
        If given, a private ``np.random.RandomState(seed)`` is used so the
        mask is reproducible. If omitted, numpy's global random state is used.

    Returns
    -------
    np.ndarray
        Float array of shape ``(samples, time steps, nu_features)``; 1 marks
        a masked cell.

    Raises
    ------
    ValueError
        If the requested day does not fit into the time axis of ``miss_vals``.
    """
    n_samples, n_steps = miss_vals.shape[0], miss_vals.shape[1]

    if day < 0 or (day + 1) * hours_per_day > n_steps:
        # A window past the end would otherwise be silently truncated by slicing.
        raise ValueError(
            "day {} with {} steps per day does not fit into {} time steps".format(
                day, hours_per_day, n_steps))

    rng = np.random if seed is None else np.random.RandomState(seed)

    testing_mask = np.zeros(shape=(n_samples, n_steps, nu_features))

    # Number of features that get a gap in every sample.
    nu_features_missing = int(np.round(nu_features * p))
    day_start = day * hours_per_day

    for sample in range(n_samples):
        features_masked = rng.choice(nu_features, nu_features_missing, replace=False)
        for feature in features_masked:
            # Variable gap length.
            length = rng.binomial(n=hours_per_day, p=0.75)
            # randint excludes `high`, so +1 is required for the gap to be able
            # to end on the last step of the day; a full-day gap yields pos == 0.
            pos = rng.randint(low=0, high=hours_per_day - length + 1)
            gap_start = day_start + pos
            testing_mask[sample, gap_start:gap_start + length, feature] = 1

    # Never mask a value that is already missing.
    testing_mask[miss_vals == 1] = 0
    return testing_mask

In [9]:
# Recover the missing-value indicators of the test set (second value returned by
# change_format) and fold the flattened axis back into [sample, time step, feature].
_,miss_vals,_,_,_,_,_,_=change_format(testing_data)
miss_vals=np.reshape(miss_vals, [miss_vals.shape[0], -1, nu_features])
# Assuming the cells run top to bottom, this calls the contiguous-gap ("old method")
# create_testing_mask defined directly above, where p is the fraction of features
# that receive a gap.
p=0.4
testing_mask = create_testing_mask(miss_vals,nu_features,p)
# np.save appends ".npy" to the file name.
# NOTE(review): the per-probability masks earlier in this notebook are written to
# ../evaluation/ (no masks/ subfolder) - confirm which location is intended.
np.save("../evaluation/masks/testing_mask_test",testing_mask)