In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import sys

#Import custom modules
sys.path.append('../')
from utils.utils import *

# Import data and mask

In [2]:
training_data, testing_data, testing_index = load_data()
countries, no_features, feature_names, years, months, weekdays, hours = load_data_information()

In [3]:
# Extracts features, year, etc. from whole data
def change_format(input_data):
    #Extract country from data matrix
    country_w = input_data[:,0,3]   
    #Extract features from matrix
    features_w = input_data[:,:,5:5+no_features]
    #Extract matrix of missing values from data matrix
    miss_vals_w = input_data[:,:,-no_features-6:-6]

    #Prepare format for features
    features_tf = np.reshape(features_w, [features_w.shape[0], -1, 1])
    miss_vals_tf = np.reshape(miss_vals_w, [features_w.shape[0], -1, 1])  
    

    return features_tf, miss_vals_tf, country_w

In [4]:
def mask_features(features, miss_vals, prob_mask):
    #Create and eshape mask
    realized_mask = np.zeros(shape = (prob_mask.shape[0], 5 * 24* no_features))
    realized_mask[:,2*24*no_features:3*24*no_features] = np.squeeze(np.random.binomial(1, prob_mask))
    realized_mask = realized_mask.reshape(features.shape)

    #Mask features
    features_masked = np.array(features)
    features_masked[realized_mask == 1] = 0
    #Mask missing values
    miss_vals_masked = np.array(miss_vals)
    miss_vals_masked[realized_mask == 1] = 1

    return features_masked, miss_vals_masked, realized_mask, miss_vals

# Predict test data

In [5]:
#Prepare training and testing data
features_train, _, train_countries = change_format(training_data)
features_train = features_train.reshape(features_train.shape[0],features_train.shape[1])
features_test, miss_vals_test, test_countries = change_format(testing_data)
features_test = features_test.reshape(features_test.shape[0],features_test.shape[1])

In [6]:
# Fit KNNImputer
imputer = KNNImputer()
imputer.fit(features_train)

KNNImputer()

## Full distribution

In [7]:
testing_mask = np.load("../data/evaluation/testing_distribution_full.npy")
_, _, mask, _ = mask_features(features_test, miss_vals_test, testing_mask)
features_test_masked = features_test.copy()
features_test_masked[mask == 1] = np.nan

#Train KNN Imputer
knn_pred = imputer.transform(features_test_masked)

In [8]:
#Obtain prediction
prediction = knn_pred[mask == 1]
np.save("../data/predictions/knn_pred_full.npy",prediction)

In [9]:
missing = np.squeeze(miss_vals_test)[mask == 1]
mse = mean_squared_error(prediction[missing == 0], features_test[mask == 1][missing == 0])
print(mse)

0.056708086


## Filtered distribution

In [7]:
testing_mask = np.load("../data/evaluation/testing_distribution_filtered.npy")
_, _, mask, _ = mask_features(features_test, miss_vals_test, testing_mask)
features_test_masked = features_test.copy()
features_test_masked[mask == 1] = np.nan

#Train KNN Imputer
knn_pred = imputer.transform(features_test_masked)

In [8]:
#Obtain prediction
prediction = knn_pred[mask == 1]
np.save("../data/predictions/knn_pred_filtered.npy",prediction)

In [9]:
missing = np.squeeze(miss_vals_test)[mask == 1]
mse = mean_squared_error(prediction[missing == 0], features_test[mask == 1][missing == 0])
print(mse)

0.066760235
