In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import sys

#Import custom modules
sys.path.append('../')
from utils.utils import *

# Import data and mask

In [6]:
# Load the train/test arrays; testing_index is returned alongside them
# (presumably the sample indices of the test split — verify in utils.utils).
training_data, testing_data, testing_index = load_data()

# Dataset metadata. Only no_features is read by the visible cells below.
(
    countries,
    no_features,
    feature_names,
    years,
    months,
    weekdays,
    hours,
) = load_data_information()

In [7]:
# Splits the raw data array into flattened features, missing-value flags and country
def change_format(input_data, n_features=None):
    """Extract features, missing-value flags and country from the data array.

    Parameters
    ----------
    input_data : ndarray
        3-D array indexed as ``[sample, step, column]``.
    n_features : int, optional
        Number of feature columns. When omitted, the notebook-level
        ``no_features`` (set by the data-loading cell) is used, which is the
        original behaviour.

    Returns
    -------
    features_tf : ndarray of shape (n_samples, n_steps * n_features, 1)
        Feature values of each sample, flattened along the second axis.
    miss_vals_tf : ndarray of shape (n_samples, n_steps * n_features, 1)
        Missing-value flags laid out exactly like ``features_tf``.
    country_w : ndarray of shape (n_samples,)
        Country column, taken from the first step of every sample.
    """
    if n_features is None:
        # Fall back to the global so existing one-argument calls keep working.
        n_features = no_features

    # Column layout as used by the slices below.
    # NOTE(review): what the surrounding columns hold is not visible here.
    country_col = 3          # column holding the country
    first_feature_col = 5    # features occupy the next n_features columns
    trailing_cols = 6        # columns located after the missing-value block

    # Extract country from data matrix (first step only)
    country_w = input_data[:, 0, country_col]
    # Extract features from matrix
    features_w = input_data[:, :, first_feature_col:first_feature_col + n_features]
    # Extract matrix of missing values from data matrix
    miss_vals_w = input_data[:, :, -n_features - trailing_cols:-trailing_cols]

    # Prepare format for features: (n_samples, n_steps * n_features, 1)
    n_samples = features_w.shape[0]
    features_tf = np.reshape(features_w, [n_samples, -1, 1])
    miss_vals_tf = np.reshape(miss_vals_w, [n_samples, -1, 1])

    return features_tf, miss_vals_tf, country_w

In [8]:
#Apply a testing mask: hide the selected entries and flag them as missing
def get_testing_mask(features, miss_vals, mask):
    """Return masked copies of ``features`` and ``miss_vals`` plus the reshaped mask.

    Parameters
    ----------
    features : ndarray
        Feature values; its shape defines the shape the mask is reshaped to.
    miss_vals : ndarray
        Missing-value flags; must be indexable by a boolean array of
        ``features.shape``.
    mask : ndarray
        Testing mask with as many elements as ``features``; entries equal
        to 1 mark the positions to hide.

    Returns
    -------
    features_masked : ndarray
        Copy of ``features`` with the hidden positions set to 0.
    miss_vals_masked : ndarray
        Copy of ``miss_vals`` with the hidden positions set to 1.
    mask : ndarray
        The mask reshaped to ``features.shape``.
    """
    mask = mask.reshape(features.shape)
    hidden = mask == 1

    # Work on copies so the caller's arrays stay untouched.
    features_masked = np.copy(features)
    miss_vals_masked = np.copy(miss_vals)

    features_masked[hidden] = 0
    miss_vals_masked[hidden] = 1

    return features_masked, miss_vals_masked, mask

# Loop over testing masks

In [9]:
#Prepare training and testing data as 2-D (sample x flattened feature) matrices
features_train, _, train_countries = change_format(training_data)
# change_format returns a trailing axis; keep only the first two axes
features_train = features_train.reshape(features_train.shape[:2])

features_test, miss_vals_test, test_countries = change_format(testing_data)
features_test = features_test.reshape(features_test.shape[:2])

In [10]:
# Fit KNNImputer
# The imputer is created with scikit-learn's default settings and fitted on the
# 2-D training feature matrix only. The fit call is left as the cell's last
# expression, so the fitted estimator's repr is displayed as the cell output.
imputer = KNNImputer()
imputer.fit(features_train)

KNNImputer()

In [11]:
# Evaluate the fitted KNN imputer for every masking percentage 0.1, 0.2, ..., 0.9.
# Rounding removes float noise from np.arange so the file names stay clean.
perc = np.round(np.arange(0.1,1,0.1),1)
for p in perc:
    testing_mask = np.load("../evaluation/masks/testing_mask_{}.npy".format(p))

    #Mask values of testing_data
    # Only the reshaped mask is needed here, so it is built directly rather
    # than through get_testing_mask, which would also copy the feature and
    # missing-value arrays on every iteration just to discard them.
    mask = testing_mask.reshape(features_test.shape)
    hidden = mask == 1
    features_test_masked = features_test.copy()
    features_test_masked[hidden] = np.nan

    #Impute the hidden entries with the already fitted KNN imputer
    knn_pred = imputer.transform(features_test_masked)

    #Obtain true values and prediction at the hidden positions only
    true_values = features_test[hidden]
    prediction = knn_pred[hidden]

    #Calculate mse
    mse = mean_squared_error(true_values, prediction)
    print(mse)

    #Save prediction (np.save appends the ".npy" extension)
    np.save("../data/predictions/knn_pred_{}".format(p), prediction)

0.07756092
0.0792192
0.079119936
0.08223445
0.08716319
0.090610445
0.09350976
0.09967454
0.10487978
