In [1]:
import time
import os

import pandas as pd
import numpy as np
np.set_printoptions(precision=6, suppress=True)

from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Global matplotlib defaults for every figure in this notebook.
# The figure size is written as centimetres divided by 2.54 (8 x 6 cm).
plt.rcParams.update({
    'figure.figsize': (8 / 2.54, 6 / 2.54),
    'font.family': 'Arial',
    'mathtext.default': 'rm',
    'font.size': 11,
    # Axes: no top spine, 1 pt frame.
    'axes.spines.top': False,
    'axes.linewidth': 1,
    # Lines and ticks.
    'lines.linewidth': 1.5,
    'xtick.major.width': 1,
    'xtick.minor.width': 1,
    'ytick.major.width': 1,
    'ytick.minor.width': 1,
})

# Shared marker size for plots.
MARKER_SIZE = 15

# Two parallel eight-colour palettes (hex strings).
cmap_m = [
    "#f4a6ad", "#f6957e", "#fccfa2", "#8de7be",
    "#86d6f2", "#24a9e4", "#b586e0", "#d7f293",
]
cmap = [
    "#e94d5b", "#ef4d28", "#f9a54f", "#25b575",
    "#1bb1e7", "#1477a2", "#a662e5", "#c2f442",
]

## hyperparameters

In [4]:
# Loss rates iterated by the processing loop; each one is the fraction of
# 48-row blocks that gets blanked out of every dataset.
LOSS_RATES = [
    0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 0.95,
]

# Root folder that is prepended to every dataset path before it is read.
DIRECTORY = '../data/'

## data loading

In [5]:
# Read the arrays stored under the 'MAXS' and 'MINS' keys of the 0.30-loss dataset archive.
# NOTE(review): this path is rooted at './data/' whereas DIRECTORY is '../data/' — confirm which is intended.
# NOTE(review): presumably MAXS/MINS are per-feature scaling bounds; no cell visible here uses them — verify.
# NOTE(review): the archive object `l` is never closed; consider `with np.load(...)` once it is
# confirmed that no later cell reads from `l`.
l = np.load('./data/tot_dataset_loss_0.30.npz')
MAXS = l['MAXS']
MINS = l['MINS']

In [6]:
# Both sub-directories are derived from DIRECTORY so that the listing done here
# and the `DIRECTORY + FILENAME` reads in the processing loop always point at
# the same tree (previously the roots were hard-coded a second time).
TOM_DIRECTORY = DIRECTORY + 'tom/'
PAP_DIRECTORY = DIRECTORY + 'pap/'


def _list_validation_csvs(directory, prefix):
    """Return the sorted validation CSVs found in `directory`.

    Parameters
    ----------
    directory : str
        Folder to scan (non-recursive).
    prefix : str
        Text prepended to every file name, e.g. 'tom/', so that the result is
        relative to DIRECTORY.

    Returns
    -------
    list of str
        `prefix + name` for every entry whose name starts with 'Val' and ends
        with '.csv', in ascending order.
    """
    names = [name for name in os.listdir(directory)
             if name.endswith('.csv') and name.startswith('Val')]
    return sorted(prefix + name for name in names)


# Same ordering as before: all 'tom/' files (sorted) followed by all 'pap/' files (sorted).
dataset_list = (_list_validation_csvs(TOM_DIRECTORY, 'tom/')
                + _list_validation_csvs(PAP_DIRECTORY, 'pap/'))

## Missing-data simulation and linear-interpolation recovery

In [7]:
POINT_NULL_PROB = 0.3  # probability that any single value is removed at random
BLOCK_LEN = 48         # rows per removed block (presumably one day of 30-minute samples — TODO confirm)


def _make_keep_mask(values, loss_rate):
    """Build the 0/1 keep-mask used to simulate data loss.

    Two loss patterns are combined:
    1. scattered losses — every value is independently removed with
       probability POINT_NULL_PROB (seed 3101);
    2. block losses — whole BLOCK_LEN-row windows are removed, the number of
       windows being `int(n_rows / BLOCK_LEN * loss_rate)` (seed 4564).

    The global NumPy seed is reset before each draw, exactly as before, so the
    mask depends only on the array shape and `loss_rate`.

    Parameters
    ----------
    values : numpy.ndarray
        2-D array (rows x columns) whose shape the mask must match.
    loss_rate : float
        Fraction of BLOCK_LEN-row windows to remove.

    Returns
    -------
    numpy.ndarray
        Integer array of `values.shape`; 1 = value kept, 0 = value removed.
    """
    n_rows = values.shape[0]

    np.random.seed(3101)
    keep = np.random.choice(
        2, values.size, p=[POINT_NULL_PROB, 1 - POINT_NULL_PROB]
    ).reshape(values.shape)

    np.random.seed(4564)
    # The last full window is never a candidate (hence the -1).
    n_candidates = int(n_rows / BLOCK_LEN) - 1
    n_blocks = int(n_rows / BLOCK_LEN * loss_rate)
    # Short files at high loss rates can request more windows than exist, which
    # made np.random.choice raise; cap the request at the available windows.
    n_blocks = min(n_blocks, max(n_candidates, 0))

    if n_blocks > 0:
        starts = np.random.choice(np.arange(n_candidates),
                                  replace=False, size=n_blocks) * BLOCK_LEN
        # Expand every window start into its BLOCK_LEN consecutive row numbers.
        rows = (starts[:, None] + np.arange(BLOCK_LEN)).ravel()
        rows = np.unique(rows[rows < n_rows])
        keep[rows, :] = 0
    # n_blocks == 0: nothing to blank out (previously an empty float index
    # array was used here, which NumPy rejects).
    return keep


# Make sure the output folder exists before the first file is written.
os.makedirs('./results', exist_ok=True)

for LOSS_RATE in LOSS_RATES:
    print('%.2f loss start.' % LOSS_RATE)
    for FILENAME in dataset_list:
        # File name without folder and extension, used for the output name and the log line.
        stem = FILENAME.split('/')[-1].split('.')[0]

        # Load, drop empty rows and all-zero rows, then fill gaps already present in the source.
        raw_df = pd.read_csv(DIRECTORY + FILENAME, index_col=['MEAS_DATE']).dropna(how='all')
        raw_df = raw_df[~(raw_df == 0).all(axis=1)].interpolate()
        datetime = pd.DatetimeIndex(raw_df.index)
        env_df = raw_df.values

        mask = _make_keep_mask(env_df, LOSS_RATE)
        observed = mask.astype(bool)

        # -1-filled copy kept under its original name in case a later cell reads it;
        # the recovery below no longer relies on this sentinel.
        missing_df = np.where(observed, env_df, -1)

        # Removed values become NaN straight from the mask, so a genuine reading
        # of -1 is no longer mistaken for a missing value.
        recover_df = pd.DataFrame(np.where(observed, env_df, np.nan),
                                  index=datetime, columns=raw_df.columns)
        # Linear interpolation; bfill covers NaNs at the very start of the series.
        recover_df = recover_df.interpolate().bfill()

        recover_df.to_csv('./results/recovered_%s_%.2fp_LI.csv' % (stem, LOSS_RATE))
        print('%s done.' % stem)

0.10 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.20 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.30 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.40 loss start.
Val_PF_0001396_tom_env done.
Val_PF_0002528_tom_env done.
Val_PF_0002531_tom_env done.
Val_PF_0002532_tom_env done.
Val_PF_0001288_pap_env done.
Val_PF_0001393_pap_env done.
Val_PF_0001400_pap_env done.
Val_PF_0002537_pap_env done.
0.50