# Imputation Experiment on MUJOCO Dataset

Forked from the [SSSD repo](https://github.com/AI4HealthUOL/SSSD):

We collected the dataset directly from [NRTSI repository](https://github.com/lupalab/NRTSI/tree/main/codes_regularly-sampled), which provides a [link](https://www.dropbox.com/s/pjccc2piis8g2fx/mujoco_train.npy?dl=0) for the train set, and another [link](https://www.dropbox.com/s/ktkswh77sueqfy8/mujoco_test.npy?dl=0) for the test set.  

Shan, Siyuan, Yang Li, and Junier B. Oliva. "NRTSI: Non-recurrent time series imputation." arXiv preprint arXiv:2102.03340 (2021).

In [1]:
import os
import torch
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append(os.path.join(os.path.dirname('__file__'), '../'))

from engine.solver import Trainer
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from Utils.io_utils import load_yaml_config, instantiate_from_config
from Models.interpretable_diffusion.model_utils import normalize_to_neg_one_to_one, unnormalize_to_zero_to_one

In [2]:
def random_mask(observed_values, missing_ratio=0.1, seed=1984, exclude_features=None):
    """Hide a reproducible random subset of the observed entries.

    Args:
        observed_values: NumPy array in which NaN marks entries that are
            already missing.
        missing_ratio: Fraction of the observed entries to hide. The count is
            ``int(n_observed * missing_ratio)``, i.e. truncated toward zero.
        seed: Seed for the selection. The same seed and the same observed
            pattern always give the same mask.
        exclude_features: Optional index (or indices) along axis 1 whose
            entries are marked as unobserved before sampling.

    Returns:
        A tuple ``(observed_values, observed_masks, gt_masks)``:
        ``observed_values`` is the input passed through ``np.nan_to_num``;
        ``observed_masks`` is True where the input is not NaN (and not
        excluded); ``gt_masks`` equals ``observed_masks`` with the randomly
        selected entries set to False.

    Raises:
        ValueError: If more entries are requested than are observed
            (``missing_ratio`` > 1).
    """
    observed_masks = ~np.isnan(observed_values)
    if exclude_features is not None:
        observed_masks[:, exclude_features] = False

    # Work on a flat copy so observed_masks itself is returned untouched.
    flat_masks = observed_masks.reshape(-1).copy()
    obs_indices = np.flatnonzero(flat_masks)
    n_missing = int(len(obs_indices) * missing_ratio)

    # A private legacy RandomState draws the same stream as seeding the global
    # RNG would, but never touches the global state -- so there is nothing to
    # restore, even when choice() raises.
    rng = np.random.RandomState(seed)
    miss_indices = rng.choice(obs_indices, n_missing, replace=False)

    flat_masks[miss_indices] = False
    gt_masks = flat_masks.reshape(observed_masks.shape)
    return np.nan_to_num(observed_values), observed_masks, gt_masks

In [3]:
class MUJOCODataset(Dataset):
    """Dataset over a 3-D array, indexed along its first axis.

    With ``regular=True`` each item is the sample as a float tensor. With
    ``regular=False`` a boolean mask is generated once per sample at
    construction time and each item is a ``(sample, mask)`` pair.
    """

    def __init__(self, data, regular=True, ratio=0.):
        """Store the samples and, if requested, pre-compute their masks.

        Args:
            data: NumPy array with three dimensions; the first axis indexes
                the samples.
            regular: If True, no masks are generated and ``self.mask`` is an
                empty boolean array of shape ``(0, data.shape[1], data.shape[2])``.
            ratio: Passed to ``random_mask`` as ``missing_ratio`` when
                ``regular`` is False. ``random_mask`` is called with its
                default seed for every sample.
        """
        super().__init__()
        self.sample_num = data.shape[0]
        self.samples = data
        self.regular = regular

        # Collect the per-sample masks in a list and stack once; growing an
        # array inside the loop would re-copy every earlier mask each time.
        per_sample_masks = []
        if not self.regular:
            for i in range(data.shape[0]):
                *_, mask = random_mask(data[i, :, :], ratio)
                per_sample_masks.append(mask)

        if per_sample_masks:
            self.mask = np.stack(per_sample_masks).astype(bool)
        else:
            self.mask = np.empty([0, data.shape[1], data.shape[2]], dtype=bool)

    def __getitem__(self, ind):
        """Return sample ``ind``, together with its mask unless ``regular``."""
        x = torch.from_numpy(self.samples[ind, :, :]).float()
        if self.regular:
            return x
        return x, torch.from_numpy(self.mask[ind, :, :])

    def __len__(self):
        """Return the number of samples."""
        return self.sample_num

In [4]:
# Train split, downloaded from
# https://www.dropbox.com/s/pjccc2piis8g2fx/mujoco_train.npy?dl=0
train = np.load('../Data/mujoco_train.npy')

# Test split, downloaded from
# https://www.dropbox.com/s/ktkswh77sueqfy8/mujoco_test.npy?dl=0
test = np.load('../Data/mujoco_test.npy')

# Quick sanity check of both array shapes.
print(train.shape, test.shape)

In [5]:
# Fit the min-max scaler on the training split only (features live on the
# last axis, so the data is flattened to 2-D for the scaler), then pass the
# result through normalize_to_neg_one_to_one and restore the original shape.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train.reshape(-1, train.shape[-1]))
train_scaled = normalize_to_neg_one_to_one(train_scaled).reshape(train.shape)

# The test split reuses the statistics fitted on the training split.
test_scaled = normalize_to_neg_one_to_one(
    scaler.transform(test.reshape(-1, test.shape[-1])).reshape(test.shape)
)

# Training uses the fully observed ("regular") dataset, so no masks are built.
train_dataset = MUJOCODataset(train_scaled)
dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    sampler=None,
)

In [6]:
class Args_Example:
    """Plain settings holder standing in for command-line arguments.

    Attributes are the YAML config path, the output directory and the GPU
    index. Constructing an instance creates the output directory if it does
    not exist yet.
    """

    def __init__(self) -> None:
        self.gpu = 0
        self.config_path = '../Config/mujoco_sssd.yaml'
        self.save_dir = '../imputation_exp'
        # Ensure the output directory is present before anything writes to it.
        os.makedirs(self.save_dir, exist_ok=True)

# Read the experiment settings and the YAML configuration they point to.
args = Args_Example()
configs = load_yaml_config(args.config_path)

# Use the selected GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{args.gpu}')
else:
    device = torch.device('cpu')

# Build the model from the 'model' section of the config and move it to the
# chosen device before handing it to the trainer.
model = instantiate_from_config(configs['model'])
model = model.to(device)
trainer = Trainer(
    config=configs,
    args=args,
    model=model,
    dataloader={'dataloader': dataloader},
)

In [7]:
# Run training with the Trainer built from the config, model and dataloader.
# NOTE(review): step count, checkpointing, etc. presumably come from the YAML
# config -- confirm in engine.solver.Trainer.
trainer.train()

In [8]:
sample_num, seq_length, feat_num = test_scaled.shape

# MSE on the hidden entries, one value per missing ratio, in loop order.
# Created once outside the loop so the scores are kept after it finishes.
mses = []

for missing_ratio in [0.7, 0.8, 0.9]:
    # NOTE(review): `samples` is re-created on every iteration, so after the
    # loop it holds only the last ratio's imputations (with a leading axis of
    # length 1). Hoist it above the loop if accumulation was intended.
    samples = np.empty([0, sample_num, seq_length, feat_num])

    # regular=False makes the dataset yield (sample, mask) pairs.
    test_dataset = MUJOCODataset(test_scaled, regular=False, ratio=missing_ratio)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=0, pin_memory=True, sampler=None)

    sample, *_ = trainer.restore(test_dataloader, shape=[seq_length, feat_num], coef=1e-2, stepsize=5e-2, sampling_steps=200)

    # Undo the scaling applied before training: first unnormalize_to_zero_to_one,
    # then the fitted scaler's inverse transform, on the flattened 2-D view.
    sample = scaler.inverse_transform(unnormalize_to_zero_to_one(sample.reshape(-1, feat_num))).reshape(sample.shape)
    samples = np.vstack([samples, np.expand_dims(sample, 0)])

    # Score only the entries where the mask is False, comparing against the
    # raw (unscaled) test data; ground truth goes first per sklearn convention.
    mask = test_dataset.mask
    mse = mean_squared_error(test[~mask], sample[~mask])
    mses.append(mse)
    print(f'Now with {missing_ratio} unobserved: {mse}')