# Test the Experiment class and the functions in experiments.py:

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np

import dtw_missing.experiments as exp

Create a simple dataset:

In [2]:
# Four univariate series of length 7, one per row. The single 1.5 entry makes
# NumPy pick a floating-point dtype for the whole array.
dataset = np.array(
    [
        [1, 1, 1, 5, 5, 2, 2],
        [1, 5, 5, 5, 5, 5, 2],
        [1.5, 5, 2, 2, 2, 2, 2],
        [10, 10, 10, 10, 10, 10, 10],
    ]
)
N = dataset.shape[0]  # number of instances (rows)

Contaminate the dataset with missing values (NaN):

In [3]:
# Overwrite a different pattern of samples with NaN in every series.
# Each entry is (row, column selector) into `dataset`.
missing_positions = [
    (0, 2),            # one isolated sample
    (1, slice(2, 4)),  # two consecutive samples
    (2, -1),           # the last sample
    (3, slice(0, 4)),  # the first four samples
]
for row, cols in missing_positions:
    dataset[row, cols] = np.nan

Set DTW(-AROW) parameters:

In [4]:
# Options handed to the DTW-AROW distance functions in the cells below.
dtw_params = dict(
    window=None,
    max_dist=None,
    use_pruning=False,
    max_step=None,
    max_length_diff=None,
    penalty=None,
    # psi may also be a 4-tuple such as (2, 3, 4, 5):
    # (begin series1, end series1, begin series2, end series2)
    psi=None,
    psi_neg=True,
    use_c=False,
    use_ndim=False,
)

## Test the functions:

### Univariate:

In [5]:
# Reference result: fill the N x N matrix one pair at a time.
# np.ndindex visits (i, j) in the same row-major order as two nested range loops.
D = np.empty((N, N))
for i, j in np.ndindex(N, N):
    D[i, j] = exp.dtw_arow_distance(dataset[i], dataset[j], dtw_params)

print(D)

[[ 0.          4.5126086   4.64578662 15.42724862]
 [ 4.5126086   0.          4.8194681  14.12444689]
 [ 4.64578662  4.8194681   0.         14.11067366]
 [15.42724862 14.12444689 14.11067366  0.        ]]


In [6]:
# Compute the whole pairwise matrix in one call.
D_ = exp.dtw_arow_distance_matrix(dataset=dataset, dtw_params=dtw_params)

print(D_)

# Actual check: the matrix helper must reproduce the pair-by-pair result `D`
# computed in the previous cell. Fails loudly instead of relying on eyeballing
# the two printed matrices.
np.testing.assert_allclose(D_, D)

[[ 0.          4.5126086   4.64578662 15.42724862]
 [ 4.5126086   0.          4.8194681  14.12444689]
 [ 4.64578662  4.8194681   0.         14.11067366]
 [15.42724862 14.12444689 14.11067366  0.        ]]


### Multivariate:

In [7]:
def convert_univariate_into_multivariate(x, a, d):
    """Lift a univariate time series into ``d`` dimensions.

    Every sample of ``x`` is divided by ``sqrt(d)`` and repeated across ``d``
    columns; the offsets in ``a`` are then added. Because of the ``1/sqrt(d)``
    scaling, the Euclidean distance between two lifted samples equals the
    absolute difference of the original samples, and the shared offset
    cancels in any difference.

    Parameters
    ----------
    x : array
        The univariate series (intended to be 1-D, length T).
    a : scalar or array
        Offset(s) added in place with NumPy broadcasting against the lifted
        array, e.g. one constant per dimension (shape ``(d,)``).
    d : int
        Number of dimensions of the result.

    Returns
    -------
    numpy.ndarray
        For a 1-D ``x`` of length T, an array of shape ``(T, d)``. NaN samples
        in ``x`` stay NaN in every dimension.
    """
    scaled = x / np.sqrt(d)
    # Stack d copies as rows, then flip the axes so that time runs along axis 0.
    lifted = np.transpose(np.tile(scaled, (d, 1)))
    lifted += a
    return lifted

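Transform the univariate dataset into a multivariate one without changing the pairwise DTW distances (each value is scaled by 1/sqrt(d) and copied into d dimensions, then a constant per-dimension offset is added):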

In [8]:
d = 3 # #dimensions
# Seeded generator: the offsets (and therefore the printed dataset) are the
# same on every Restart & Run All. The stored output below reflects an earlier
# unseeded run and changes once the cell is re-executed.
rng = np.random.default_rng(0)
a = 10*rng.random(d) - 5 # per-dimension offsets, uniform in [-5, 5)
# One (length, d) array per instance; same leading shape as `dataset`.
dataset_multivariate = np.empty(dataset.shape + (d,))
for i in range(len(dataset)):
    dataset_multivariate[i] = convert_univariate_into_multivariate(dataset[i], a, d)

print(dataset_multivariate)

[[[ 5.27647599 -2.02741264  2.1543471 ]
  [ 5.27647599 -2.02741264  2.1543471 ]
  [        nan         nan         nan]
  [ 7.58587707  0.28198844  4.46374818]
  [ 7.58587707  0.28198844  4.46374818]
  [ 5.85382626 -1.45006237  2.73169737]
  [ 5.85382626 -1.45006237  2.73169737]]

 [[ 5.27647599 -2.02741264  2.1543471 ]
  [ 7.58587707  0.28198844  4.46374818]
  [        nan         nan         nan]
  [        nan         nan         nan]
  [ 7.58587707  0.28198844  4.46374818]
  [ 7.58587707  0.28198844  4.46374818]
  [ 5.85382626 -1.45006237  2.73169737]]

 [[ 5.56515113 -1.73873751  2.44302224]
  [ 7.58587707  0.28198844  4.46374818]
  [ 5.85382626 -1.45006237  2.73169737]
  [ 5.85382626 -1.45006237  2.73169737]
  [ 5.85382626 -1.45006237  2.73169737]
  [ 5.85382626 -1.45006237  2.73169737]
  [        nan         nan         nan]]

 [[        nan         nan         nan]
  [        nan         nan         nan]
  [        nan         nan         nan]
  [        nan         nan        

In [9]:
# Reference result for the multivariate data: fill the N x N matrix one pair at a time.
# np.ndindex visits (i, j) in the same row-major order as two nested range loops.
D = np.empty((N, N))
for i, j in np.ndindex(N, N):
    D[i, j] = exp.dtw_arow_distance(dataset_multivariate[i], dataset_multivariate[j], dtw_params)

print(D)

[[ 0.          4.5126086   4.64578662 15.42724862]
 [ 4.5126086   0.          4.8194681  14.12444689]
 [ 4.64578662  4.8194681   0.         14.11067366]
 [15.42724862 14.12444689 14.11067366  0.        ]]


In [10]:
# Compute the whole pairwise matrix for the multivariate data in one call.
D_ = exp.dtw_arow_distance_matrix(dataset=dataset_multivariate, dtw_params=dtw_params)

print(D_)

# Actual check: the matrix helper must reproduce the pair-by-pair result `D`
# computed in the previous cell.
np.testing.assert_allclose(D_, D)

[[ 0.          4.5126086   4.64578662 15.42724862]
 [ 4.5126086   0.          4.8194681  14.12444689]
 [ 4.64578662  4.8194681   0.         14.11067366]
 [15.42724862 14.12444689 14.11067366  0.        ]]


### Univariate variable-length:

In [11]:
lengths = [6, 7, 4, 5]  # how many leading samples to keep from each series in dataset

# Truncate row i of `dataset` to its first lengths[i] samples.
dataset_variable_length = [row[:lengths[i]] for i, row in enumerate(dataset)]

print(dataset_variable_length)

[array([ 1.,  1., nan,  5.,  5.,  2.]), array([ 1.,  5., nan, nan,  5.,  5.,  2.]), array([1.5, 5. , 2. , 2. ]), array([nan, nan, nan, nan, 10.])]


In [12]:
# Reference result for the variable-length data: fill the N x N matrix one pair at a time.
# np.ndindex visits (i, j) in the same row-major order as two nested range loops.
D = np.empty((N, N))
for i, j in np.ndindex(N, N):
    D[i, j] = exp.dtw_arow_distance(dataset_variable_length[i], dataset_variable_length[j], dtw_params)

print(D)

[[ 0.          4.5607017   4.5338235  12.77367084]
 [ 4.5607017   0.          6.10555303 15.09966887]
 [ 4.5338235   6.10555303  0.                 inf]
 [12.77367084 15.09966887         inf  0.        ]]


In [13]:
# Compute the whole pairwise matrix for the variable-length data in one call.
D_ = exp.dtw_arow_distance_matrix(dataset=dataset_variable_length, dtw_params=dtw_params)

print(D_)

# Actual check: the matrix helper must reproduce the pair-by-pair result `D`
# computed in the previous cell (entries that are inf in both count as equal).
np.testing.assert_allclose(D_, D)

[[ 0.          4.5607017   4.5338235  12.77367084]
 [ 4.5607017   0.          6.10555303 15.09966887]
 [ 4.5338235   6.10555303  0.                 inf]
 [12.77367084 15.09966887         inf  0.        ]]


### Multivariate variable-length:

In [14]:
d = 3  # number of dimensions

# Lift every variable-length series to d dimensions, reusing the offsets `a`
# defined in the earlier multivariate cell.
dataset_multivariate_variable_length = [
    convert_univariate_into_multivariate(series, a, d)
    for series in dataset_variable_length
]

print(dataset_multivariate_variable_length)

[array([[ 5.27647599, -2.02741264,  2.1543471 ],
       [ 5.27647599, -2.02741264,  2.1543471 ],
       [        nan,         nan,         nan],
       [ 7.58587707,  0.28198844,  4.46374818],
       [ 7.58587707,  0.28198844,  4.46374818],
       [ 5.85382626, -1.45006237,  2.73169737]]), array([[ 5.27647599, -2.02741264,  2.1543471 ],
       [ 7.58587707,  0.28198844,  4.46374818],
       [        nan,         nan,         nan],
       [        nan,         nan,         nan],
       [ 7.58587707,  0.28198844,  4.46374818],
       [ 7.58587707,  0.28198844,  4.46374818],
       [ 5.85382626, -1.45006237,  2.73169737]]), array([[ 5.56515113, -1.73873751,  2.44302224],
       [ 7.58587707,  0.28198844,  4.46374818],
       [ 5.85382626, -1.45006237,  2.73169737],
       [ 5.85382626, -1.45006237,  2.73169737]]), array([[        nan,         nan,         nan],
       [        nan,         nan,         nan],
       [        nan,         nan,         nan],
       [        nan,         nan,

In [15]:
# Reference result for the multivariate variable-length data: one pair at a time.
# np.ndindex visits (i, j) in the same row-major order as two nested range loops.
D = np.empty((N, N))
for i, j in np.ndindex(N, N):
    D[i, j] = exp.dtw_arow_distance(dataset_multivariate_variable_length[i], dataset_multivariate_variable_length[j], dtw_params)

print(D)

[[ 0.          4.5607017   4.5338235  12.77367084]
 [ 4.5607017   0.          6.10555303 15.09966887]
 [ 4.5338235   6.10555303  0.                 inf]
 [12.77367084 15.09966887         inf  0.        ]]


In [16]:
# Compute the whole pairwise matrix for the multivariate variable-length data in one call.
D_ = exp.dtw_arow_distance_matrix(dataset=dataset_multivariate_variable_length, dtw_params=dtw_params)

print(D_)

# Actual check: the matrix helper must reproduce the pair-by-pair result `D`
# computed in the previous cell (entries that are inf in both count as equal).
np.testing.assert_allclose(D_, D)

[[ 0.          4.5607017   4.5338235  12.77367084]
 [ 4.5607017   0.          6.10555303 15.09966887]
 [ 4.5338235   6.10555303  0.                 inf]
 [12.77367084 15.09966887         inf  0.        ]]


## Test the Experiment class:

In [17]:
# Same comparison through the Experiment wrapper: attach the (NaN-contaminated,
# fixed-length, univariate) dataset, request the 'dtw_arow' pairwise distances,
# then read them back and print them.
e_dtw_arow = exp.Experiment()
e_dtw_arow.dataset = dataset
# No method parameters are passed here — presumably Experiment falls back to
# its own defaults; verify against experiments.py.
e_dtw_arow.compute_pairwise_distances('dtw_arow')
print(e_dtw_arow.get_pairwise_distances())

[[ 0.          4.5126086   4.64578662 15.42724862]
 [ 4.5126086   0.          4.8194681  14.12444689]
 [ 4.64578662  4.8194681   0.         14.11067366]
 [15.42724862 14.12444689 14.11067366  0.        ]]


In [25]:
# Extra options for the DTW-CAI method; the DTW-AROW options in `dtw_params`
# (defined earlier) are passed alongside these, unchanged.
dtw_cai_params = {
    'no_clusters': 'elbow',
    'no_clusters_range_for_elbow': (2, 4),
}

In [57]:
# Same flow for the 'dtw_cai' method, this time with explicit parameters.
e_dtw_cai = exp.Experiment()
e_dtw_cai.dataset = dataset
# missing_method_params is a two-item list, [dtw_params, dtw_cai_params]
# (presumably consumed positionally — verify against Experiment).
# The call is the last expression of the cell, so its return value is what the
# notebook displays as the cell output.
# NOTE(review): the stored output of this cell equals the DTW-AROW matrix
# printed above digit for digit — confirm that is expected for this dataset.
e_dtw_cai.compute_pairwise_distances('dtw_cai', 
                                     missing_method_params=[dtw_params, dtw_cai_params],
                                     progress_bar=True, n_jobs=1)

Computing DTW-AROW distances...


100%|██████████| 4/4 [00:00<00:00, 1336.62it/s]


Executing the clustering in DTW-CAI...
Running the elbow method ...
DBA initialized by the medoid

Iteration0:


calculating distances: 100%|██████████| 4/4 [00:00<00:00, 803.85it/s]
imputing: 100%|██████████| 3/3 [00:00<00:00, 567.08it/s]
updating the means: 100%|██████████| 3/3 [00:00<00:00, 726.16it/s]



Iteration1:


calculating distances: 100%|██████████| 4/4 [00:00<00:00, 1121.10it/s]
imputing: 100%|██████████| 3/3 [00:00<00:00, 924.47it/s]


cluster 0 (counting from 0) has a completely missing mean! Assigning another instance to it...
cluster 1 (counting from 0) is empty! Assigning another instance to it...
cluster 2 (counting from 0) is empty! Assigning another instance to it...


updating the means: 100%|██████████| 3/3 [00:00<00:00, 605.50it/s]



Iteration2:


calculating distances: 100%|██████████| 4/4 [00:00<00:00, 793.66it/s]
imputing: 100%|██████████| 3/3 [00:00<00:00, 585.80it/s]
updating the means: 100%|██████████| 3/3 [00:00<00:00, 667.88it/s]



Iteration3:


calculating distances: 100%|██████████| 4/4 [00:00<00:00, 956.29it/s]
imputing: 100%|██████████| 3/3 [00:00<00:00, 758.88it/s]
updating the means: 100%|██████████| 3/3 [00:00<00:00, 689.40it/s]



Iteration4:


calculating distances: 100%|██████████| 4/4 [00:00<00:00, 773.50it/s]
imputing: 100%|██████████| 3/3 [00:00<00:00, 546.87it/s]
imputing: 100%|██████████| 3/3 [00:00<00:00, 551.28it/s]



Calculating DRW-AROW distances based on the imputed data...


100%|██████████| 4/4 [00:00<00:00, 1544.15it/s]

DTW-CAI completed.





array([[ 0.        ,  4.5126086 ,  4.64578662, 15.42724862],
       [ 4.5126086 ,  0.        ,  4.8194681 , 14.12444689],
       [ 4.64578662,  4.8194681 ,  0.        , 14.11067366],
       [15.42724862, 14.12444689, 14.11067366,  0.        ]])