# Lagged Time-Series Clustering Simulations

In [4]:
import random
import os
from multiprocessing import Pool
from functools import partial
import numpy as np
from fastdtw import fastdtw
from ananke._database_rework import TimeSeriesData
from ananke._ts_simulation import gen_table
from ananke._input import fasta_to_ananke
from DBloomSCAN import DBloomSCAN

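Pre-compute the distances needed to cluster the data, using a given distance function. The data matrix can either be loaded into memory or read from disk row by row; reading from disk is only necessary for really large data sets (or on a memory-limited machine such as my laptop).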

In [5]:
def compute_distances(tsd, distance_function, in_memory=True, n_threads=1):
    """Build a DBloomSCAN structure over the time-series matrix and pre-compute its distances.

    Parameters
    ----------
    tsd
        Object exposing the matrix as ``tsd._h5t["data/timeseries/matrix"]``.
    distance_function
        Callable handed to ``DBloomSCAN`` unchanged.
    in_memory : bool
        When True the whole matrix is copied into a numpy array once and rows
        are served from that copy; when False every row is read from
        ``tsd._h5t`` at the moment it is requested.
    n_threads : int
        Forwarded to ``DBloomSCAN.compute_distances``.

    Returns
    -------
    The ``DBloomSCAN`` instance, after its ``compute_distances`` method has run.
    """
    matrix_on_disk = tsd._h5t["data/timeseries/matrix"]
    n_objects = matrix_on_disk.shape[0]

    if in_memory:
        data_matrix = np.empty(matrix_on_disk.shape)
        print("Loading data matrix into memory")
        matrix_on_disk.read_direct(data_matrix)

        def fetch_row(index):
            return data_matrix[index, :]
    else:
        #If the data is too large for RAM, each row is pulled from disk on demand
        def fetch_row(index):
            return tsd._h5t["data/timeseries/matrix"][index, :]

    def retrieve_data(index):
        #Rows are rescaled so that their entries sum to 1 before being compared
        row = fetch_row(index)
        return row/sum(row)

    print("Initializing BloomDistance structure")
    dbl = DBloomSCAN(n_objects, distance_function, retrieve_data,
                     dist_min = 0.0001, dist_max=0.015, dist_step=0.0005)
    print("Pre-computing distances")
    #Author's note: n_threads should be 1 unless you're using DDTW, and that
    #is suspected to crash anyway. Worth a shot somewhere with more RAM.
    dbl.compute_distances(n_threads=n_threads)
    return dbl

In [6]:
#Wrap the Mendota.h5 file in a TimeSeriesData object; every later cell reads the matrix through `tsd._h5t`
tsd = TimeSeriesData("Mendota.h5")
#NOTE(review): the two commented-out calls below look like the one-off import that populated the file.
#They use absolute paths from the author's machine - confirm and parameterise before re-enabling.
#tsd.initialize_from_metadata("/home/mwhall/Documents/Ananke/Ananke_PeerJ/McMahon_Mendota/sequence_data/METADATA_modified.txt", name_col="#SampleID", time_col="time_points")
#fasta_to_ananke(open("/home/mwhall/Documents/Ananke/Ananke_PeerJ/McMahon_Mendota/sequence_data/seq.fasta"), tsd, push_at=1e9)

In [7]:
#Sanity check of the store: its printed summary, the names found under "data", and the matrix shape
print(tsd)
#Iterating the "data" entry yields one name per child (only "timeseries" in the recorded output)
for group in tsd._h5t["data"]:
    print(group)
#Shape is (number of time series, number of time points)
print(tsd._h5t["data/timeseries/matrix"].shape)

Origin version: 0.4.0
Num. of Time Points: 96
Num. of Time Series: 6116298
timeseries
(35788, 96)


In [8]:
#Handle to the stored time-series matrix; the filtering loop later in this cell overwrites and resizes it in place
matrix = tsd._h5t["data/timeseries/matrix"]
def chunks(N, nb):
    """Split the index range [0, N) into `nb` consecutive (start, stop) pairs.

    Boundaries are the rounded multiples of N / nb, so consecutive pairs share
    an endpoint and a pair may be empty when nb exceeds N.
    """
    step = N / nb
    pairs = []
    for i in range(nb):
        start = round(step*i)
        stop = round(step*(i+1))
        pairs.append((start, stop))
    return pairs
#tsd = TimeSeriesData("Mendota.h5")
#Dimensions of the matrix before any rows are filtered out
nrows, ncols = matrix.shape

#Rows with this many non-zero entries or fewer are treated as too sparse
threshold = 20
def filter_function(row):
    """Return True when `row` has at most `threshold` non-zero entries.

    The filtering loop in this cell drops every row for which this is True.
    """
    nonzero_entries = np.count_nonzero(row)
    return nonzero_entries <= threshold

#Compact the matrix in place: every row that survives filter_function is moved
#up to position `cursor`, so that the kept rows end up contiguous at the top.
#`cursor` is always <= the row being examined, and each chunk is read into
#memory before anything in it is overwritten, so no unread row is clobbered.
cursor = 0
#Grab big chunks for efficiency
for i, j in chunks(nrows, 10000):
    rows = matrix[i:j,:]

    for offset, row in enumerate(rows):
        if filter_function(row):
            continue
        #Skip the write when the row is already where it belongs
        if i + offset != cursor:
            matrix[cursor, :] = row
        cursor += 1
#`cursor` now equals the number of kept rows (they occupy 0..cursor-1), so the
#new length is `cursor` itself; using cursor - 1 would drop the last kept row
#(and one more row on every re-run of this cell).
#NOTE(review): this assumes the dataset was created resizable - confirm.
matrix.resize(size=(cursor, ncols))

In [8]:
#Which distance to use: "sts", "dtw" or "ddtw"
distance_measure = "dtw"
#compute_sts_distance divides by the module-level `time_delta` (the spacing
#between consecutive time points). Define it whenever "sts" is selected, so
#that choosing it does not fail with a NameError.
#NOTE(review): assumes the time points are stored at "data/timeseries/time" - confirm.
if distance_measure == "sts":
    time_points = [int(x) for x in tsd._h5t["data/timeseries/time"][:]]
    time_delta = np.array(time_points[1:]) - np.array(time_points[0:-1])
#data_matrix = np.empty(tsd._h5t["data/timeseries/matrix"].shape)
#tsd._h5t["data/timeseries/matrix"].read_direct(data_matrix)

def compute_ddtw_distance(data1, data2):
    """Return the final corner entry of the matrix produced by ``DDTW(data1, data2)``.

    ``DDTW`` must return a pair; the first item is indexed with ``[-1, -1]``
    and the second item is ignored.

    NOTE(review): ``DDTW`` is neither imported nor defined in the visible part
    of this notebook, so this raises NameError unless it is provided elsewhere.
    """
    cost_matrix, _ = DDTW(data1, data2)
    return cost_matrix[-1, -1]

def compute_dtw_distance(data1, data2):
    """Return the distance reported by ``fastdtw`` for the two series.

    ``fastdtw`` returns a (distance, path) pair; only the distance is used.
    """
    result = fastdtw(data1, data2)
    dtw_cost, _ = result
    return dtw_cost

def compute_sts_distance(data1, data2):
    """Euclidean distance between the step-wise slopes of two series.

    For each series the difference between consecutive values is divided by
    ``time_delta``; the result is the square root of the summed squared
    differences between the two slope vectors.

    ``time_delta`` is read from module scope and must be defined there before
    this function is called, with a shape compatible with the consecutive
    differences.
    """
    first = np.array(data1)
    second = np.array(data2)
    first_slope = (first[1:] - first[:-1]) / time_delta
    second_slope = (second[1:] - second[:-1]) / time_delta
    squared_gap = np.square(first_slope - second_slope)
    return np.sqrt(sum(squared_gap))

#Map each supported measure name to its implementation
distance_functions = {
    "sts": compute_sts_distance,
    "dtw": compute_dtw_distance,
    "ddtw": compute_ddtw_distance,
}
#Fail loudly on an unknown name; otherwise `distance_function` would be
#undefined, or silently left over from an earlier run of this cell
if distance_measure not in distance_functions:
    raise ValueError("Unknown distance_measure %r; expected one of %s"
                     % (distance_measure, sorted(distance_functions)))
distance_function = distance_functions[distance_measure]

#Long-running step: builds the DBloomSCAN structure and pre-computes its distances
dists = compute_distances(tsd, distance_function, in_memory=True, n_threads=1)

#Load once so we don't load it a billion times
   

Loading data matrix into memory
Initializing BloomDistance structure




After 71576 samples of the distances, the max distance was 1.951635
Pre-computing distances
54.01%

KeyboardInterrupt: 

These scores represent the best clustering result achievable, across all epsilon values, for that given seed signal. More intuitively, this represents the ability to recover the complete set of signals that are sampled/observed from some underlying process, given knowledge of that process.

In [None]:
#Plot the cluster that corresponds to a given true signal

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
init_notebook_mode(connected=True)

def plot_cluster(true_signal, seed_index, dists):
    """Plot, for every epsilon in ``dists.dist_range``, the cluster grown around a seed signal.

    Parameters
    ----------
    true_signal
        2-D indexable; row ``seed_index`` is the reference signal.
    seed_index
        Row used both in ``true_signal`` and in the stored time-series matrix.
    dists
        Object providing ``dist_range`` and ``DBSCAN(epsilon, expand_around=...)``.

    One figure is drawn per epsilon. It shows the normalised signal, the
    normalised stored series at ``seed_index`` and every member of the first
    cluster returned by ``dists.DBSCAN``. Nothing is returned.

    Relies on module-level names: ``tsd``, ``data_matrix``,
    ``distance_function``, ``timepoints``, ``find_nearest_timeseries`` and
    ``iplot``.
    NOTE(review): several of these are not defined in the visible part of the
    notebook - confirm they exist before running.
    """
    def _normalise(series):
        #Rescale so that the entries sum to 1
        return series/sum(series)

    signal = true_signal[seed_index,:]
    observed = tsd._h5t["data/timeseries/matrix"][seed_index,:]
    nearest_index = find_nearest_timeseries(_normalise(signal),
                                            data_matrix, distance_function, n_threads=1)
    for epsilon in dists.dist_range:
        traces = [{'name':'signal', 'x': timepoints, 'y': _normalise(signal)},
                  {'name':'actual', 'x': timepoints, 'y': _normalise(observed)}]
        clusters = dists.DBSCAN(epsilon, expand_around=nearest_index)
        #Only the first cluster id in the result is plotted
        first_cluster_id = list(clusters.keys())[0]
        for member_id in clusters[first_cluster_id]:
            member = tsd._h5t["data/timeseries/matrix"][member_id,:]
            traces.append({'name':member_id, 'y': _normalise(member), 'x': timepoints})
        iplot(traces)
#Plot the clusters around seed index 2. `true_signal` must already exist in the kernel;
#it is not defined in the visible part of this notebook.
plot_cluster(true_signal, 2, dists)

In [None]:
def ranges(N, nb):
    """Return `nb` consecutive (start, stop) pairs covering 0..N.

    The cut points are round(N / nb * i) for i = 0..nb; each pair joins two
    neighbouring cut points.
    """
    step = N / nb
    cut_points = [round(step*i) for i in range(nb + 1)]
    return list(zip(cut_points[:-1], cut_points[1:]))

In [None]:
#Scratch check: split 6219008 indices into 100 (start, stop) ranges
ranges(6219008, 100)

In [None]:
#Scratch check: the integers 0 through 99
list(range(0,100))