# kNN Equal length Time Series Classifier using Locality Sensitive Hashing


In [1]:
import pandas as pd
import numpy as np
import random
from tslearn.utils import to_time_series_dataset
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from datetime import datetime

In [3]:
def generate_random_vectors(minimum=0, maximum=1, vectors_shape=1, num_vectors=1, seed=0):
    """
    Generate the random projection vectors used to split data into bins.

    Parameters
    ----------
    minimum: (float) lower bound of the uniform distribution sampled from
    maximum: (float) upper bound of the uniform distribution sampled from
    vectors_shape: (int) number of components of each random vector
    num_vectors: (int) number of random vectors to generate
    seed: (int) seed used to make the generated vectors reproducible

    Returns
    -------
    numpy array of shape (vectors_shape, num_vectors); each column is one
    random vector

    Notes
    -----
    NOTE(review): with the default range [0, 1) every component is
    non-negative, so projections of non-negative data are never below zero;
    consider a range centred on zero (e.g. minimum=-1) -- confirm with callers.
    """
    # A private generator yields the same values as seeding the global one,
    # without resetting the random state of the rest of the program.
    rng = np.random.RandomState(seed)
    return rng.uniform(minimum, maximum, (num_vectors, vectors_shape)).T

In [4]:
def partition_into_bins(data, random_vectors):
    """
    Compute the binary signature of every data point.

    Parameters
    ----------
    data: (pandas DataFrame or numpy array) one data point per row
    random_vectors: (numpy array) one random vector per column

    Returns
    -------
    integer array with one row per data point and one column per random
    vector; an entry is 1 when the projection of the point onto that vector
    is greater than or equal to zero, and 0 otherwise
    """
    projections = data.dot(random_vectors)
    on_positive_side = projections >= 0
    return np.array(on_positive_side, dtype=int)

In [5]:
def bins_table(bin_indices):
    """
    Group data positions by the bin they were assigned to.

    Parameters
    ----------
    bin_indices: sequence holding the bin index of every data point, in
        data order

    Returns
    -------
    dictionary mapping each bin index to the list of positions (in
    increasing order) of the data points that fall into that bin
    """
    table = {}
    for position, bucket in enumerate(bin_indices):
        # Create the bucket on first sight, then record this data position.
        table.setdefault(bucket, []).append(position)
    return table

In [1]:
def train_LSH(data, num_vectors, seed):
    """
    Locality Sensitive Hashing training function

    Parameters
    ----------
    data: dataframe or array used to train, one sample per row (or per
        first-axis entry); each sample is flattened before hashing
    num_vectors: number of random vectors, i.e. number of bits of a bin index
    seed: seed to replicate values

    Returns
    -------
    dictionary with the keys
        'bin_index_bits': binary signature of every training sample
        'bin_indices': integer bin index of every training sample
        'table': mapping bin index -> positions of the samples in that bin
        'random_vectors': the random vectors used to compute the signatures
    """
    # np.asarray lets DataFrames and nested lists through; they have no
    # ndarray-style reshape(rows, -1) of their own.
    data = np.asarray(data)
    data = data.reshape(len(data), -1)
    random_vectors = generate_random_vectors(
        vectors_shape=data.shape[1], num_vectors=num_vectors, seed=seed
    )
    bin_index_bits = partition_into_bins(data, random_vectors)
    # Read each signature as a binary number, most significant bit first.
    powers_of_two = 1 << np.arange(num_vectors - 1, -1, -1)
    bin_indices = bin_index_bits.dot(powers_of_two)
    return {
        'bin_index_bits': bin_index_bits,
        'bin_indices': bin_indices,
        'table': bins_table(bin_indices),
        'random_vectors': random_vectors,
    }

In [2]:
def DTW_models(X_train, y_train, trained_LSH, n_neighbors=2):
    """
    Fits a k-NN time series model for every bin

    Parameters
    ----------
    X_train: (array) corresponding to the data features for the model to learn
    y_train: (array) corresponding to the data labels for the model to learn
    trained_LSH: (dictionary) as returned by train_LSH; only its 'table'
        entry (bin index -> training sample positions) is read
    n_neighbors: (int) number of neighbours requested from each model;
        capped at the number of samples available in the bin

    Returns
    -------
    dictionary mapping each bin index to the classifier fitted on the
    training samples of that bin
    """
    # Positional indexing below must not depend on the container type.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    table_model = dict()
    for bin_index, member_positions in trained_LSH['table'].items():
        X_train_ts = to_time_series_dataset(X_train[member_positions])
        y_train_ts = y_train[member_positions]
        # A bin may hold fewer samples than n_neighbors (often just one);
        # asking for more neighbours than samples makes prediction fail.
        k = min(n_neighbors, len(member_positions))
        knn = KNeighborsTimeSeriesClassifier(n_neighbors=k)
        table_model[bin_index] = knn.fit(X_train_ts, y_train_ts)
    return table_model

In [3]:
def DTW_predict(table_LSH, X_test):
    """
    Function used to predict X_test array

    Parameters
    ----------
    table_LSH: (dictionary) as returned by DTW_classifier; its
        'random_vectors' and 'models' entries are read
    X_test: (array) corresponding to the data features for the model to predict

    Returns
    -------
    array with one predicted label per test sample, in the order of X_test
    """
    X_test = np.asarray(X_test)
    if len(X_test) == 0:
        return np.array([])

    random_vectors = table_LSH['random_vectors']
    models = table_LSH['models']
    # One column per random vector, so this is the number of signature bits.
    num_vectors = random_vectors.shape[1]

    # Hash the flattened samples exactly as done at training time, but keep
    # the original layout for prediction so it matches what the models saw.
    X_flat = X_test.reshape(len(X_test), -1)
    bin_index_bits_test = partition_into_bins(X_flat, random_vectors)
    powers_of_two = 1 << np.arange(num_vectors - 1, -1, -1)
    bin_indices_test = bin_index_bits_test.dot(powers_of_two)
    test_table = bins_table(bin_indices_test)

    positions = []
    predictions = []
    # Iterate over the bins that actually contain test samples: a trained bin
    # may receive none, and a test sample may land in an untrained bin.
    for bin_index, members in test_table.items():
        model = models.get(bin_index)
        if model is None:
            # No model for this bin: fall back to the trained bin whose
            # signature differs in the fewest bits (Hamming distance).
            nearest = min(
                models,
                key=lambda trained: bin(int(trained) ^ int(bin_index)).count("1"),
            )
            model = models[nearest]
        positions.extend(members)
        predictions.append(np.asarray(model.predict(X_test[members])))

    # Restore the original sample order without mixing positions and labels
    # in a single array (which would break sorting for non-numeric labels).
    order = np.argsort(positions)
    return np.concatenate(predictions)[order]

In [4]:
def DTW_classifier(X_train, y_train, num_vectors, seed):
    """
    Train the LSH partition and fit one k-NN model per bin

    Parameters
    ----------
    X_train: (array) corresponding to the data features for the model to learn
    y_train: (array) corresponding to the data labels for the model to learn
    num_vectors: number of random vectors used to hash the data into bins
    seed: seed to replicate values

    Returns
    -------
    the dictionary returned by train_LSH, extended with a 'models' entry
    holding the per-bin models returned by DTW_models
    """
    classifier = train_LSH(X_train, num_vectors, seed)
    fitted_models = DTW_models(X_train, y_train, classifier)
    classifier['models'] = fitted_models
    return classifier

In [10]:
# Hash the training set with 6 random vectors and fit one model per bin.
# X_train / y_train are expected to be defined by an earlier cell.
table_LSH = DTW_classifier(
    X_train=X_train,
    y_train=y_train,
    num_vectors=6,
    seed=0,
)

In [15]:
# DTW_predict takes only the trained table and the test features; labels are
# not needed to predict (passing y_test raised a TypeError).
y_hat = DTW_predict(table_LSH, X_test)