# SCC5977 - Aprendizado de Máquina para Séries Temporais (2024)

## Grupo
> André Guarnier De Mitri - 11395579 \
> Fabio \
> Giovanni

## Problema
Icentia11k: distância euclidiana vs. DDTW

# VERSÃO 1.0 SEM PROCESSAMENTO PARALELO

In [None]:
# Import necessary libraries
from aeon.distances import pairwise_distance, ddtw_distance
import numpy as np
from aeon.datasets import load_from_tsfile
from itertools import combinations
import matplotlib.pyplot as plt
from tqdm import tqdm
import time


def load_and_validate_data(file_path):
    """Read a ``.ts`` file and return only its time series.

    The file is parsed with ``load_from_tsfile``; the labels returned next to
    the series are discarded. A one-line summary built from the loaded
    object's ``shape`` is printed. Despite the function name, no explicit
    validation is performed here.

    Args:
        file_path: Path of the ``.ts`` file to load.

    Returns:
        The series collection produced by ``load_from_tsfile``.
    """
    series, _labels = load_from_tsfile(full_file_path_and_name=file_path)
    n_series, per_series_shape = series.shape[0], series.shape[1:]
    print(f"Loaded {n_series} time series with shape {per_series_shape}.")
    return series


def calculate_distances(X, metric_function, metric_name):
    """Compute the distance between every unordered pair of series in ``X``.

    A ``tqdm`` progress bar labelled with ``metric_name`` is advanced once per
    pair, and the total duration is printed when the loop finishes.

    Args:
        X: Array-like indexed as ``X[:, 0]``; only index 0 of the second axis
            is compared (presumably the single channel of a univariate
            collection -- confirm for multivariate data).
        metric_function: Callable taking two series and returning a value
            convertible with ``float``.
        metric_name: Human-readable metric name used in the printed messages
            and as the progress-bar description.

    Returns:
        tuple: ``(distances, elapsed_time)`` where ``distances`` is a list of
        ``(distance, i, j)`` tuples with ``i < j`` and ``elapsed_time`` is the
        duration of the pairwise loop in seconds.
    """
    num_ts = X.shape[0]
    total_combinations = (num_ts * (num_ts - 1)) // 2  # n choose 2

    print(f"Calculating {metric_name}...")
    distances = []
    # perf_counter is monotonic, so a system clock adjustment during a long
    # run cannot distort the reported duration.
    start_time = time.perf_counter()
    series = X[:, 0]

    with tqdm(total=total_combinations, desc=metric_name) as pbar:
        for i, j in combinations(range(num_ts), 2):
            distances.append((float(metric_function(series[i], series[j])), i, j))
            pbar.update(1)

    elapsed_time = time.perf_counter() - start_time
    print(f"{metric_name} completed in {elapsed_time:.2f} seconds.")
    return distances, elapsed_time

def summarize_distances(distances, distance_type):
    """Print summary statistics for a set of pairwise distances.

    Prints the median distance and the smallest and largest distances together
    with the indices of the series involved.

    Args:
        distances: Sequence of ``(distance, i, j)`` tuples.
        distance_type: Label used in the printed report.

    Returns:
        tuple: ``(smallest, largest)``, the entries of ``distances`` with the
        minimum and maximum distance (the first one encountered on ties).

    Raises:
        ValueError: If ``distances`` is empty.
    """
    distances_only = [entry[0] for entry in distances]
    if not distances_only:
        # min()/max() would raise ValueError anyway; fail with a message that
        # says what is actually wrong.
        raise ValueError(
            f"Cannot summarize {distance_type}: no distances were provided."
        )

    def by_distance(entry):
        return entry[0]

    smallest = min(distances, key=by_distance)
    largest = max(distances, key=by_distance)
    median_dist = np.median(distances_only)

    print(f"\n{distance_type} Analysis:")
    print(f"Median Distance: {median_dist:.4f}")
    print(f"Smallest Distance: {smallest[0]:.4f} (between series {smallest[1]} and {smallest[2]})")
    print(f"Largest Distance: {largest[0]:.4f} (between series {largest[1]} and {largest[2]})")

    return smallest, largest


def plot_time_series_comparison(X, pair, title, axs):
    """Draw the two series referenced by ``pair`` on a single axes object.

    Args:
        X: Collection indexed as ``X[index, 0]`` to obtain one series.
        pair: Sequence whose items 1 and 2 are the indices of the two series
            to draw; item 0 is not used here.
        title: Title set on the axes.
        axs: Object providing ``plot``, ``set_title``, ``legend`` and ``grid``
            (for example a matplotlib axes).
    """
    labels = ("Time Series 1", "Time Series 2")
    selected = [X[index, 0] for index in (pair[1], pair[2])]

    for series, label in zip(selected, labels):
        axs.plot(series, label=label)

    axs.set_title(title)
    axs.legend()
    axs.grid(True)


def main():
    """Run the full Euclidean-vs-DDTW comparison on the training file.

    Loads ``./data/ts_files/train.ts``, computes all pairwise distances under
    both metrics, prints summary statistics and timings, and shows a 2x2
    figure with the closest and farthest pair found by each metric.
    """
    X = load_and_validate_data('./data/ts_files/train.ts')

    def euclidean(first, second):
        # Adapt pairwise_distance to the two-argument callable that
        # calculate_distances invokes for each pair.
        return pairwise_distance(first, second, metric="euclidean")

    # Pairwise distances under each metric, with their run times.
    euclidean_distances, euclidean_time = calculate_distances(
        X, euclidean, "Euclidean Distance"
    )
    ddtw_distances, ddtw_time = calculate_distances(
        X, ddtw_distance, "Derivative DTW Distance"
    )

    # Printed statistics; the extreme pairs are kept for plotting.
    euclidean_smallest, euclidean_largest = summarize_distances(
        euclidean_distances, "Euclidean Distance"
    )
    ddtw_smallest, ddtw_largest = summarize_distances(
        ddtw_distances, "Derivative DTW Distance"
    )

    print("\nTiming Analysis:")
    print(f"Euclidean Distance took {euclidean_time:.2f} seconds.")
    print(f"Derivative DTW Distance took {ddtw_time:.2f} seconds.")

    # Top row: closest pairs; bottom row: farthest pairs.
    _fig, axs = plt.subplots(2, 2, figsize=(12, 8))
    panels = (
        (euclidean_smallest, "Smallest Euclidean Distance", axs[0, 0]),
        (ddtw_smallest, "Smallest DDTW Distance", axs[0, 1]),
        (euclidean_largest, "Largest Euclidean Distance", axs[1, 0]),
        (ddtw_largest, "Largest DDTW Distance", axs[1, 1]),
    )
    for pair, title, axis in panels:
        plot_time_series_comparison(X, pair, title, axis)

    plt.tight_layout()
    plt.show()


# Run the full pipeline when this cell executes: load the data, compute both
# sets of pairwise distances, print the summaries and show the plots.
main()

Loaded 6000 time series with shape (1, 137).
Calculating Euclidean Distance...


Euclidean Distance: 100%|██████████| 17997000/17997000 [20:18<00:00, 14773.68it/s]


Euclidean Distance completed in 1218.18 seconds.
Calculating Derivative DTW Distance...


Derivative DTW Distance:  13%|█▎        | 2251135/17997000 [16:23<1:52:56, 2323.64it/s]

### Similaridade por classes

In [None]:
from collections import Counter
from random import sample
import numpy as np
import matplotlib.pyplot as plt

def calculate_class_similarities(X, y, metric_function, metric_name,
                                 samples_per_class=5, seed=None):
    """Measure within-class distances on a random sample of each class.

    For every distinct label in ``y``, up to ``samples_per_class`` instances
    are drawn without replacement and ``metric_function`` is evaluated on
    every unordered pair of the drawn series. The median and mean distance of
    each class are printed.

    Args:
        X: Array indexed as ``X[indices, 0]`` to select series (only index 0
            of the second axis is used).
        y: Array of class labels aligned with the first axis of ``X``.
        metric_function: Callable taking two series and returning a value
            convertible with ``float``.
        metric_name: Label used in the printed report.
        samples_per_class: Maximum number of instances drawn per class.
        seed: Optional seed for reproducible sampling. When ``None`` the
            module-level ``random.sample`` is used, so a global
            ``random.seed`` set by the caller is still honoured.

    Returns:
        tuple: ``(class_samples, class_distances)``, two dicts keyed by class
        label holding the sampled indices and the list of pairwise distances.
        A class with fewer than two sampled instances gets an empty list.
    """
    # Imported locally so this function does not depend on an import made in
    # another notebook cell.
    from itertools import combinations
    from random import Random

    draw = sample if seed is None else Random(seed).sample

    class_samples = {}
    class_distances = {}

    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]

        # Never request more instances than the class contains.
        sampled_indices = draw(
            list(class_indices), min(samples_per_class, len(class_indices))
        )
        class_samples[class_label] = sampled_indices

        sampled_series = X[sampled_indices, 0]
        class_distances[class_label] = [
            float(metric_function(ts1, ts2))
            for ts1, ts2 in combinations(sampled_series, 2)
        ]

    # Summarize results
    for class_label, distances in class_distances.items():
        if distances:
            median_dist = np.median(distances)
            mean_dist = np.mean(distances)
            print(f"\nClass {class_label} ({metric_name}):")
            print(f"  Median Distance: {median_dist:.4f}")
            print(f"  Mean Distance: {mean_dist:.4f}")
        else:
            print(f"\nClass {class_label}: Insufficient data for calculation.")

    return class_samples, class_distances

def main_similarity_analysis():
    """Report within-class similarity for the training file under both metrics.

    Loads ``./data/ts_files/train.ts`` together with its labels and runs
    ``calculate_class_similarities`` first with the Euclidean distance and
    then with the derivative DTW distance.
    """
    X, y = load_from_tsfile(full_file_path_and_name='./data/ts_files/train.ts')

    print("\n=== Similarity Analysis ===")

    def euclidean(first, second):
        # Two-argument adapter around pairwise_distance with a fixed metric.
        return pairwise_distance(first, second, metric="euclidean")

    # Euclidean baseline first, then derivative DTW.
    calculate_class_similarities(X, y, euclidean, "Euclidean Distance")
    calculate_class_similarities(X, y, ddtw_distance, "Derivative DTW Distance")

# Run the per-class similarity analysis when this cell executes; it reloads
# the training file and prints the statistics for both metrics.
main_similarity_analysis()

# Trabalhos relacionados

# Algoritmos Utilizados
> DDTW \
> Baseline Euclidiana

# Metodologia

O conjunto de dados usado nesse cenário foi o Icentia11k, um banco de dados com sinais contínuos de ECG brutos, amostrados a 250Hz, abrangendo 11.000 pacientes e 2 bilhões de batimentos anotados.

Por conta dos custos computacionais, limitamos nossa análise aos primeiros 1.000 pacientes. No entanto, embora todos os pacientes tivessem as localizações dos batimentos anotadas por especialistas, a maioria não tinha rótulos de classe ou informações para outras categorias. No final, trabalhamos com 210 pacientes com batimentos anotados nas classes normal, PAC ou PVC.

Os passos de pré-processamento seguiram procedimentos semelhantes aos usados no Banco de Dados Europeu ST-T. Dividimos os pacientes em conjuntos de treinamento e teste, garantindo que cada paciente contribuísse apenas com batimentos de uma única classe. Além disso, descartamos os dois primeiros e os dois últimos batimentos de cada gravação. Após esse processo, obtivemos um conjunto com 168 pacientes para treinamento, com 2.000 batimentos por classe, e 42 pacientes para teste, com 500 batimentos por classe.

## Carregando o dataset

# Resultados e Discussões