# SCC5977 - Aprendizado de Máquina para Séries Temporais (2024)

## Grupo
> André Guarnier De Mitri - 11395579
> 
> 

## Problema


# VERSÃO 1.0 SEM PROCESSAMENTO PARALELO

In [1]:
# Import necessary libraries
from aeon.distances import pairwise_distance, ddtw_distance
import numpy as np
from aeon.datasets import load_from_tsfile
from itertools import combinations
import matplotlib.pyplot as plt
from tqdm import tqdm
import time


def load_and_validate_data(file_path):
    """Read a ``.ts`` file and return its time series collection.

    Args:
        file_path: Path handed to ``load_from_tsfile``.

    Returns:
        The first element returned by ``load_from_tsfile`` (the series).
        The second element (the labels) is discarded.

    Note:
        Despite the name, no checks are performed on the data; the function
        only reports the number of series and the per-series shape.
    """
    X, _labels = load_from_tsfile(full_file_path_and_name=file_path)
    series_count = X.shape[0]
    series_shape = X.shape[1:]
    print(f"Loaded {series_count} time series with shape {series_shape}.")
    return X


def calculate_distances(X, metric_function, metric_name):
    """Compute the distance between every unordered pair of series, serially.

    Args:
        X: Array indexed as ``X[series, 0]``; only the first entry along the
            second axis of each series is compared.
        metric_function: Callable taking two series and returning a distance.
        metric_name: Label used in the console messages and the progress bar.

    Returns:
        A tuple ``(distances, elapsed_time)`` where ``distances`` is a list of
        ``(distance, i, j)`` entries with ``i < j`` and ``elapsed_time`` is the
        wall-clock duration in seconds.
    """
    num_ts = X.shape[0]
    # n choose 2: each unordered pair is visited exactly once.
    expected_pairs = (num_ts * (num_ts - 1)) // 2

    print(f"Calculating {metric_name}...")
    pair_distances = []
    started_at = time.time()

    with tqdm(total=expected_pairs, desc=metric_name) as progress:
        first_channel = X[:, 0]
        for i in range(num_ts):
            reference = first_channel[i]
            # Start at i + 1 so (i, j) and (j, i) are never both computed.
            for j in range(i + 1, num_ts):
                value = metric_function(reference, first_channel[j])
                pair_distances.append((value, i, j))
                progress.update(1)

    elapsed_time = time.time() - started_at
    print(f"{metric_name} completed in {elapsed_time:.2f} seconds.")
    return pair_distances, elapsed_time

def summarize_distances(distances, distance_type):
    """Print the median, smallest and largest distance and return the extremes.

    Args:
        distances: Non-empty sequence of ``(distance, i, j)`` entries. Each
            ``distance`` may be a plain number or a size-1 array (for example
            a 1x1 pairwise matrix); it is unwrapped to a float.
        distance_type: Label printed in the report header.

    Returns:
        A tuple ``(smallest, largest)``, each of the form ``(distance, i, j)``
        with ``distance`` as a plain ``float``. On ties the first entry in
        ``distances`` wins.

    Raises:
        ValueError: If ``distances`` is empty, or if a distance holds more
            than one value.
    """
    if len(distances) == 0:
        raise ValueError(f"No distances to summarize for {distance_type}.")

    # ``.item()`` turns scalars and size-1 arrays into Python floats; without
    # it, an array-valued distance makes the ``:.4f`` formatting below fail.
    values = np.array(
        [np.asarray(entry[0]).item() for entry in distances], dtype=float
    )

    def _entry_at(position):
        """Return the entry at ``position`` with its distance as a float."""
        entry = distances[position]
        return (float(values[position]), entry[1], entry[2])

    # argmin/argmax return the first occurrence, matching min()/max() on ties.
    smallest = _entry_at(int(np.argmin(values)))
    largest = _entry_at(int(np.argmax(values)))
    median_dist = float(np.median(values))

    print(f"\n{distance_type} Analysis:")
    print(f"Median Distance: {median_dist:.4f}")
    print(f"Smallest Distance: {smallest[0]:.4f} (between series {smallest[1]} and {smallest[2]})")
    print(f"Largest Distance: {largest[0]:.4f} (between series {largest[1]} and {largest[2]})")

    return smallest, largest


def plot_time_series_comparison(X, pair, title, axs):
    """Draw the two series referenced by ``pair`` on a single axes.

    Args:
        X: Array indexed as ``X[series, 0]``.
        pair: Sequence whose items 1 and 2 are the indices of the two series
            to draw (item 0 is not used here).
        title: Title set on the axes.
        axs: Object providing ``plot``, ``set_title``, ``legend`` and ``grid``.
    """
    # Resolve both series before drawing so a bad index fails before any plot.
    selected = (X[pair[1], 0], X[pair[2], 0])
    legend_labels = ("Time Series 1", "Time Series 2")

    for values, legend_label in zip(selected, legend_labels):
        axs.plot(values, label=legend_label)

    axs.set_title(title)
    axs.legend()
    axs.grid(True)


def main():
    """Run the serial pipeline: load, compute both distances, report, plot.

    Loads the training series, computes all pairwise Euclidean and Derivative
    DTW distances, prints summary statistics and timings, and shows a 2x2
    figure with the closest and farthest pair under each metric.
    """
    X = load_and_validate_data('./data/ts_files/train.ts')

    def euclidean_metric(x, y):
        """Euclidean distance between two series via ``pairwise_distance``."""
        return pairwise_distance(x, y, metric="euclidean")

    # Pairwise distances under each metric, with wall-clock timings.
    euclidean_distances, euclidean_time = calculate_distances(
        X, euclidean_metric, "Euclidean Distance"
    )
    ddtw_distances, ddtw_time = calculate_distances(
        X, ddtw_distance, "Derivative DTW Distance"
    )

    # Summary statistics; the extremes are reused for the plots below.
    euclidean_smallest, euclidean_largest = summarize_distances(
        euclidean_distances, "Euclidean Distance"
    )
    ddtw_smallest, ddtw_largest = summarize_distances(
        ddtw_distances, "Derivative DTW Distance"
    )

    print("\nTiming Analysis:")
    print(f"Euclidean Distance took {euclidean_time:.2f} seconds.")
    print(f"Derivative DTW Distance took {ddtw_time:.2f} seconds.")

    # Top row: closest pairs; bottom row: farthest pairs.
    _, axs = plt.subplots(2, 2, figsize=(12, 8))
    panels = (
        (euclidean_smallest, "Smallest Euclidean Distance", axs[0, 0]),
        (ddtw_smallest, "Smallest DDTW Distance", axs[0, 1]),
        (euclidean_largest, "Largest Euclidean Distance", axs[1, 0]),
        (ddtw_largest, "Largest DDTW Distance", axs[1, 1]),
    )
    for pair, title, panel in panels:
        plot_time_series_comparison(X, pair, title, panel)

    plt.tight_layout()
    plt.show()


# Run the pipeline only when this file is executed directly (as a script or
# notebook cell), not when it is imported from another module.
if __name__ == "__main__":
    main()

Loaded 6000 time series with shape (1, 137).
Calculating Euclidean Distance...


Euclidean Distance:  14%|█▍        | 2576196/17997000 [03:07<18:41, 13744.51it/s]


KeyboardInterrupt: 

# TENTANDO PARALELIZAR

In [None]:
from aeon.distances import pairwise_distance, ddtw_distance
import numpy as np
from aeon.datasets import load_from_tsfile
from itertools import combinations
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from concurrent.futures import ProcessPoolExecutor


def load_and_validate_data(file_path):
    """Load the time series stored in a ``.ts`` file.

    Args:
        file_path: Location of the file, forwarded to ``load_from_tsfile``.

    Returns:
        The series collection returned by ``load_from_tsfile``; the labels
        returned alongside it are ignored.

    Note:
        No validation is carried out beyond printing the number of series and
        the shape of each one.
    """
    X, _ = load_from_tsfile(full_file_path_and_name=file_path)
    total, per_series_shape = X.shape[0], X.shape[1:]
    print(f"Loaded {total} time series with shape {per_series_shape}.")
    return X


def calculate_pairwise_distance(i, ts1, j, ts2, metric_function):
    """Evaluate ``metric_function`` on one pair and tag the result with its indices.

    Args:
        i: Index of the first series.
        ts1: First series.
        j: Index of the second series.
        ts2: Second series.
        metric_function: Callable invoked as ``metric_function(ts1, ts2)``.

    Returns:
        The tuple ``(distance, i, j)``.
    """
    return metric_function(ts1, ts2), i, j


# Per-process inputs, filled in once by _init_distance_worker so that each
# task only has to carry a row index instead of the series themselves.
_WORKER_SERIES = None
_WORKER_METRIC = None


def _init_distance_worker(series, metric_function):
    """Store the series and the metric in this worker process."""
    global _WORKER_SERIES, _WORKER_METRIC
    _WORKER_SERIES = series
    _WORKER_METRIC = metric_function


def _distances_from_row(i):
    """Return ``(distance, i, j)`` for every ``j > i`` using the stored inputs."""
    reference = _WORKER_SERIES[i]
    return [
        (_WORKER_METRIC(reference, _WORKER_SERIES[j]), i, j)
        for j in range(i + 1, len(_WORKER_SERIES))
    ]


def calculate_distances(X, metric_function, metric_name):
    """Compute the distance between every unordered pair of series in parallel.

    The work is split by row: task ``i`` computes the distances from series
    ``i`` to every later series. The series and the metric are handed to each
    worker once through the pool initializer, so the data is not re-sent for
    every pair and no per-pair task list has to be built up front.

    Args:
        X: Array indexed as ``X[series, 0]``; only the first entry along the
            second axis of each series is compared.
        metric_function: Callable taking two series and returning a distance.
            It has to be transferable to the worker processes, so prefer a
            module-level function or ``functools.partial`` over a lambda.
        metric_name: Label used in the console messages and the progress bar.

    Returns:
        A tuple ``(results, elapsed_time)`` where ``results`` is a list of
        ``(distance, i, j)`` entries with ``i < j``, ordered by ``i`` and then
        ``j``, and ``elapsed_time`` is the wall-clock duration in seconds.

    Note:
        NOTE(review): with the "spawn" start method the workers must be able
        to import the helper functions above; confirm this when running from
        an interactive notebook.
    """
    num_ts = X.shape[0]
    total_combinations = (num_ts * (num_ts - 1)) // 2  # Number of combinations

    print(f"Calculating {metric_name}...")

    # perf_counter is monotonic, so the interval is immune to clock changes.
    start_time = time.perf_counter()

    series = X[:, 0]
    results = []
    with tqdm(total=total_combinations, desc=metric_name) as pbar:
        # With fewer than two series there is nothing to compare, so no
        # worker pool is started.
        if num_ts >= 2:
            with ProcessPoolExecutor(
                initializer=_init_distance_worker,
                initargs=(series, metric_function),
            ) as executor:
                # map() yields rows in submission order, which keeps the
                # results in the same (i, j) order as a serial nested loop.
                # The last series has no later partner, hence num_ts - 1.
                for row in executor.map(_distances_from_row, range(num_ts - 1)):
                    results.extend(row)
                    pbar.update(len(row))

    elapsed_time = time.perf_counter() - start_time
    print(f"{metric_name} completed in {elapsed_time:.2f} seconds.")
    return results, elapsed_time


def summarize_distances(distances, distance_type):
    """Print the median, smallest and largest distance and return the extremes.

    Args:
        distances: Non-empty sequence of ``(distance, i, j)`` entries. Each
            ``distance`` may be a plain number or a size-1 array (for example
            a 1x1 pairwise matrix); it is unwrapped to a float.
        distance_type: Label printed in the report header.

    Returns:
        A tuple ``(smallest, largest)``, each of the form ``(distance, i, j)``
        with ``distance`` as a plain ``float``. On ties the first entry in
        ``distances`` wins.

    Raises:
        ValueError: If ``distances`` is empty, or if a distance holds more
            than one value.
    """
    if len(distances) == 0:
        raise ValueError(f"No distances to summarize for {distance_type}.")

    # ``.item()`` turns scalars and size-1 arrays into Python floats; without
    # it, an array-valued distance makes the ``:.4f`` formatting below fail.
    values = np.array(
        [np.asarray(entry[0]).item() for entry in distances], dtype=float
    )

    def _entry_at(position):
        """Return the entry at ``position`` with its distance as a float."""
        entry = distances[position]
        return (float(values[position]), entry[1], entry[2])

    # argmin/argmax return the first occurrence, matching min()/max() on ties.
    smallest = _entry_at(int(np.argmin(values)))
    largest = _entry_at(int(np.argmax(values)))
    median_dist = float(np.median(values))

    print(f"\n{distance_type} Analysis:")
    print(f"Median Distance: {median_dist:.4f}")
    print(f"Smallest Distance: {smallest[0]:.4f} (between series {smallest[1]} and {smallest[2]})")
    print(f"Largest Distance: {largest[0]:.4f} (between series {largest[1]} and {largest[2]})")

    return smallest, largest


def plot_time_series_comparison(X, pair, title, axs):
    """Overlay the two series named by ``pair`` on the given axes.

    Args:
        X: Array indexed as ``X[series, 0]``.
        pair: Sequence whose items 1 and 2 hold the indices of the series to
            draw; item 0 is ignored by this function.
        title: Text passed to ``axs.set_title``.
        axs: Object providing ``plot``, ``set_title``, ``legend`` and ``grid``.
    """
    index_a = pair[1]
    index_b = pair[2]
    # Both lookups happen before any drawing, so an invalid index leaves the
    # axes untouched.
    curve_a = X[index_a, 0]
    curve_b = X[index_b, 0]

    axs.plot(curve_a, label="Time Series 1")
    axs.plot(curve_b, label="Time Series 2")

    # Cosmetic settings, applied after both curves are on the axes.
    axs.set_title(title)
    axs.legend()
    axs.grid(True)


def main():
    """Run the parallel pipeline: load, compute both distances, report, plot.

    Loads the training series, computes all pairwise Euclidean and Derivative
    DTW distances, prints summary statistics and timings, and shows a 2x2
    figure with the closest and farthest pair under each metric.
    """
    # Imported locally so this function carries its own dependency.
    from functools import partial

    # Load data
    file_path = './data/ts_files/train.ts'
    X = load_and_validate_data(file_path)

    # The metric is sent to worker processes, which requires pickling it.
    # A lambda cannot be pickled; a partial over an importable function can.
    euclidean_metric = partial(pairwise_distance, metric="euclidean")

    # Calculate distances
    euclidean_distances, euclidean_time = calculate_distances(
        X, euclidean_metric, "Euclidean Distance"
    )
    ddtw_distances, ddtw_time = calculate_distances(X, ddtw_distance, "Derivative DTW Distance")

    # Summarize distances
    euclidean_smallest, euclidean_largest = summarize_distances(euclidean_distances, "Euclidean Distance")
    ddtw_smallest, ddtw_largest = summarize_distances(ddtw_distances, "Derivative DTW Distance")

    # Print time analysis
    print("\nTiming Analysis:")
    print(f"Euclidean Distance took {euclidean_time:.2f} seconds.")
    print(f"Derivative DTW Distance took {ddtw_time:.2f} seconds.")

    # Plot the time series comparisons: closest pairs on the top row,
    # farthest pairs on the bottom row.
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))

    plot_time_series_comparison(X, euclidean_smallest, "Smallest Euclidean Distance", axs[0, 0])
    plot_time_series_comparison(X, ddtw_smallest, "Smallest DDTW Distance", axs[0, 1])
    plot_time_series_comparison(X, euclidean_largest, "Largest Euclidean Distance", axs[1, 0])
    plot_time_series_comparison(X, ddtw_largest, "Largest DDTW Distance", axs[1, 1])

    plt.tight_layout()
    plt.show()


# Run the pipeline only when this file is executed directly. Worker processes
# started with the "spawn" method re-import the main module, and without this
# guard each of them would call main() again.
if __name__ == "__main__":
    main()

Loaded 6000 time series with shape (1, 137).
Calculating Euclidean Distance...


Euclidean Distance:   0%|          | 0/17997000 [00:00<?, ?it/s]

# Trabalhos relacionados

# Algoritmos Utilizados

# Metodologia

## Carregando o dataset

# Resultados e Discussões