In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tslearn.shapelets import ShapeletModel, grabocka_params_to_shapelet_size_dict
import matplotlib.pyplot as plt



In [2]:
# Helper for loading .npy files
def load_npy(filename):
    """Load the array stored at ``filename`` with ``np.load`` and return it."""
    return np.load(filename)

# Initial configuration
dir_path = 'time_series/'  # Change this to the correct path
# Common length of every series: the loading cell truncates longer series to
# this many samples and pads shorter ones with their last observation.
len_threshold = 1280

In [3]:
# Load every .npy series in `dir_path` and bring it to a common length.
# File names are parsed as "<track_id>_<genre>.npy".
X, y, ids = [], [], []

# os.listdir returns entries in arbitrary order; sorting keeps the sample order
# stable between runs so that the seeded train/test split is reproducible.
for file in sorted(os.listdir(dir_path)):
    if os.path.splitext(file)[1] != '.npy':
        continue

    split = file.split("_")
    ts = load_npy(os.path.join(dir_path, file))

    if len(ts) == 0:
        # There is no last observation to pad with; fail with a clear message
        # instead of an opaque IndexError on ts[-1].
        raise ValueError(f"Empty time series in file: {file}")

    if len(ts) > len_threshold:
        ts = ts[:len_threshold]
    else:
        pad = [ts[-1]] * (len_threshold - len(ts))  # fill with last observation
        ts = np.append(ts, pad)

    # Append to the three lists together, only once the series is ready, so
    # they always stay aligned.
    ids.append(split[0])  # track_id
    y.append(split[1][:-4])  # genre (drops the trailing ".npy")
    X.append(ts)

X, y, ids = np.array(X), np.array(y), np.array(ids)



In [4]:
# Z-normalise each time series independently (row-wise mean 0, std 1).
series_mean = np.mean(X, axis=1, keepdims=True)
series_std = np.std(X, axis=1, keepdims=True)
# A constant series has std == 0; dividing by it would fill the whole row with
# NaN/inf. Dividing by 1 instead leaves such a row as all zeros.
series_std = np.where(series_std == 0, 1.0, series_std)
X = (X - series_mean) / series_std


In [5]:
# Encode the labels as integers.
# NOTE: `y` is overwritten and the fitted encoder is not kept, so the original
# label strings can no longer be recovered from `y` after this cell.
y = LabelEncoder().fit_transform(y)

In [6]:
# Split series, labels and track ids together into train (80%) and test (20%)
# sets, stratified on the labels and with a fixed seed.
(X_train, X_test,
 y_train, y_test,
 ids_train, ids_test) = train_test_split(
    X, y, ids,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Train the shapelet model.
# Shapelet sizes and counts are derived from the training-set dimensions.
shapelet_sizes = grabocka_params_to_shapelet_size_dict(
    n_ts=X_train.shape[0],
    ts_sz=X_train.shape[1],
    n_classes=len(set(y_train)),
    l=0.1,
    r=1
)

# random_state is fixed to the same seed as the train/test split so that the
# learned shapelets can be reproduced on a fresh run.
shp_clf = ShapeletModel(n_shapelets_per_size=shapelet_sizes,
                        optimizer="sgd", weight_regularizer=.01,
                        max_iter=200, random_state=42)
shp_clf.fit(X_train, y_train)



In [8]:
# Grab the shapelets learned by the fitted model.
# NOTE(review): presumably each entry is a 2-D (length, 1) array for univariate
# input - confirm against the tslearn documentation.
selected_shapelets = shp_clf.shapelets_

In [9]:
# Helper that enumerates the sliding windows of a series
def extract_subsequences(series, shapelet_size):
    """Return every contiguous window of ``shapelet_size`` samples of ``series``.

    Windows are taken at each start offset from 0 to
    ``len(series) - shapelet_size`` and stacked into one array, one window
    per row. If the series is shorter than ``shapelet_size`` the result is
    an empty array.
    """
    n_windows = len(series) - shapelet_size + 1
    windows = []
    for start in range(n_windows):
        windows.append(series[start:start + shapelet_size])
    return np.array(windows)


In [10]:
# Helper that measures how closely each training series matches a shapelet
def find_closest_series(shapelet, X_train):
    """Compute the best-match distance between ``shapelet`` and each series.

    For every series in ``X_train`` the shapelet is slid over all contiguous
    windows of the same length, and the smallest Euclidean distance is kept.

    Parameters
    ----------
    shapelet : array-like
        The shapelet values; flattened to 1-D before use, so both ``(length,)``
        and ``(length, 1)`` inputs are accepted.
    X_train : iterable of array-like
        The series to scan; each one is flattened to 1-D.

    Returns
    -------
    (distances, indices) : tuple of np.ndarray
        ``distances[k]`` is the minimum distance for series ``indices[k]``;
        ``indices`` is simply ``0 .. len(X_train) - 1``.

    Raises
    ------
    ValueError
        If a series is shorter than the shapelet.
    """
    # Flatten first: a (length, 1) shapelet minus a (length,) window would
    # broadcast to a (length, length) matrix and yield a meaningless norm.
    shapelet = np.ravel(shapelet)
    shapelet_length = len(shapelet)
    closest_indices = []
    closest_distances = []

    for idx, series in enumerate(X_train):
        series = np.ravel(series)
        if len(series) < shapelet_length:
            raise ValueError(
                f"Series {idx} has {len(series)} samples, fewer than the "
                f"shapelet length {shapelet_length}"
            )
        # All windows at once as a (n_windows, shapelet_length) view; the
        # vectorised norm replaces a Python loop over every window.
        windows = np.lib.stride_tricks.sliding_window_view(series, shapelet_length)
        distances = np.linalg.norm(windows - shapelet, axis=1)
        closest_distances.append(distances.min())
        closest_indices.append(idx)

    return np.array(closest_distances), np.array(closest_indices)



In [11]:
# For each learned shapelet, keep the index of the training series whose
# best-matching window is closest to it.
closest_series_indices = [
    indices[np.argmin(distances)]
    for distances, indices in (
        find_closest_series(shapelet, X_train) for shapelet in selected_shapelets
    )
]


KeyboardInterrupt: 

In [None]:
# Build an encoder that can translate integer codes back to genre names.
# At this point `y` already holds integer codes (an earlier cell overwrote it),
# so fitting on `y` would only learn the identity mapping 0 -> 0, 1 -> 1, ...
# The genre strings are therefore re-read from the file names, using the same
# parsing as the loading cell ("<track_id>_<genre>.npy").
genre_names = [
    file.split("_")[1][:-4]
    for file in os.listdir(dir_path)
    if os.path.splitext(file)[1] == '.npy'
]
label_encoder = LabelEncoder()
label_encoder.fit(genre_names)

# The encoder orders its classes by sorted unique value, so it assigns the same
# codes as the encoder that produced `y`, provided the directory contents have
# not changed since loading. The class count is checked as a sanity guard.
assert len(label_encoder.classes_) == len(np.unique(y)), (
    "Genres found in dir_path do not match the encoded labels in y"
)

# `y` is already encoded; keep a copy under the name this cell has always defined.
y_encoded = np.array(y)

In [None]:
# Map each integer code to the class stored at that position in the encoder
label_mapping = dict(enumerate(label_encoder.classes_))



In [None]:
# Helper that resolves an encoded label to its genre name
def get_genre_name(encoded_label):
    """Look up ``encoded_label`` in the module-level ``label_mapping``.

    Raises ``KeyError`` if the label is not a key of the mapping.
    """
    return label_mapping[encoded_label]



In [None]:
# Shapelet -> closest training series -> genre of that series
shapelet_genres = [
    get_genre_name(y_train[idx])
    for idx in closest_series_indices
]



In [None]:
# Plot every learned shapelet, labelled with its associated music genre
plt.figure(figsize=(12, 8))

for i, shapelet in enumerate(selected_shapelets):
    # The label uses the enumeration index: `shapelet_indices` is not defined
    # in any visible cell, so referencing it raised a NameError.
    plt.plot(shapelet, label=f'Shapelet {i} - Genere {shapelet_genres[i]}')

plt.title('Shapelets e Generi Musicali Associati')
plt.xlabel('Time Index')
plt.ylabel('Value')
plt.legend()
plt.show()

# fig 2


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import seaborn as sns

# Tuned hyper-parameters for KNN
knn = KNeighborsClassifier(n_neighbors=50, weights='distance', p=2, metric='euclidean')
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_prob_knn = knn.predict_proba(X_test)

# Confusion Matrix
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_knn, annot=True, fmt="d", cmap="YlGnBu")
plt.title('Confusion Matrix - KNN')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve: class 1 against all other classes.
# The labels are binarised so the curve and the AUC use the same one-vs-rest
# target; passing multiclass labels with a 1-D score to roc_auc_score raises.
# The probability column is looked up through classes_ rather than assumed.
pos_col_knn = list(knn.classes_).index(1)
y_test_is_pos = (y_test == 1).astype(int)
fpr_knn, tpr_knn, _ = roc_curve(y_test_is_pos, y_prob_knn[:, pos_col_knn], pos_label=1)
roc_auc_knn = roc_auc_score(y_test_is_pos, y_prob_knn[:, pos_col_knn])

plt.figure(figsize=(10, 7))
plt.plot(fpr_knn, tpr_knn, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_knn)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - KNN')
plt.legend(loc="lower right")
plt.show()


# fig 3

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tuned hyper-parameters for the Decision Tree.
# random_state is fixed (same seed as the train/test split) so that the fitted
# tree, and therefore the reported metrics, can be reproduced.
dt = DecisionTreeClassifier(max_depth=6, criterion='entropy', random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
y_prob_dt = dt.predict_proba(X_test)

# Confusion Matrix
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_dt, annot=True, fmt="d", cmap="YlGnBu")
plt.title('Confusion Matrix - Decision Tree')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve: class 1 against all other classes.
# The labels are binarised so the curve and the AUC use the same one-vs-rest
# target; passing multiclass labels with a 1-D score to roc_auc_score raises.
# The probability column is looked up through classes_ rather than assumed.
pos_col_dt = list(dt.classes_).index(1)
y_test_is_pos = (y_test == 1).astype(int)
fpr_dt, tpr_dt, _ = roc_curve(y_test_is_pos, y_prob_dt[:, pos_col_dt], pos_label=1)
roc_auc_dt = roc_auc_score(y_test_is_pos, y_prob_dt[:, pos_col_dt])

plt.figure(figsize=(10, 7))
plt.plot(fpr_dt, tpr_dt, color='green', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_dt)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - Decision Tree')
plt.legend(loc="lower right")
plt.show()
