# In [2]:
import torch
import numpy as np
import pandas as pd
from chronos import ChronosPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from datasets import load_dataset

# 1. Load and prepare the NAB dataset
# NOTE(review): the output recorded with this cell is a DatasetNotFoundError
# for 'merve/NAB' -- confirm the repo id / config name (or access rights)
# before relying on anything below.
ds = load_dataset("merve/NAB", "realKnownCause", split="train")
# Ask the dataset to return pandas objects when indexed.
ds.set_format("pandas")
# Keep only the first 1000 rows.
# NOTE(review): this assumes those rows all belong to the first time series;
# that depends on the dataset's row layout -- TODO confirm.
df = ds[0:1000]  # Let's use the first 1000 points of the first time series

# Convert to numpy arrays
# 'value' holds the observations; 'label' is later compared against 1 to mark
# true anomalies (presumably a 0/1 flag per point -- verify against the data).
time_series = df['value'].to_numpy()
true_anomalies = df['label'].to_numpy()

# 2. Load Chronos model
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    target_device = "cuda"
else:
    target_device = "cpu"

# Fetch the "amazon/chronos-t5-small" checkpoint with float32 weights.
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-small",
    device_map=target_device,
    torch_dtype=torch.float32,
)

# 3. Function to create sliding windows
def create_windows(data, window_size, stride=1):
    """Split ``data`` into overlapping windows along its first axis.

    Args:
        data: Sequence or array to slice; windows are taken along axis 0.
        window_size: Number of consecutive points in each window (>= 1).
        stride: Step between the start indices of consecutive windows (>= 1).

    Returns:
        A numpy array whose first axis indexes the windows and whose second
        axis indexes positions inside a window. When ``data`` is shorter than
        ``window_size`` the result is empty but still shaped
        ``(0, window_size, ...)`` so downstream code can rely on its rank.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    data = np.asarray(data)
    # Start index of every window that fits entirely inside the data.
    starts = range(0, len(data) - window_size + 1, stride)
    if not starts:
        # np.array([]) would collapse to shape (0,); keep the window axis.
        return np.empty((0, window_size) + data.shape[1:], dtype=data.dtype)
    return np.array([data[start:start + window_size] for start in starts])

# 4. Create windows and extract embeddings
window_size = 100
stride = 1
windows = create_windows(time_series, window_size, stride)

# Embed the windows batch by batch and collect one pooled vector per window.
batch_size = 32
embeddings_list = []

for batch_start in range(0, len(windows), batch_size):
    batch_end = batch_start + batch_size
    window_batch = torch.tensor(windows[batch_start:batch_end])
    # embed() returns a pair; only the first element (the embeddings) is used.
    token_embeddings, _ = pipeline.embed(window_batch)
    # Collapse dim 1 by averaging so each window becomes a single vector.
    pooled = token_embeddings.mean(dim=1)
    embeddings_list.append(pooled.cpu().numpy())

# Stack the per-batch results back into one array, in window order.
embeddings = np.concatenate(embeddings_list, axis=0)

# 5. Train Isolation Forest for anomaly detection
# contamination=0.1 sets the expected proportion of outlying windows;
# random_state pins the forest so reruns give the same labels.
clf = IsolationForest(contamination=0.1, random_state=42)
# One label per window: 1 for normal windows, -1 for anomalous ones.
predictions = clf.fit_predict(embeddings)

# Project the window labels onto the series as binary flags
# (1: predicted anomaly, 0: normal point).
predicted_anomalies = np.zeros(len(time_series))
anomalous_windows = np.where(predictions == -1)[0]
# Window k starts at k * stride, so its centre point is
# k * stride + window_size // 2; ignoring the stride is only correct when
# stride == 1.
centers = anomalous_windows * stride + window_size // 2
# Guard against centres that would fall past the end of the series.
centers = centers[centers < len(predicted_anomalies)]
predicted_anomalies[centers] = 1

# 6. Visualize results
plt.figure(figsize=(15, 10))

# Top panel: the raw series on its own.
plt.subplot(3, 1, 1)
plt.plot(time_series, label='Time Series')
plt.title('Original Time Series')
plt.legend()

# Middle and bottom panels: the same series with the labelled and the
# predicted anomalies overlaid as coloured markers.
anomaly_panels = (
    (2, true_anomalies, 'red', 'True Anomalies'),
    (3, predicted_anomalies, 'green', 'Predicted Anomalies'),
)
for panel_index, flags, marker_color, caption in anomaly_panels:
    plt.subplot(3, 1, panel_index)
    plt.plot(time_series, 'b-', label='Time Series')
    plt.scatter(
        np.where(flags)[0],
        time_series[flags == 1],
        color=marker_color,
        label=caption,
    )
    plt.title(caption)
    plt.legend()

plt.tight_layout()
plt.show()

# 7. Calculate metrics
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare point by point: the labels are used as-is and the predictions are
# cut to the same length.
aligned_true = true_anomalies
aligned_predictions = predicted_anomalies[:len(aligned_true)]

print("\nPerformance Metrics:")
precision = precision_score(aligned_true, aligned_predictions)
print(f"Precision: {precision:.3f}")
recall = recall_score(aligned_true, aligned_predictions)
print(f"Recall: {recall:.3f}")
f1 = f1_score(aligned_true, aligned_predictions)
print(f"F1-Score: {f1:.3f}")

# Recorded cell output:
# DatasetNotFoundError: Dataset 'merve/NAB' doesn't exist on the Hub or cannot be accessed.