In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Sample data
data = pd.read_csv('./PEMS07_sorted.csv')

# Create DataFrame
df = pd.DataFrame(data)

# Prepare the feature matrix (X) and target vector (y)
X = df[['from', 'to']]
y = df['cost']

# Fit multiple linear regression model
linear_reg = LinearRegression()
linear_reg.fit(X, y)

# Predict the costs using the linear regression model
y_pred = linear_reg.predict(X)

# Calculate the residuals (differences between actual and predicted values)
residuals = y - y_pred

# Set a threshold for anomaly detection (e.g., 1 standard deviation)
threshold = residuals.std()

# Mark anomalies based on the threshold
anomalies = abs(residuals) > threshold

# Add anomaly column to DataFrame
df['predicted_cost'] = y_pred
df['residuals'] = residuals
df['anomaly'] = anomalies

print(df)


     from   to    cost  predicted_cost  residuals  anomaly
0     264  396  20.539        1.483168  19.055832     True
1     684  296  19.138        1.494105  17.643895     True
2     340  696  19.063        1.391610  17.671390     True
3     663  789  17.238        1.350060  15.887940     True
4     585   36  14.353        1.574912  12.778088     True
..    ...  ...     ...             ...        ...      ...
861   627  447   0.135        1.452209  -1.317209    False
862   417  476   0.112        1.452916  -1.340916    False
863   177   99   0.080        1.574327  -1.494327    False
864   689  382   0.079        1.468596  -1.389596    False
865    31  347   0.032        1.507822  -1.475822    False

[866 rows x 6 columns]


In [4]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Assuming 'anomaly' column contains binary labels (True for anomaly, False for normal)
predicted_labels = df['anomaly'].values.astype(int)  # Convert True/False to 1/0

# Create two clusters based on predicted anomaly labels
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(predicted_labels.reshape(-1, 1))

# Calculate silhouette score
silhouette_avg = silhouette_score(predicted_labels.reshape(-1, 1), clusters)
print(f'Silhouette Score for Anomaly Detection: {silhouette_avg:.3f}')


Silhouette Score for Anomaly Detection: 1.000


  super()._check_params_vs_input(X, default_n_init=10)
