# Unsupervised Anomaly Detection + PCA
This notebook demonstrates the pipeline: load data, standardize, apply PCA for visualization, train IsolationForest and One-Class SVM, and compare anomaly scores.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the transactions table from the CSV (path is relative to the notebook's
# working directory) and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/transactions.csv')
df.head(n=5)

In [None]:
# Separate the feature matrix from the label column. The label is not used for
# fitting anything in this cell; it is only kept for later evaluation.
# NOTE(review): assumes every remaining column is numeric — confirm against the CSV.
y = df['is_synthetic_anomaly'].values
X = df.drop('is_synthetic_anomaly', axis=1).values

# Standardize the features, keeping the fitted scaler around for reuse.
scaler = StandardScaler()
scaler.fit(X)
Xs = scaler.transform(X)

# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
pca.fit(Xs)
X_pca = pca.transform(Xs)

# Scatter of every sample in the 2D principal-component plane.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(6, 5))
plt.scatter(pc1, pc2, s=8, alpha=0.6)
plt.title('PCA 2D scatter - all points')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

In [None]:
# Fit an Isolation Forest on the scaled feature matrix. random_state is fixed so
# that re-running the cell reproduces the same trees and therefore the same scores.
iso = IsolationForest(n_estimators=200, contamination=0.02, random_state=42).fit(Xs)

# score_samples() returns lower values for more abnormal samples; negate it so
# that a HIGHER value in iso_scores means a MORE anomalous point.
iso_scores = -iso.score_samples(Xs)

# Histogram of the anomaly scores. Axis labels are set explicitly so the figure
# can be read on its own, including which direction of the score is "anomalous".
plt.figure(figsize=(6,4))
plt.hist(iso_scores, bins=80)
plt.title('IsolationForest anomaly score distribution')
plt.xlabel('Anomaly score (higher = more anomalous)')
plt.ylabel('Number of samples')
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Derive the cut-off percentile from the model's own contamination setting so the
# fraction of flagged points always matches what the IsolationForest was told to
# expect. A separately hard-coded percentile can silently disagree with it.
# NOTE: requires a numeric contamination (not 'auto') on `iso`.
flag_percentile = 100.0 * (1.0 - iso.contamination)
threshold = np.percentile(iso_scores, flag_percentile)

# iso_scores is oriented so that larger values are more anomalous, hence `>=`.
preds = (iso_scores >= threshold).astype(int)

# Compare the flags with the synthetic labels.
# NOTE(review): average='binary' treats label 1 as the positive class — assumes
# `y` is 0/1 with 1 = anomaly; confirm against the dataset.
# zero_division=0 makes undefined metrics (no positives) come back as 0.
precision, recall, f1, _ = precision_recall_fscore_support(
    y, preds, average='binary', zero_division=0
)
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}')

## Save models and visuals
You can run `src/train_and_score.py` to reproduce the models and save visuals to `outputs/`.