
# Git Anomaly Baseline (Week-by-Week, with Checkpoints)

- **Scope**: Git-only features (no developer-behavior baselines).
- **Loop**: Download 1 week → Extract features → Normalize → Weak label → Train → Metrics → Clean raw → Next week.
- **Checkpoints**: Raw → Silver (features) → Gold (normalized + labels) → Model → Metrics.
- **Progress**: `tqdm` bars with `ncols=30`.
- **Outputs**: Precision, Recall, ROC-AUC, PR-AUC per week.
- **Note**: Replace the download function and `DATA_SOURCE_CONFIG` with your paths/logic.


In [None]:

# === 0. Imports & Config ===
import os, sys, json, shutil, glob, math, gc
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, average_precision_score,
    precision_recall_curve, roc_curve, auc
)
from sklearn.utils.class_weight import compute_class_weight

import joblib

# Display options
pd.set_option('display.max_columns', 100)  # show up to 100 columns when rendering DataFrames

# === Configuration (edit as needed) ===
# Root folder under which every per-week directory tree is created.
# NOTE: the directory is created as soon as this cell runs.
BASE_DIR = Path('./git_anomaly_runs').resolve()
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Where to pull git logs from (YOU will fill this in).
# This dict is handed to download_week_data() as its `cfg` argument.
DATA_SOURCE_CONFIG = {
    "type": "local_git_archives",  # e.g., 'gh_archive', 'git_logs'
    "source_path": "/replace/with/your/path",  # TODO: set after you share paths
    # Additional credentials/params as needed
}

# Time windows: NUM_WEEKS consecutive slices of WEEK_SPAN_DAYS days, starting at START_DATE.
START_DATE = datetime(2024, 1, 1)   # inclusive
NUM_WEEKS = 2                       # change this as needed
WEEK_SPAN_DAYS = 7                  # 1-week slices

# Which checkpoint folders to keep after a week has been processed.
# NOTE(review): only the "raw" flag is read by the weekly loop in this file;
# the remaining flags look informational for now -- confirm before relying on them.
CHECKPOINT_KEEP = {
    "raw": False,        # False: delete raw after processing
    "silver": True,      # Keep engineered features
    "gold": True,        # Keep normalized + labels
    "models": True,      # Keep trained models
    "metrics": True      # Keep metrics JSON/CSV
}

# Weak labeling bounds; passed to normalize_and_label() as low_q / high_q,
# which decides per feature whether a value is "out-of-band".
LOW_Q = 0.1
HIGH_Q = 0.9
LABEL_ANY = True   # True: label=1 if at least one feature is out-of-band
LABEL_K = 2        # if LABEL_ANY is False, require at least K features out-of-band

# Features to compute from Git-only signals (commit-level)
# NOTE: This pipeline expects a commit-level table; adjust parsing if needed.
GIT_FEATURE_COLUMNS = [
    "lines_added", "lines_deleted", "files_changed",
    "churn", "is_binary_commit", "avg_line_len", "ext_entropy"
]

# Model choices (baseline); each enabled model is trained and scored every week.
USE_ISOFOREST = False      # Optional unsupervised baseline
USE_LOGISTIC = True        # Weak-label supervised baseline (probabilistic output)
USE_RF = False             # Alternative baseline

RANDOM_STATE = 42          # seed forwarded to the train/test split and the models
TEST_SIZE = 0.3            # fraction of each week's rows held out for evaluation

# tqdm width
TQDM_NCOLS = 30


In [None]:

# === 1. Helpers ===

def week_ranges(start_date: datetime, num_weeks: int, span_days: int = 7):
    """Yield ``num_weeks`` back-to-back ``(start, end)`` datetime windows.

    The first window starts at ``start_date``; each window is ``span_days``
    long and the next one begins exactly where the previous one ended, so
    ``end`` is best read as an exclusive bound.
    """
    for index in range(num_weeks):
        # Built inside the loop so nothing is evaluated when num_weeks is 0.
        step = timedelta(days=span_days)
        yield start_date + index * step, start_date + (index + 1) * step

def ensure_dirs(base: Path, week_id: str):
    """Create the per-week folder tree and return it as a name -> Path mapping.

    The mapping has the key ``"root"`` (``base / week_id``) followed by one
    entry per stage sub-folder. Every folder is created if it is missing.
    """
    week_root = base / week_id
    layout = {"root": week_root}
    for stage in ("raw", "silver", "gold", "models", "metrics", "state"):
        layout[stage] = week_root / stage

    for folder in layout.values():
        folder.mkdir(parents=True, exist_ok=True)
    return layout

def save_json(path: Path, data: dict):
    """Write *data* to *path* as indented UTF-8 JSON without risking a torn file.

    The parent directory is created when missing. The payload is serialized
    before any file is opened and is then written to a temporary sibling that
    is renamed over *path*. An interrupted run or a non-serializable value
    therefore never leaves a truncated checkpoint behind; a resume would
    otherwise fail while parsing it.

    Raises:
        TypeError: if *data* contains values ``json`` cannot serialize; an
            existing file at *path* is left untouched in that case.
    """
    # Serialize first so a failure here cannot clobber the previous contents.
    payload = json.dumps(data, ensure_ascii=False, indent=2)

    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(payload)
    # Same-directory rename: the target is swapped in as a single step.
    os.replace(tmp_path, path)

def load_json(path: Path):
    """Return the parsed JSON content of *path*, or ``None`` if it does not exist."""
    if path.exists():
        return json.loads(path.read_text(encoding='utf-8'))
    return None

def cleanup_folder(path: Path):
    """Recursively delete *path* if it exists; deletion errors are ignored."""
    if not path.exists():
        return
    shutil.rmtree(path, ignore_errors=True)

def memory_cleanup():
    """Trigger a garbage-collection pass; the value returned by ``gc.collect`` is discarded."""
    gc.collect()



In [None]:

# === 2. Data Acquisition ===
# You should replace `download_week_data` with your actual downloader/reader.
# As a fallback, this function will generate synthetic commit-level data if no data is found.

def download_week_data(start_dt: datetime, end_dt: datetime, raw_dir: Path, cfg: dict, tqdm_ncols: int = 30):
    """Return the path of the week's raw commit table, synthesizing demo data if none exists.

    Args:
        start_dt: Inclusive start of the week window.
        end_dt: Exclusive end of the week window.
        raw_dir: Folder holding the raw file; created when missing.
        cfg: Data-source settings. Unused by this placeholder implementation.
        tqdm_ncols: Progress-bar width. Unused by this placeholder implementation.

    Returns:
        Path to a parquet file. If ``raw_dir`` already holds any ``*.parquet``
        file, the first one in sorted order is returned unchanged. Otherwise a
        synthetic ``commits.parquet`` is written and returned.
    """
    # TODO: Replace this with real download / parse of git logs into a CSV/Parquet
    # Expected output file: raw_dir / 'commits.parquet'
    raw_dir.mkdir(parents=True, exist_ok=True)
    out_fp = raw_dir / 'commits.parquet'

    # Example: detect if a pre-existing file is present
    existing = sorted(raw_dir.glob('*.parquet'))
    if existing:
        return existing[0]

    # Synthetic fallback for demo.
    # NOTE: the seed is fixed, so every week receives the same feature values.
    rng = np.random.RandomState(123)
    n = 1200  # approx one week of commits across repos (tune later)

    # Spread the commits evenly over [start_dt, end_dt). n + 1 linearly spaced
    # points include both endpoints; dropping the last keeps end_dt exclusive.
    ts = pd.date_range(start=start_dt, end=end_dt, periods=n + 1)[:-1]

    # Git-only commit features (no identity/behavior baselines).
    # The draw order below is kept stable so the generated values do not change.
    df = pd.DataFrame({
        "commit_id": [f"c{i:06d}" for i in range(n)],
        "ts": ts.astype(str),
        "lines_added": rng.lognormal(mean=3.0, sigma=0.8, size=n).astype(int),
        "lines_deleted": rng.lognormal(mean=2.5, sigma=0.9, size=n).astype(int),
        "files_changed": rng.randint(1, 20, size=n),
        "binary_files_changed": rng.binomial(1, 0.07, size=n),
        "avg_line_len": rng.normal(loc=45, scale=12, size=n).clip(5, 120),
        "ext_entropy": rng.beta(a=2.0, b=5.0, size=n) * 4.0  # 0~4
    })
    # churn
    df["churn"] = df["lines_added"] + df["lines_deleted"]
    # binary commit if >=1 binary file
    df["is_binary_commit"] = (df["binary_files_changed"] > 0).astype(int)

    df.to_parquet(out_fp, index=False)
    return out_fp


In [None]:

# === 3. Feature Engineering (Git-only) ===

def make_git_features(raw_fp: Path, silver_dir: Path, tqdm_ncols: int = 30):
    """Extract the Git-only feature columns from the raw commit table.

    Args:
        raw_fp: Parquet file with one row per commit.
        silver_dir: Output folder for the feature table; created when missing.
        tqdm_ncols: Progress-bar width. Currently unused.

    Returns:
        Path of the written ``features.parquet``.

    Raises:
        ValueError: if the raw table lacks any column named in
            ``GIT_FEATURE_COLUMNS``; all missing names are reported at once.
    """
    df = pd.read_parquet(raw_fp)

    # Use the configured feature list rather than a private copy, so editing
    # GIT_FEATURE_COLUMNS actually changes what is extracted.
    required = list(GIT_FEATURE_COLUMNS)
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required column(s): {missing}")

    feats = df[required].copy()
    # Add any simple derived features if needed later
    silver_dir.mkdir(parents=True, exist_ok=True)
    silver_fp = silver_dir / "features.parquet"
    feats.to_parquet(silver_fp, index=False)
    return silver_fp


In [None]:

# === 4. Normalization & Weak Labeling ===

def normalize_and_label(silver_fp: Path, gold_dir: Path, low_q=0.1, high_q=0.9,
                        label_any=True, k=2, scaler_type="minmax", tqdm_ncols: int = 30):
    """Scale the feature table and attach a weak anomaly label to every row.

    A value is "out-of-band" for its feature when it lies strictly below that
    feature's ``low_q`` quantile or strictly above its ``high_q`` quantile.
    Using per-feature quantiles rather than fixed cut-offs on the scaled value
    keeps the rule meaningful for both scalers. It also stops binary or
    near-constant features from flagging every row: a 0/1 column min-max
    scales to exactly 0 or 1, so fixed 0.1/0.9 cut-offs mark all rows and
    leave a single class to train on.

    Args:
        silver_fp: Parquet file containing only numeric feature columns.
        gold_dir: Output folder; created when missing.
        low_q: Lower quantile (0..1) defining the in-band range per feature.
        high_q: Upper quantile (0..1) defining the in-band range per feature.
        label_any: If true, one out-of-band feature is enough for label 1.
        k: Minimum number of out-of-band features when ``label_any`` is false.
        scaler_type: ``"robust"`` selects ``RobustScaler``; anything else
            selects ``MinMaxScaler``.
        tqdm_ncols: Progress-bar width. Currently unused.

    Returns:
        ``(gold_fp, cols)``: path of ``gold.parquet`` (scaled features plus a
        ``label`` column) and the list of feature column names. The fitted
        scaler is also saved as ``scaler.joblib`` in ``gold_dir``.
    """
    X = pd.read_parquet(silver_fp)
    cols = list(X.columns)

    scaler = RobustScaler() if scaler_type == "robust" else MinMaxScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=cols)

    # Weak labeling: per-feature quantile bounds, compared column by column.
    # Strict comparisons so that values tied with a bound stay in-band.
    lower = X_scaled.quantile(low_q)
    upper = X_scaled.quantile(high_q)
    out_of_band = X_scaled.lt(lower, axis="columns") | X_scaled.gt(upper, axis="columns")
    oob_count = out_of_band.sum(axis=1)

    min_hits = 1 if label_any else k
    y = (oob_count >= min_hits).astype(int)

    gold_dir.mkdir(parents=True, exist_ok=True)
    gold_fp = gold_dir / "gold.parquet"
    X_scaled.assign(label=y.values).to_parquet(gold_fp, index=False)

    # Save scaler
    joblib.dump(scaler, gold_dir / "scaler.joblib")
    return gold_fp, cols


In [None]:

# === 5. Modeling & Metrics ===

def _binary_metrics(y_true, y_pred, scores):
    """Return precision, recall, ROC-AUC and PR-AUC as plain (JSON-friendly) floats."""
    return {
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_true, scores)),
        # Average precision for every model, so the pr_auc column is comparable.
        "pr_auc": float(average_precision_score(y_true, scores)),
    }


def train_and_evaluate(gold_fp: Path, models_dir: Path, metrics_dir: Path,
                       use_isoforest=False, use_logistic=True, use_rf=False,
                       random_state=42, test_size=0.3, tqdm_ncols: int = 30):
    """Train the enabled baseline models on one week's gold table and score them.

    Args:
        gold_fp: Parquet file with feature columns plus an integer ``label``.
        models_dir: Folder receiving one ``.joblib`` file per trained model;
            created when missing.
        metrics_dir: Folder receiving ``metrics.json``.
        use_isoforest: Train/score an IsolationForest (labels are used only
            for evaluation).
        use_logistic: Train/score a class-balanced logistic regression.
        use_rf: Train/score a class-balanced random forest.
        random_state: Seed for the split and for every model.
        test_size: Fraction of rows held out for evaluation.
        tqdm_ncols: Progress-bar width. Currently unused.

    Returns:
        Dict mapping model name to a dict with ``precision``, ``recall``,
        ``roc_auc`` and ``pr_auc``. The same dict is written to
        ``metrics_dir / "metrics.json"``.
    """
    df = pd.read_parquet(gold_fp)
    X = df.drop(columns=["label"]).values
    y = df["label"].values.astype(int)

    # joblib.dump does not create missing folders.
    models_dir.mkdir(parents=True, exist_ok=True)

    # Stratified split keeps the label ratio equal in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    metrics = {}

    if use_isoforest:
        iso = IsolationForest(
            n_estimators=200, max_samples="auto", contamination=0.1, random_state=random_state
        )
        iso.fit(X_train)
        # decision_function: higher -> more normal; invert to get an anomaly score
        iso_scores = -iso.decision_function(X_test)
        # For the thresholded metrics, flag the top 10% most anomalous test rows
        thresh = np.quantile(iso_scores, 0.9)
        y_pred_iso = (iso_scores >= thresh).astype(int)
        metrics["isoforest"] = _binary_metrics(y_test, y_pred_iso, iso_scores)
        joblib.dump(iso, models_dir / "isoforest.joblib")

    if use_logistic:
        # "balanced" applies the same inverse-frequency class weights that
        # compute_class_weight("balanced", ...) would produce for y_train.
        logreg = LogisticRegression(
            max_iter=200, class_weight="balanced", random_state=random_state
        )
        logreg.fit(X_train, y_train)
        prob = logreg.predict_proba(X_test)[:, 1]
        y_pred = (prob >= 0.5).astype(int)
        metrics["logistic_regression"] = _binary_metrics(y_test, y_pred, prob)
        joblib.dump(logreg, models_dir / "logreg.joblib")

    if use_rf:
        rf = RandomForestClassifier(
            n_estimators=300, max_depth=None, n_jobs=-1, random_state=random_state,
            class_weight="balanced_subsample"
        )
        rf.fit(X_train, y_train)
        prob = rf.predict_proba(X_test)[:, 1]
        y_pred = (prob >= 0.5).astype(int)
        metrics["random_forest"] = _binary_metrics(y_test, y_pred, prob)
        joblib.dump(rf, models_dir / "random_forest.joblib")

    # Save metrics
    save_json(metrics_dir / "metrics.json", metrics)
    return metrics


In [None]:

# === 6. Orchestration: Weekly loop with checkpoints & cleanup ===

# Collects one flat row of metrics per week for the summary table below.
all_weeks_metrics = []

week_iter = tqdm(
    week_ranges(START_DATE, NUM_WEEKS, WEEK_SPAN_DAYS),
    total=NUM_WEEKS, ncols=TQDM_NCOLS, desc="Weeks"
)
for ws, we in week_iter:
    week_id = f"week_{ws.strftime('%Y%m%d')}_{we.strftime('%Y%m%d')}"
    dirs = ensure_dirs(BASE_DIR, week_id)

    # State file to allow resuming: each stage below runs only if the saved
    # stage says the previous one finished, and advances the stage when done.
    state_fp = dirs["state"] / "state.json"
    state = load_json(state_fp) or {"stage": "init", "week_id": week_id, "ws": ws.isoformat(), "we": we.isoformat()}

    try:
        # 1) Download
        if state["stage"] == "init":
            tqdm.write(f"[{week_id}] Downloading raw data...")
            raw_fp = download_week_data(ws, we, dirs["raw"], DATA_SOURCE_CONFIG, tqdm_ncols=TQDM_NCOLS)
            state["stage"] = "downloaded"
            state["raw_fp"] = str(raw_fp)
            save_json(state_fp, state)

        # 2) Feature engineering (git-only)
        if state["stage"] == "downloaded":
            tqdm.write(f"[{week_id}] Feature engineering...")
            silver_fp = make_git_features(Path(state["raw_fp"]), dirs["silver"], tqdm_ncols=TQDM_NCOLS)
            state["stage"] = "silver_done"
            state["silver_fp"] = str(silver_fp)
            save_json(state_fp, state)

        # 3) Normalize & weak label
        if state["stage"] == "silver_done":
            tqdm.write(f"[{week_id}] Normalize & weak label...")
            gold_fp, cols = normalize_and_label(
                Path(state["silver_fp"]), dirs["gold"],
                low_q=LOW_Q, high_q=HIGH_Q,
                label_any=LABEL_ANY, k=LABEL_K,
                scaler_type="minmax", tqdm_ncols=TQDM_NCOLS
            )
            state["stage"] = "gold_done"
            state["gold_fp"] = str(gold_fp)
            state["feature_cols"] = cols
            save_json(state_fp, state)

        # 4) Train & evaluate
        if state["stage"] == "gold_done":
            tqdm.write(f"[{week_id}] Train & evaluate...")
            metrics = train_and_evaluate(
                Path(state["gold_fp"]), dirs["models"], dirs["metrics"],
                use_isoforest=USE_ISOFOREST, use_logistic=USE_LOGISTIC, use_rf=USE_RF,
                random_state=RANDOM_STATE, test_size=TEST_SIZE, tqdm_ncols=TQDM_NCOLS
            )
            state["stage"] = "trained"
            state["metrics"] = metrics
            save_json(state_fp, state)

        # 5) Cleanup raw (optional)
        if state["stage"] == "trained":
            if not CHECKPOINT_KEEP.get("raw", False):
                tqdm.write(f"[{week_id}] Cleanup raw dir...")
                cleanup_folder(dirs["raw"])
            state["stage"] = "done"
            save_json(state_fp, state)
            tqdm.write(f"[{week_id}] Done.")

        # Add this week's row from the checkpointed state rather than from the
        # training step, so weeks finished in an earlier run still show up in
        # the summary when the notebook is re-executed.
        if state.get("metrics"):
            week_row = {"week_id": week_id}
            for model_name, vals in state["metrics"].items():
                for metric_name, value in vals.items():
                    week_row[f"{model_name}_{metric_name}"] = value
            all_weeks_metrics.append(week_row)

        memory_cleanup()

    except Exception as e:
        # Report which week failed, then let the error propagate; the saved
        # state lets the next run resume from the last completed stage.
        tqdm.write(f"[{week_id}] ERROR: {e}")
        raise

# Summary table for all weeks
summary_df = pd.DataFrame(all_weeks_metrics)
display(summary_df)


In [None]:

# === 7. Print last-week metrics (Precision, Recall, ROC-AUC, PR-AUC) ===

def flatten_metrics(metrics_dict):
    """Turn ``{model_name: {metric: value}}`` into a DataFrame with one row per model.

    Each row carries the model's name in a ``model`` column followed by its
    metric values as further columns.
    """
    return pd.DataFrame(
        [{"model": model_name, **vals} for model_name, vals in metrics_dict.items()]
    )

# Show the metrics of the most recently processed week. `state` only exists if
# the weekly loop has run in this session (at top level, locals() is the
# session namespace), and its "metrics" entry is set once training finished.
if 'state' in locals() and state.get("metrics"):
    dfm = flatten_metrics(state["metrics"])
    print("Last processed week metrics:")
    display(dfm)
else:
    print("No metrics found yet.")



## 🔧 What you should edit
- `DATA_SOURCE_CONFIG["source_path"]` to your Git archive or log root.
- Implement your own `download_week_data()` to actually read/parse weekly data into **`raw/commits.parquet`**.
- Adjust `NUM_WEEKS`, `CHECKPOINT_KEEP`, and model flags (`USE_LOGISTIC`, `USE_ISOFOREST`, `USE_RF`) as needed.
- The weak labeling thresholds are controlled by `LOW_Q` / `HIGH_Q` and `LABEL_ANY` / `LABEL_K`.

## ✅ Expected file structure per week
```
git_anomaly_runs/
  week_YYYYMMDD_YYYYMMDD/
    raw/        # (deleted if CHECKPOINT_KEEP['raw']=False)
    silver/     # features.parquet
    gold/       # gold.parquet + scaler.joblib
    models/     # model files
    metrics/    # metrics.json
    state/      # state.json
```
