# In [None]:
# Merge risk CSVs -> meta dataset -> train small ANN -> output incident probabilities and severity
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score

# ----------------- USER CONFIG -----------------
# Local file paths (change if your files are elsewhere).
# NOTE: adjust these paths to your actual CSVs. A non-None path that does not
# exist makes safe_read() raise FileNotFoundError.
DATA_RISK_CSV = "/mnt/data/results/jm1_notebook_results/jm1_data_risk_scores.csv"
CODE_RISK_CSV = "/mnt/data/code_risk_scores.csv"   # replace with your code-risk CSV path
# If you have a single combined file already, set one path and leave the other None.

# Column names (the id column is auto-detected below when ID_COL is None)
ID_COL = None            # None => auto-detect a shared id column among 'id','Id','ID','file','module','name'
DATA_RISK_COL = "data_risk_score"
CODE_RISK_COL = "code_risk_score"
INCIDENT_LABEL_COL = "incident_label"   # name given to the incident label column of the meta dataset
# If a 'defects' column exists in the data-risk CSV (or, failing that, in the
# code-risk CSV) it is used as the incident label when INCIDENT_LABEL_COL is absent.

# All artifacts are written under OUT_DIR, which is created on import/run.
OUT_DIR = "/mnt/data/meta_out"
os.makedirs(OUT_DIR, exist_ok=True)
MODEL_OUT = os.path.join(OUT_DIR, "meta_ann.h5")            # trained Keras model
SCALER_OUT = os.path.join(OUT_DIR, "meta_scaler.pkl")       # fitted StandardScaler (joblib)
META_CSV_OUT = os.path.join(OUT_DIR, "meta_dataset.csv")    # merged dataset, before predictions
PRED_CSV_OUT = os.path.join(OUT_DIR, "meta_predictions.csv")  # merged dataset + probability + severity

# Model hyperparams
# RANDOM_SEED seeds NumPy and TensorFlow here and is also used as the
# random_state of the train/validation split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
TEST_SIZE = 0.2      # fraction of rows held out for validation
BATCH_SIZE = 32      # batch size passed to model.fit
EPOCHS = 200         # upper bound on epochs; early stopping may end training sooner
LR = 1e-3            # Adam learning rate
PATIENCE = 8         # early-stopping patience (epochs) on val_loss

# Severity thresholds for classification (you can change these):
# probability >= THR_HIGH -> "High"; THR_MED <= probability < THR_HIGH -> "Medium"; otherwise "Low".
THR_HIGH = 0.70
THR_MED = 0.40   # medium is between 0.40 and 0.70
# -------------------------------------------------

# --------- Load CSVs (with sanity checks) ----------
def safe_read(path):
    """Load a CSV file into a DataFrame, passing ``None`` straight through.

    Args:
        path: filesystem path of the CSV, or ``None`` when the source is not
            configured.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when *path* is ``None``.

    Raises:
        FileNotFoundError: if *path* is given but does not exist on disk.
    """
    if path is not None:
        if os.path.exists(path):
            return pd.read_csv(path)
        raise FileNotFoundError(f"File not found: {path}")
    return None

# Either frame may legitimately be None (its path was left unset), but the
# pipeline needs at least one source to work with.
df_data = safe_read(DATA_RISK_CSV)
df_code = safe_read(CODE_RISK_CSV)

if all(frame is None for frame in (df_data, df_code)):
    raise ValueError("At least one of DATA_RISK_CSV or CODE_RISK_CSV must be provided and exist.")

# Auto-detect id column if not set
def find_id_col(dfs):
    """Return the first well-known id column shared by every supplied frame.

    Args:
        dfs: iterable of DataFrames; ``None`` entries are ignored.

    Returns:
        The first name from the candidate list (in priority order
        'id', 'Id', 'ID', 'file', 'module', 'name') that is a column of every
        non-None frame, or ``None`` when no candidate is shared or when no
        frame was supplied at all.
    """
    candidates = ['id', 'Id', 'ID', 'file', 'module', 'name']
    frames = [df for df in dfs if df is not None]
    if not frames:
        # With nothing to inspect every candidate would match vacuously;
        # report "no id column" instead of inventing 'id'.
        return None
    shared = set.intersection(*(set(df.columns) for df in frames))
    return next((c for c in candidates if c in shared), None)

# Honour an explicitly configured ID_COL; otherwise look for a shared one.
ID_COL = ID_COL or find_id_col([df_data, df_code])
if ID_COL is None:
    # No shared id column: pair rows by their position in each file.
    print("No common id column found — merging by index position and creating 'meta_id'.")
    # Write the positional key directly instead of reset_index() + rename:
    # when a CSV already contains an 'index' column, reset_index() names the
    # new column 'level_0' and the rename would turn the CSV's own 'index'
    # column into the merge key.
    if df_data is not None:
        df_data = df_data.assign(meta_id=np.arange(len(df_data)))
    if df_code is not None:
        df_code = df_code.assign(meta_id=np.arange(len(df_code)))
    ID_COL = 'meta_id'

print("Using id column:", ID_COL)

# --- Prepare columns: ensure risk columns exist ---
def ensure_col(df, col, default=None):
    """Guarantee that *df* has a column named *col*, adding it in place if absent.

    Args:
        df: DataFrame to check, or ``None``.
        col: column name that must exist.
        default: value used to fill a newly created column; ``None`` means
            fill with NaN.

    Returns:
        ``None`` when *df* is ``None``; otherwise the same DataFrame object
        (mutated in place when the column had to be created). An existing
        column is never overwritten.
    """
    if df is None:
        return None
    if col in df.columns:
        return df
    df[col] = np.nan if default is None else default
    return df

# Make sure each loaded frame exposes its risk score under the configured
# column name (DATA_RISK_COL / CODE_RISK_COL).
#
# When the configured name is missing, a keyword search looks for a likely
# substitute. Candidates must be numeric and must not be the id column,
# otherwise the merge key or a text column could be mistaken for a score.
# The chosen column is renamed to the configured name rather than re-pointing
# the constant: if both CSVs used the same column name (e.g. 'risk_score'),
# re-pointing would make DATA_RISK_COL == CODE_RISK_COL and break the merge.
if df_data is not None:
    if DATA_RISK_COL not in df_data.columns:
        possible = [
            c for c in df_data.columns
            if c != ID_COL
            and pd.api.types.is_numeric_dtype(df_data[c])
            and ('risk' in str(c).lower() or 'data' in str(c).lower())
        ]
        if possible:
            print("Found possible data risk columns in data CSV:", possible)
            df_data = df_data.rename(columns={possible[0]: DATA_RISK_COL})
        else:
            # Nothing usable: keep the column present so later steps can impute it.
            df_data[DATA_RISK_COL] = np.nan

if df_code is not None:
    if CODE_RISK_COL not in df_code.columns:
        possible = [
            c for c in df_code.columns
            if c != ID_COL
            and pd.api.types.is_numeric_dtype(df_code[c])
            and ('risk' in str(c).lower() or 'code' in str(c).lower())
        ]
        if possible:
            print("Found possible code risk columns in code CSV:", possible)
            df_code = df_code.rename(columns={possible[0]: CODE_RISK_COL})
        else:
            # Nothing usable: keep the column present so later steps can impute it.
            df_code[CODE_RISK_COL] = np.nan

# --- Merge dataframes on ID_COL ---
# With both sources present, an outer join keeps ids that appear in only one
# of them; the missing score is imputed further down.
if df_data is None:
    meta = df_code[[ID_COL, CODE_RISK_COL]].copy()
elif df_code is None:
    meta = df_data[[ID_COL, DATA_RISK_COL]].copy()
else:
    meta = pd.merge(
        df_data[[ID_COL, DATA_RISK_COL]],
        df_code[[ID_COL, CODE_RISK_COL]],
        on=ID_COL,
        how='outer',
    )

# A single-source run leaves one risk column absent; create it as NaN so the
# column selection and imputation below work instead of raising KeyError.
for risk_col in (DATA_RISK_COL, CODE_RISK_COL):
    if risk_col not in meta.columns:
        meta[risk_col] = np.nan

# Attach ground-truth labels. Preference order: a column already named
# INCIDENT_LABEL_COL (data CSV, then code CSV), then a 'defects' column
# (data CSV, then code CSV), which is renamed to INCIDENT_LABEL_COL.
if INCIDENT_LABEL_COL not in meta.columns:
    label_sources = [
        (frame, label_name)
        for label_name in (INCIDENT_LABEL_COL, 'defects')
        for frame in (df_data, df_code)
        if frame is not None and label_name in frame.columns
    ]
    if label_sources:
        label_frame, label_name = label_sources[0]
        meta = meta.merge(label_frame[[ID_COL, label_name]], on=ID_COL, how='left')
        if label_name != INCIDENT_LABEL_COL:
            meta.rename(columns={label_name: INCIDENT_LABEL_COL}, inplace=True)

# Keep only relevant columns
cols_keep = [ID_COL, DATA_RISK_COL, CODE_RISK_COL]
if INCIDENT_LABEL_COL in meta.columns:
    cols_keep.append(INCIDENT_LABEL_COL)
meta = meta[cols_keep]

# Fill missing risk scores (a score that is unavailable is imputed as 0.0)
meta[DATA_RISK_COL] = meta[DATA_RISK_COL].fillna(0.0)
meta[CODE_RISK_COL] = meta[CODE_RISK_COL].fillna(0.0)

# Save meta dataset (before training)
meta.to_csv(META_CSV_OUT, index=False)
print("Saved meta dataset to", META_CSV_OUT)

# ----------------- Prepare training data -----------------
# Real labels are used when the label column exists and has at least one
# non-missing value; otherwise labels are synthesised from the risk scores.
synthetic_label_used = bool(
    INCIDENT_LABEL_COL not in meta.columns
    or meta[INCIDENT_LABEL_COL].isna().all()
)
if synthetic_label_used:
    print("No incident_label found — creating synthetic labels using heuristic.")
    # Heuristic: flag an incident only when both risk scores are high.
    data_risk_high = meta[DATA_RISK_COL] >= 0.7
    code_risk_high = meta[CODE_RISK_COL] >= 0.6
    meta[INCIDENT_LABEL_COL] = (data_risk_high & code_risk_high).astype(int)
else:
    # Missing labels count as "no incident"; everything is cast to int.
    meta[INCIDENT_LABEL_COL] = meta[INCIDENT_LABEL_COL].fillna(0).astype(int)

# Feature matrix (two risk scores per row) and integer target vector.
feature_cols = [DATA_RISK_COL, CODE_RISK_COL]
X = meta[feature_cols].to_numpy(dtype=float)
y = meta[INCIDENT_LABEL_COL].to_numpy(dtype=int)

# Train/val split
# Stratification needs at least two rows in every class present in y;
# train_test_split raises ValueError otherwise. Rare positives are likely
# (especially with the synthetic heuristic labels), so fall back to a plain
# random split in that case instead of aborting the run.
_, class_counts = np.unique(y, return_counts=True)
can_stratify = class_counts.size > 0 and class_counts.min() >= 2
if class_counts.size > 0 and not can_stratify:
    print("A class has fewer than 2 samples — using a non-stratified train/val split.")
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y if can_stratify else None,
    random_state=RANDOM_SEED,
)

# Scale
# Fit the scaler on the training rows only so validation statistics do not
# leak into training; the fitted scaler is persisted for later inference.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s = scaler.transform(X_val)
joblib.dump(scaler, SCALER_OUT)
print("Saved scaler to", SCALER_OUT)

# ----------------- Build & train small ANN -----------------
def build_meta_model(input_dim):
    """Build and compile the small feed-forward incident classifier.

    Architecture: Input(input_dim) -> Dense(16, relu) -> Dropout(0.1)
    -> Dense(8, relu) -> Dense(1, sigmoid).

    Args:
        input_dim: number of input features per sample.

    Returns:
        A compiled Keras ``Model`` using Adam (learning rate taken from the
        module-level ``LR``), binary cross-entropy loss and the 'AUC' metric.
    """
    features = layers.Input(shape=(input_dim,))
    hidden = layers.Dense(16, activation='relu')(features)
    hidden = layers.Dropout(0.1)(hidden)
    hidden = layers.Dense(8, activation='relu')(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    network = models.Model(features, probability)
    network.compile(
        optimizer=optimizers.Adam(learning_rate=LR),
        loss='binary_crossentropy',
        metrics=['AUC'],
    )
    return network

# One input unit per feature column of the scaled training matrix.
model = build_meta_model(X_train_s.shape[1])
# Stop once val_loss has not improved for PATIENCE epochs and roll the model
# back to the weights of the best epoch.
es = callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE, restore_best_weights=True)

# Train on the scaled training split; the scaled validation split is used
# only for monitoring / early stopping.
history = model.fit(X_train_s, y_train, validation_data=(X_val_s, y_val),
                    epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=1)

# Save model
model.save(MODEL_OUT)
print("Saved meta model to", MODEL_OUT)

# --------------- Inference on full meta dataset ----------------
# Score every row of the meta dataset (training and validation rows alike),
# reusing the scaler fitted on the training split.
X_all_s = scaler.transform(meta[feature_cols].values.astype(float))
# squeeze() drops the trailing length-1 output axis so probs lines up with meta's rows.
probs = model.predict(X_all_s).squeeze()
meta['incident_probability'] = probs

# Severity classification
def severity_from_prob(p):
    """Translate an incident probability into a severity label.

    Args:
        p: predicted incident probability.

    Returns:
        "High" when ``p >= THR_HIGH``, "Medium" when ``p >= THR_MED``,
        otherwise "Low" (which is also the result for NaN, since every
        ``>=`` comparison with NaN is false).
    """
    # Bands are checked from the highest threshold down; the first hit wins.
    for floor, label in ((THR_HIGH, "High"), (THR_MED, "Medium")):
        if p >= floor:
            return label
    return "Low"

# Label every row with the severity band of its predicted probability.
meta['incident_severity'] = meta['incident_probability'].map(severity_from_prob)

# Persist the meta dataset together with probabilities and severities.
meta.to_csv(PRED_CSV_OUT, index=False)
print("Saved predictions to", PRED_CSV_OUT)

# --------------- Evaluation (if we have real labels) ----------------
if not synthetic_label_used:
    # Full-dataset metrics. NOTE: these include the rows the model was
    # trained on, so they are optimistic; the held-out validation metrics
    # printed afterwards are the ones to trust.
    y_true_full = meta[INCIDENT_LABEL_COL].values
    y_score_full = meta['incident_probability'].values
    try:
        roc = roc_auc_score(y_true_full, y_score_full)
        prec, rec, _ = precision_recall_curve(y_true_full, y_score_full)
        pr_auc = auc(rec, prec)
        print(f"ROC-AUC: {roc:.4f}, PR-AUC: {pr_auc:.4f}")
    except ValueError as e:
        # scikit-learn raises ValueError e.g. when only one class is present.
        print("Evaluation failed:", e)

    # Held-out metrics: only the validation split, which was never fitted on.
    try:
        y_score_val = model.predict(X_val_s).ravel()
        val_roc = roc_auc_score(y_val, y_score_val)
        val_prec, val_rec, _ = precision_recall_curve(y_val, y_score_val)
        val_pr_auc = auc(val_rec, val_prec)
        print(f"Held-out validation ROC-AUC: {val_roc:.4f}, PR-AUC: {val_pr_auc:.4f}")
    except ValueError as e:
        print("Validation evaluation failed:", e)
else:
    print("Synthetic labels were used; evaluation metrics may be misleading.")

# --------------- Quick summary output ----------------
# Show the ten rows with the highest predicted incident probability.
print("\nTop 10 predicted incidents (by prob):")
summary_cols = [ID_COL, DATA_RISK_COL, CODE_RISK_COL, 'incident_probability', 'incident_severity']
ranked = meta.sort_values('incident_probability', ascending=False)
print(ranked.head(10)[summary_cols])

# End
