# In [None]:
# Load the saved Random Forest model bundle to try it out on Google Colab
from pathlib import Path
import joblib
import pandas as pd
import numpy as np
import chardet  


# Location of the serialized model bundle (presumably on the Google Drive
# mounted in Colab — confirm the mount point before running).
MODEL_PATH = Path("/content/drive/MyDrive/Colab Notebooks/models/rf_tfidf.pkl")


# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load bundles that come from a trusted source.
bundle = joblib.load(MODEL_PATH)
clf = bundle["pipeline"]                      # model object; used below via .predict / .predict_proba
LABEL_NAMES = bundle["label_names"]           # indexed by predicted class id to obtain a label name
VIOLATION_IDS = set(bundle["violation_ids"])  # stored as a set for membership tests on predicted ids


def predict_text(text: str):
    """Classify one review string with the loaded model.

    Prints a one-line summary of the prediction and returns the tuple
    ``(pred_id, pred_name, is_violation)``: the predicted class id as an
    ``int``, its entry in ``LABEL_NAMES``, and whether that id is a member
    of ``VIOLATION_IDS``.
    """
    # predict() is given a one-element list; only its first result is used.
    predictions = clf.predict([text])
    class_id = int(predictions[0])

    outcome = (class_id, LABEL_NAMES[class_id], class_id in VIOLATION_IDS)
    print(f"(pred_id={outcome[0]}, label='{outcome[1]}', is_violation={outcome[2]})")
    return outcome

# Example usage:
# print(predict_text("Alex is a friendly waiter at the store"))




def read_csv_robust(path, text_col="text"):
    """Load a CSV into a DataFrame, tolerating files that are not UTF-8.

    Decoding is attempted in this order:

    1. pandas' default encoding (UTF-8);
    2. the encoding ``chardet`` guesses from the first 50,000 bytes;
    3. ``latin1`` as a last resort.

    ``latin1`` is deliberately tried last: it maps every possible byte to a
    character, so it can never fail to decode. Tried before detection it
    would always "succeed", the detection step would never run, and files
    written in any other encoding would be silently garbled.

    Args:
        path: Location of the CSV file; it is passed both to
            ``pd.read_csv`` and to ``open``.
        text_col: Name of a column that must be present in the file.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        AssertionError: If ``text_col`` is not one of the columns.
    """
    try:
        df = pd.read_csv(path)
    except UnicodeDecodeError:
        # Sniff the encoding from a bounded sample instead of the whole file.
        with open(path, "rb") as f:
            raw = f.read(50000)
        enc = chardet.detect(raw)["encoding"]

        df = None
        if enc:
            print(f"[Info] Detected encoding={enc}, retrying...")
            try:
                df = pd.read_csv(path, encoding=enc)
            except (UnicodeError, LookupError):
                # The guess was wrong, or names a codec Python does not know;
                # fall through to the last-resort encoding below.
                df = None
        if df is None:
            df = pd.read_csv(path, encoding="latin1")

    # Raised explicitly rather than via `assert` so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if text_col not in df.columns:
        raise AssertionError(f"CSV must contain column '{text_col}'")
    return df

def predict_csv_with_stats(in_csv: str,
                           text_col: str = "text",
                           out_csv: str = "/content/drive/MyDrive/Colab Notebooks/outputs/preds_rf.csv"):
    """Classify every row of a CSV and save the predictions with per-row stats.

    The input is loaded with ``read_csv_robust``, the ``text_col`` column is
    fed to ``clf``, and a copy of the input frame is extended with:

    * ``pred_id`` / ``pred_name`` / ``is_violation`` - the predicted class,
      its name from ``LABEL_NAMES`` and whether it is in ``VIOLATION_IDS``;
    * ``confidence`` - the highest class probability, rounded to 3 places;
    * ``top2_labels`` / ``top2_probs`` - the two most probable classes and
      their probabilities as comma-separated strings, best first;
    * ``len_chars`` / ``len_words`` - length of the text in characters and in
      whitespace-separated tokens;
    * ``proba_<label>`` - one column per class with its rounded probability.

    If ``clf`` has no ``predict_proba``, a one-hot matrix built from the hard
    predictions is used in place of real probabilities.

    Args:
        in_csv: Path of the CSV to score.
        text_col: Name of the column holding the text to classify.
        out_csv: Path the result is written to; missing parent directories
            are created.

    Returns:
        The augmented ``pandas.DataFrame`` that was written to ``out_csv``.
    """
    df = read_csv_robust(in_csv, text_col=text_col)
    # Fill missing cells before casting: otherwise NaN would be turned into
    # the literal text "nan" and classified as if it were a real review.
    X = df[text_col].fillna("").astype(str)

    pred_ids = clf.predict(X)

    # Column order of the probability matrix; resolved once and reused below.
    classes_order = getattr(clf, "classes_", None)
    if classes_order is None:
        classes_order = sorted(LABEL_NAMES.keys())
    classes_order = np.asarray(classes_order)

    if hasattr(clf, "predict_proba"):
        pred_proba = clf.predict_proba(X)
    else:
        # No probabilities available: put all mass on the predicted class.
        column_of = {cls: col for col, cls in enumerate(classes_order.tolist())}
        pred_proba = np.zeros((len(X), len(classes_order)), dtype=float)
        for row, pid in enumerate(pred_ids):
            pred_proba[row, column_of[pid]] = 1.0

    pred_names = [LABEL_NAMES[int(i)] for i in pred_ids]
    is_violation = [int(i) in VIOLATION_IDS for i in pred_ids]

    top1_conf = pred_proba.max(axis=1)
    # argsort is ascending, so take the last two columns and reverse them to
    # get the most probable class first.
    top2_idx = np.flip(pred_proba.argsort(axis=1)[:, -2:], axis=1)
    top2_probs = np.take_along_axis(pred_proba, top2_idx, axis=1)
    top2_labels = [
        [LABEL_NAMES[int(classes_order[i])] for i in row] for row in top2_idx
    ]

    df_out = df.copy()
    df_out["pred_id"] = pred_ids
    df_out["pred_name"] = pred_names
    df_out["is_violation"] = is_violation
    df_out["confidence"] = np.round(top1_conf, 3)
    df_out["top2_labels"] = [", ".join(labs) for labs in top2_labels]
    df_out["top2_probs"] = [", ".join(f"{p:.3f}" for p in probs) for probs in top2_probs]
    df_out["len_chars"] = X.str.len()
    df_out["len_words"] = X.str.split().str.len()

    for col_idx, class_id in enumerate(classes_order):
        lab_name = LABEL_NAMES[int(class_id)]
        df_out[f"proba_{lab_name}"] = np.round(pred_proba[:, col_idx], 3)

    out_path = Path(out_csv)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_out.to_csv(out_path, index=False)
    print(f"[Done] Predictions + stats saved to: {out_path}")
    return df_out

# --- Example batch run ---
# df_preds = predict_csv_with_stats("/content/drive/MyDrive/Colab Notebooks/filename.csv")
# df_preds.head()
