# In [None]:
import pandas as pd
import torch
import numpy as np
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- 1. Load RandomForest model ---
# NOTE: joblib.load unpickles the file, so only point RF_PATH at a trusted artifact.
RF_PATH = "/content/drive/MyDrive/rf_tfidf.pkl"
bundle = joblib.load(RF_PATH)
# The bundle is indexed by two keys: the classifier pipeline and its label names.
rf_clf, LABEL_NAMES = bundle["pipeline"], bundle["label_names"]

# --- 2. Load BERT model ---
MODEL_PATH = "/content/drive/MyDrive/debert-v8"
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier are both read from the same checkpoint directory.
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
bert_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
bert_model = bert_model.to(device)
# Switch to evaluation mode for inference.
bert_model.eval()

# --- 3. Prediction with RF ---
def predict_rf(texts):
    """Score ``texts`` with the module-level ``rf_clf`` pipeline.

    Returns whatever ``rf_clf.predict_proba`` yields for the inputs, i.e. a
    class-probability matrix of shape (N, C).
    """
    class_probabilities = rf_clf.predict_proba(texts)
    return class_probabilities

# --- 4. Prediction with BERT ---
def predict_bert(texts, batch_size=16):
    """Return BERT class probabilities for ``texts``, computed in mini-batches.

    Args:
        texts: A string or any iterable of strings (list, tuple, pandas
            Series, NumPy array). A single string is treated as one text.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        ``np.ndarray`` of shape (N, C) holding one softmax distribution per
        text. For empty input the result has shape ``(0, num_labels)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # A bare string would otherwise be sliced into character chunks below.
    if isinstance(texts, str):
        texts = [texts]
    # The tokenizer expects a plain list of str, so materialise other
    # containers once up front instead of passing their slices through.
    texts = list(texts)

    if not texts:
        # np.vstack([]) raises, so build the empty (0, C) result explicitly.
        return np.empty((0, bert_model.config.num_labels), dtype=np.float32)

    all_probs = []
    # Inference only: no autograd bookkeeping is needed for any batch.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = bert_tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=256,
            ).to(device)
            logits = bert_model(**inputs).logits
            # Softmax over the class axis, then move to host memory for NumPy.
            all_probs.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.vstack(all_probs)   # shape (N, C)

# --- 5. Ensemble prediction (soft voting) ---
def predict_ensemble(rf_probs, bert_probs, alpha=0.5):
    """Soft-vote two probability matrices and return the winning class index.

    Args:
        rf_probs: Array-like of shape (N, C) with the RandomForest scores.
        bert_probs: Array-like of shape (N, C) with the BERT scores. Columns
            must follow the same class order as ``rf_probs``; this function
            cannot verify that.
        alpha: Weight given to ``rf_probs``; ``bert_probs`` gets ``1 - alpha``.

    Returns:
        1-D integer array of length N with the arg-max column of each row of
        ``alpha * rf_probs + (1 - alpha) * bert_probs``.

    Raises:
        ValueError: If the two inputs do not have identical shapes.
    """
    # Accept nested lists as well as arrays (alpha * list would raise TypeError).
    rf_probs = np.asarray(rf_probs)
    bert_probs = np.asarray(bert_probs)

    # Without this check NumPy would silently broadcast e.g. (N, 1) against
    # (N, C) and produce meaningless votes.
    if rf_probs.shape != bert_probs.shape:
        raise ValueError(
            "rf_probs and bert_probs must have the same shape, got "
            f"{rf_probs.shape} and {bert_probs.shape}"
        )

    combined = alpha * rf_probs + (1 - alpha) * bert_probs
    return combined.argmax(axis=1)

# --- 6. Load CSV ---
# Path of the input CSV that will be labelled.
csv_path = "/content/drive/MyDrive/bad_reviews_alaska (2).csv"
try:
    # First attempt: decode the file as UTF-8.
    df = pd.read_csv(csv_path, encoding="utf-8")
except UnicodeDecodeError:
    # Fallback: latin-1 maps every byte to a character, so decoding cannot
    # fail here; non-ASCII text may come out garbled if the file actually
    # uses a different encoding.
    df = pd.read_csv(csv_path, encoding="latin-1")
print("✅ CSV loaded")

# Pick text column (adjust if needed)
text_column = "text"
if text_column not in df.columns:
    # Same exception type as df[text_column] would raise, but with a hint.
    raise KeyError(
        f"Column {text_column!r} not found in CSV; available columns: {list(df.columns)}"
    )

# --- 7. Run predictions ---
# Missing values are NaN; astype(str) alone would turn them into the literal
# text "nan" and classify that, so replace them with empty strings first.
texts = df[text_column].fillna("").astype(str).tolist()

# Each call returns a probability matrix of shape (N, C).
rf_probs = predict_rf(texts)
bert_probs = predict_bert(texts)

# Soft voting adds the two matrices element-wise, so they must line up.
if rf_probs.shape != bert_probs.shape:
    raise ValueError(
        "RF and BERT probability matrices differ in shape: "
        f"{rf_probs.shape} vs {bert_probs.shape}"
    )

# Per-model predictions are the arg-max column of each row.
rf_preds = rf_probs.argmax(axis=1)
bert_preds = bert_probs.argmax(axis=1)
ensemble_preds = predict_ensemble(rf_probs, bert_probs, alpha=0.5)

# --- 8. Add to DataFrame ---
# Store each prediction as a plain Python int, one new column per model.
df["label_rf"] = list(map(int, rf_preds))
df["label_bert"] = list(map(int, bert_preds))
df["label_ensemble"] = list(map(int, ensemble_preds))


# --- 9. Save results ---
# Write the labelled frame as UTF-8 CSV without the index column.
output_file = "/content/drive/MyDrive/labeled_results_ensemble123.csv"
df.to_csv(path_or_buf=output_file, index=False, encoding="utf-8")
print(f"✅ Results saved to {output_file}")

# --- 10. Preview ---
# Show the text next to the three label columns for the first rows.
print(df.head()[[text_column, "label_rf", "label_bert", "label_ensemble"]])
