In [None]:
# train_rf.py  (labels: 0=Valid, 1=Spam/Ads/Irrelevant, 2=LowQuality, 3=RantWithoutVisit)
# Trains our random forest model on TF-IDF text features in Google Colab.


from google.colab import drive
drive.mount('/content/drive')
import time
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
import joblib


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# CSV with one review per row; read from the mounted Google Drive.
DATA_PATH  = "/content/drive/MyDrive/Colab Notebooks/masterList1.csv"
# Column holding the review text. The script referenced TEXT_COL without ever
# defining it, which raised NameError right after the CSV was read.
# NOTE(review): "text" is an assumed column name -- confirm against the CSV header.
TEXT_COL   = "text"
# Column holding the class label (integer id or class-name string).
LABEL_COL  = "label"
# Fraction of rows held out for validation.
TEST_SIZE  = 0.2
# Seed shared by the train/validation split and the random forest.
RANDOM_STATE = 2022


# Display names used when the label column holds exactly the ids 0-3.
DEFAULT_LABEL_NAMES = {
    0: "Valid",
    1: "SpamAdsIrrelevant",
    2: "LowQuality",
    3: "RantWithoutVisit",
}
# Label ids that count as policy violations (every class except 0 = Valid);
# stored next to the model when it is saved.
VIOLATION_IDS = {1, 2, 3}

# Keyword arguments forwarded verbatim to TfidfVectorizer.
TFIDF_KW = dict(
    max_features=20000,
    ngram_range=(1, 2),
    min_df=5,
    strip_accents="unicode",
    sublinear_tf=True,
)

# Destination of the serialized pipeline bundle.
MODEL_PATH = Path("/content/drive/MyDrive/Colab Notebooks/models/rf_tfidf.pkl")


# Load the labelled reviews and keep only rows that have both text and label.
df = pd.read_csv(DATA_PATH)

# Validate with an explicit exception rather than `assert`: assertions are
# stripped when Python runs with -O, and this message names the missing columns.
missing_cols = [col for col in (TEXT_COL, LABEL_COL) if col not in df.columns]
if missing_cols:
    raise ValueError(
        f"CSV must contain '{TEXT_COL}' and '{LABEL_COL}'; missing: {missing_cols}"
    )

df = df.dropna(subset=[TEXT_COL, LABEL_COL]).copy()
if df.empty:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError(
        f"No rows left in {DATA_PATH} after dropping missing "
        f"'{TEXT_COL}'/'{LABEL_COL}' values"
    )

# Feature input: the raw review text, forced to str for the vectorizer.
X = df[TEXT_COL].astype(str)


# Normalise the label column to integer ids and build an id -> display-name map.
# Labels are first read as stripped strings, then parsed as numbers where possible.
raw_y = df[LABEL_COL].astype(str).str.strip()
y_num = pd.to_numeric(raw_y, errors="coerce")

if y_num.notna().all():
    # Every label parsed as a number: use the values directly as class ids.
    y = y_num.astype(int)
    uniq = sorted(np.unique(y))
    if set(uniq) == set(DEFAULT_LABEL_NAMES):
        LABEL_NAMES = DEFAULT_LABEL_NAMES
    else:
        # Unknown id set: fall back to the id itself as its display name.
        LABEL_NAMES = {label_id: str(label_id) for label_id in uniq}
else:
    # At least one label is not numeric: treat all labels as class names and
    # assign ids by sorted name order.
    classes_sorted = sorted(raw_y.unique())
    name2id = dict(zip(classes_sorted, range(len(classes_sorted))))
    y = raw_y.map(name2id).astype(int)
    LABEL_NAMES = dict(enumerate(classes_sorted))

label_counts = pd.Series(y).value_counts().sort_index()
print("Label counts:\n", label_counts)


# Hold out a validation set, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)


# Text -> TF-IDF features -> random forest, wrapped in one pipeline so the
# fitted vectorizer travels with the classifier.
vectorizer = TfidfVectorizer(**TFIDF_KW)
forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
clf = Pipeline(steps=[("tfidf", vectorizer), ("rf", forest)])


# Fit on the training split and record the wall-clock time taken.
fit_start = time.time()
clf.fit(X_train, y_train)
train_secs = time.time() - fit_start


# Score the fitted pipeline on the held-out validation split.
y_pred = clf.predict(X_val)

# Fix the class order explicitly. Without `labels=`, classification_report
# infers the classes from y_val/y_pred, so a class absent from the validation
# split would no longer line up with `target_names`. Passing the same ids that
# the confusion matrix uses keeps every metric on one label set.
label_ids = sorted(LABEL_NAMES)
target_names = [LABEL_NAMES[i] for i in label_ids]

print("\n=== Validation: TF-IDF + RandomForest ===")
print(f"Train time: {train_secs:.2f}s")
print(classification_report(
    y_val, y_pred, labels=label_ids, target_names=target_names, digits=3
))
# Same label set as the report so this matches its "macro avg" row.
print("Macro F1:", f1_score(y_val, y_pred, labels=label_ids, average="macro"))


# Confusion matrix over every known class id (rows = true, columns = predicted).
cm = confusion_matrix(y_val, y_pred, labels=sorted(LABEL_NAMES))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=target_names, yticklabels=target_names)
# The title previously contained a mis-decoded dash ("â€”"); restored to an em dash.
plt.title("Confusion Matrix — RF")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()


# Persist the fitted pipeline with the metadata needed to interpret its
# predictions (id -> name map and the set of violation class ids).
model_dir = MODEL_PATH.parent
model_dir.mkdir(parents=True, exist_ok=True)

bundle = {
    "pipeline": clf,
    "label_names": LABEL_NAMES,
    "violation_ids": VIOLATION_IDS,
}
joblib.dump(bundle, MODEL_PATH)

print(f"\nSaved model to: {MODEL_PATH.resolve()}")
