In [17]:
"""
Train XGBoost, LightGBM, SVM (with class weights + calibration), and a Voting ensemble
on the LawInformedAI/claudette_tos dataset from HuggingFace.

Save models and vectorizer at the end.
"""
import os
import joblib
import numpy as np
import pandas as pd

from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    classification_report, confusion_matrix, precision_recall_curve, auc
)
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ---------- Settings ----------
RANDOM_STATE = 42
TEST_SIZE = 0.20
TFIDF_NGRAMS = (1, 2)           # unigrams + bigrams
MAX_FEATURES = 50000            # adjust if memory-constrained
USE_SMOTE = False               # if True, will perform oversampling on train set
SAVE_DIR = "./models"
os.makedirs(SAVE_DIR, exist_ok=True)

In [18]:
#creating a results array to store the results
results = []

In [19]:
# ---------- Load dataset from HuggingFace ----------
# NOTE: This uses the dataset id you provided; it loads the 'train' split (≈9.41k rows).
ds = load_dataset("LawInformedAI/claudette_tos", split="train")
# Convert to pandas
df = ds.to_pandas()

# Columns: typically 'text' and 'label' (int 0/1)
print("Columns in dataset:", df.columns.tolist())
print("Example counts:\n", df['label'].value_counts(normalize=False))
print("Shape:", df.shape)

Columns in dataset: ['text', 'label']
Example counts:
 label
0    8382
1    1032
Name: count, dtype: int64
Shape: (9414, 2)


In [20]:
# ---------- Prepare data ----------
# Make sure the text column is named correctly; adjust if your column name differs
TEXT_COL = "text"
LABEL_COL = "label"

X = df[TEXT_COL].astype(str).values
y = df[LABEL_COL].astype(int).values

# stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

In [21]:
# ---------- TF-IDF vectorization ----------
vectorizer = TfidfVectorizer(ngram_range=TFIDF_NGRAMS, max_features=MAX_FEATURES, min_df=2)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF shape:", X_train_tfidf.shape)

# Optional: SMOTE (uncomment if you want to use oversampling)
if USE_SMOTE:
    from imblearn.over_sampling import SMOTE
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train_tfidf, y_train = sm.fit_resample(X_train_tfidf, y_train)
    print("After SMOTE, train shape:", X_train_tfidf.shape, np.bincount(y_train))

TF-IDF shape: (7531, 25900)


In [5]:
# ---------- Utility: evaluation printer ----------
def evaluate_and_print(name, model, X_val, y_val):
    """
    Print classification report + PR AUC and confusion matrix.
    If model has predict_proba use that for PR-AUC; otherwise use decision_function.
    """
    y_pred = model.predict(X_val)
    print(f"\n=== {name} ===")
    print(classification_report(y_val, y_pred, digits=4))
    # Confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    print("Confusion matrix:\n", cm)
    # PR AUC
    try:
        if hasattr(model, "predict_proba"):
            y_scores = model.predict_proba(X_val)[:, 1]
        else:
            # fallback to decision_function (SVM)
            y_scores = model.decision_function(X_val)
        precision, recall, _ = precision_recall_curve(y_val, y_scores)
        pr_auc = auc(recall, precision)
        print(f"Precision-Recall AUC: {pr_auc:.4f}")
    except Exception as e:
        print("Could not compute PR-AUC:", e)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
import os

# ---------- Utility: evaluation printer + file writer ----------
def evaluate_and_print(name, model, X_val, y_val, output_file="model_results_voting_ensemble.txt"):
    """
    Print classification report + PR AUC and confusion matrix.
    Store results in an external file (append mode).
    """
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"{name} Results:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("-" * 30)

    # Append results to global list
    results.append({
        "model": name,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    })
    # Open file in append mode
    with open(output_file, "a") as f:
        f.write(f"\n=== {name} ===\n")
        report = classification_report(y_val, y_pred, digits=4)
        f.write(report + "\n")

        # Confusion matrix
        cm = confusion_matrix(y_val, y_pred)
        f.write("Confusion matrix:\n" + str(cm) + "\n")

        # PR AUC
        try:
            if hasattr(model, "predict_proba"):
                y_scores = model.predict_proba(X_val)[:, 1]
            else:
                y_scores = model.decision_function(X_val)
            precision, recall, _ = precision_recall_curve(y_val, y_scores)
            pr_auc = auc(recall, precision)
            f.write(f"Precision-Recall AUC: {pr_auc:.4f}\n")
        except Exception as e:
            f.write(f"Could not compute PR-AUC: {e}\n")

    # Still print to console for convenience
    print(f"\n=== {name} ===")
    print(report)
    print("Confusion matrix:\n", cm)
    try:
        print(f"Precision-Recall AUC: {pr_auc:.4f}")
    except:
        print("Could not compute PR-AUC")


In [23]:
# ---------- Model 1: XGBoost ----------
# handle imbalance via scale_pos_weight = (neg_count / pos_count)
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / max(1, pos)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
print("Training XGBoost...")
xgb.fit(X_train_tfidf, y_train)
evaluate_and_print("XGBoost", xgb, X_test_tfidf, y_test)
joblib.dump(xgb, os.path.join(SAVE_DIR, "xgb_claudette.joblib"))

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results:
Accuracy: 0.9341476367498672
Precision: 0.6830357142857143
Recall: 0.7427184466019418
------------------------------

=== XGBoost ===
              precision    recall  f1-score   support

           0     0.9681    0.9577    0.9628      1677
           1     0.6830    0.7427    0.7116       206

    accuracy                         0.9341      1883
   macro avg     0.8255    0.8502    0.8372      1883
weighted avg     0.9369    0.9341    0.9353      1883

Confusion matrix:
 [[1606   71]
 [  53  153]]
Precision-Recall AUC: 0.7615


['./models/xgb_claudette.joblib']

In [24]:
# ---------- Model 2: LightGBM ----------
from lightgbm import early_stopping, log_evaluation
lgbm = LGBMClassifier(
    n_estimators=300,
    max_depth=-1,
    learning_rate=0.05,
    objective="binary",
    class_weight="balanced",  # alternatively use scale_pos_weight
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
print("Training LightGBM...")
lgbm.fit(
    X_train_tfidf, 
    y_train,
    eval_set=[(X_test_tfidf, y_test)],
    callbacks=[early_stopping(stopping_rounds=30), log_evaluation(50)]
)
evaluate_and_print("LightGBM", lgbm, X_test_tfidf, y_test)
joblib.dump(lgbm, os.path.join(SAVE_DIR, "lgbm_claudette.joblib"))

Training LightGBM...
[LightGBM] [Info] Number of positive: 826, number of negative: 6705
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66807
[LightGBM] [Info] Number of data points in the train set: 7531, number of used features: 2420
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 30 rounds
[50]	valid_0's binary_logloss: 0.299598
[100]	valid_0's binary_logloss: 0.217378
[150]	valid_0's binary_logloss: 0.186309
[200]	valid_0's binary_logloss: 0.171015
[250]	valid_0's binary_logloss: 0.162707
[300]	valid_0's binary_logloss: 0.159531
Did not meet early stopping. Best iteration is:
[291]	valid_0's binary_logloss: 0.159334
LightGBM Results:
Accuracy: 0



['./models/lgbm_claudette.joblib']

In [25]:

# ---------- Model 3: SVM (LinearSVC) with calibration ----------
# LinearSVC is fast and supports sparse input; class_weight='balanced' to handle class imbalance
svc = LinearSVC(class_weight="balanced", max_iter=20000, random_state=RANDOM_STATE)
# Calibrate to get probabilities (useful for PR curves / voting soft)
svc_cal = CalibratedClassifierCV(estimator=svc, cv=3, method="sigmoid")
print("Training SVM (LinearSVC) with calibration (this can take some time)...")
svc_cal.fit(X_train_tfidf, y_train)
evaluate_and_print("SVM (Calibrated LinearSVC)", svc_cal, X_test_tfidf, y_test)
joblib.dump(svc_cal, os.path.join(SAVE_DIR, "svm_calibrated_claudette.joblib"))

Training SVM (LinearSVC) with calibration (this can take some time)...
SVM (Calibrated LinearSVC) Results:
Accuracy: 0.9548592671269251
Precision: 0.8622754491017964
Recall: 0.6990291262135923
------------------------------

=== SVM (Calibrated LinearSVC) ===
              precision    recall  f1-score   support

           0     0.9639    0.9863    0.9749      1677
           1     0.8623    0.6990    0.7721       206

    accuracy                         0.9549      1883
   macro avg     0.9131    0.8427    0.8735      1883
weighted avg     0.9528    0.9549    0.9528      1883

Confusion matrix:
 [[1654   23]
 [  62  144]]
Precision-Recall AUC: 0.8297


['./models/svm_calibrated_claudette.joblib']

In [26]:
# ---------- Ensemble: Soft Voting (XGB + LGBM + SVM-calibrated) ----------
# Create fresh estimator instances (voting will fit them), but it's okay to reuse trained ones if you prefer to skip refit
voting = VotingClassifier(
    estimators=[
        ("xgb", XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.05,
            use_label_encoder=False, eval_metric="logloss", scale_pos_weight=scale_pos_weight, random_state=RANDOM_STATE)),
        ("lgbm", LGBMClassifier(n_estimators=200, learning_rate=0.05, class_weight="balanced", random_state=RANDOM_STATE)),
        ("svm", CalibratedClassifierCV(estimator=LinearSVC(class_weight="balanced", max_iter=20000, random_state=RANDOM_STATE), cv=3)),
    ],
    voting="soft",
    n_jobs=-1,
)
print("Training Voting ensemble (soft voting)...")
voting.fit(X_train_tfidf, y_train)
evaluate_and_print("Voting Ensemble (XGB+LGBM+SVM)", voting, X_test_tfidf, y_test)
joblib.dump(voting, os.path.join(SAVE_DIR, "voting_ensemble_claudette.joblib"))

Training Voting ensemble (soft voting)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Voting Ensemble (XGB+LGBM+SVM) Results:
Accuracy: 0.9468932554434413
Precision: 0.7676767676767676
Recall: 0.7378640776699029
------------------------------

=== Voting Ensemble (XGB+LGBM+SVM) ===
              precision    recall  f1-score   support

           0     0.9680    0.9726    0.9703      1677
           1     0.7677    0.7379    0.7525       206

    accuracy                         0.9469      1883
   macro avg     0.8678    0.8552    0.8614      1883
weighted avg     0.9460    0.9469    0.9464      1883

Confusion matrix:
 [[1631   46]
 [  54  152]]
Precision-Recall AUC: 0.8183




['./models/voting_ensemble_claudette.joblib']

In [28]:
# ---------- Save vectorizer ----------
joblib.dump(vectorizer, os.path.join(SAVE_DIR, "tfidf_vectorizer.joblib"))
print("All models + vectorizer saved to", SAVE_DIR)

# ---------- Optional: adjust decision threshold for best F1 (example for xgb) ----------
def find_best_threshold(model, X_val, y_val):
    # returns threshold that maximizes F1
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_val)[:, 1]
    else:
        probs = model.decision_function(X_val)
    precision, recall, thresholds = precision_recall_curve(y_val, probs)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-12)
    best_idx = np.nanargmax(f1_scores)
    best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    return best_threshold, f1_scores[best_idx]

best_thr, best_f1 = find_best_threshold(xgb, X_test_tfidf, y_test)
print(f"XGBoost best threshold (on test) for F1 ~ {best_f1:.4f} at threshold {best_thr:.4f}")


All models + vectorizer saved to ./models
XGBoost best threshold (on test) for F1 ~ 0.7226 at threshold 0.6313


In [29]:
results

[{'model': 'XGBoost',
  'accuracy': 0.9341476367498672,
  'precision': 0.6830357142857143,
  'recall': 0.7427184466019418},
 {'model': 'LightGBM',
  'accuracy': 0.944768985661179,
  'precision': 0.755,
  'recall': 0.7330097087378641},
 {'model': 'SVM (Calibrated LinearSVC)',
  'accuracy': 0.9548592671269251,
  'precision': 0.8622754491017964,
  'recall': 0.6990291262135923},
 {'model': 'Voting Ensemble (XGB+LGBM+SVM)',
  'accuracy': 0.9468932554434413,
  'precision': 0.7676767676767676,
  'recall': 0.7378640776699029}]

In [30]:
#storing the results back into the file
# storing the results back into the file (append mode)
df_results = pd.DataFrame(results)

# Rename columns to match your CSV headers
df_results.rename(columns={
    "model": "Model",
    "accuracy": "Accuracy",
    "precision": "Precision",
    "recall": "Recall"
}, inplace=True)


file_path = "model_results_voting_ensemble_table.csv"

if not os.path.isfile(file_path):
    # File does not exist -> write with header
    df_results.to_csv(file_path, index=False, mode="w")
else:
    # File exists -> append without header
    df_results.to_csv(file_path, index=False, mode="a", header=False)

print(df_results)

                            Model  Accuracy  Precision    Recall
0                         XGBoost  0.934148   0.683036  0.742718
1                        LightGBM  0.944769   0.755000  0.733010
2      SVM (Calibrated LinearSVC)  0.954859   0.862275  0.699029
3  Voting Ensemble (XGB+LGBM+SVM)  0.946893   0.767677  0.737864
