In [1]:
# =========================
# Misclause Detector (Classical ML)
# =========================
# - Input: CSV with columns: text,label  (label: 0=fair, 1=unfair)
# - Models: Linear SVM, RBF SVM, Logistic Regression, Multinomial/Complement NB,
#           PassiveAggressive, RandomForest, GradientBoosting
# - Extras: TF-IDF (word + char), imbalance handling, metrics (P/R/F1 macro),
#           soft-vote Ensemble, simple calibration for LinearSVC
# =========================

import os
import re
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

RANDOM_STATE = 42


In [4]:
# -------------------------
# 0) Load the labelled clauses (CSV columns: text,label)
# -------------------------
CSV_PATH = "Untitled.ipynb.csv"   # change if needed

# When nothing exists at CSV_PATH, write a four-row sample there first so the
# read below always has a file to open.
if not os.path.exists(CSV_PATH):
    demo = """text,label
thanks for sending us good vibes by using the various services available within viber !,0
"you may be surprised , but we will refer to all such services , including additional apps published by viber media s.a.r.l , and also third party services which are powered by viber 's technology , as the `` services '' .",0
"the terms of use -lrb- or , the `` terms '' -rrb- presented below are the basic rights and obligations that you and us have between us when you use our services .",0
"when you use our services , in addition to enjoying a world of good vibes , you also agree to the terms and they affect your rights and obligations .",1
"""
    with open(CSV_PATH, "w", encoding="utf-8") as fh:
        fh.write(demo)

# Read the file, discard rows missing either field, renumber the index,
# and cast the label column to int -- all in one chain.
df = (
    pd.read_csv(CSV_PATH)
    .dropna(subset=["text", "label"])
    .reset_index(drop=True)
    .astype({"label": int})
)


In [6]:
# -------------------------
# 1) Basic cleaning
# -------------------------
def clean_text(s: str) -> str:
    """Normalise one clause before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. turn the tokenised quote marks (two backticks / two apostrophes)
         into a plain double quote and the ``-lrb-`` / ``-rrb-`` bracket
         tokens into ``(`` / ``)``;
      3. replace every character outside ``a-z 0-9 ( ) / - _ ' " : ; , . ! ?``
         and the space with a single space;
      4. collapse every whitespace run to one space and trim both ends.

    Whitespace is collapsed *after* step 3 on purpose: step 3 inserts spaces,
    so collapsing earlier would leave runs of spaces in the result.

    Args:
        s: Raw clause text.

    Returns:
        The cleaned clause, containing no consecutive, leading or trailing
        spaces.
    """
    s = s.lower()
    s = s.replace("``", '"').replace("''", '"')
    s = s.replace("-lrb-", "(").replace("-rrb-", ")")
    s = re.sub(r"[^a-z0-9 ()/\-_'\":;,.!?]", " ", s)  # keep common legal punctuation
    s = re.sub(r"\s+", " ", s)              # normalize spaces (must run last)
    return s.strip()

# Coerce every entry to str, then clean it element by element.
df["text"] = df["text"].astype(str).map(clean_text)



In [7]:
# -------------------------
# 2) Stratified hold-out split (20% test)
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=RANDOM_STATE,
)

# -------------------------
# 3) Features: word-level TF-IDF and character-level TF-IDF, stacked side by side.
#    Word ngrams capture semantics; char ngrams catch morphology & misspellings.
# -------------------------
# Options common to both vectorizers.
shared_tfidf_args = {"min_df": 2, "sublinear_tf": True}

# Unigrams + bigrams over words.
word_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=40000, **shared_tfidf_args)

# Character 3- to 5-grams.
char_vectorizer = TfidfVectorizer(
    analyzer="char", ngram_range=(3, 5), max_features=30000, **shared_tfidf_args
)

# Vocabularies are learned from the training split only; the test split is
# merely projected onto them.
Xw_train, Xc_train = (vec.fit_transform(X_train) for vec in (word_vectorizer, char_vectorizer))
Xw_test, Xc_test = (vec.transform(X_test) for vec in (word_vectorizer, char_vectorizer))

# Word columns first, char columns second -- the same order is needed at inference time.
X_train_vec = hstack([Xw_train, Xc_train]).tocsr()
X_test_vec = hstack([Xw_test, Xc_test]).tocsr()


In [8]:
# -------------------------
# 4) Helper: train & evaluate any sklearn classifier
# -------------------------
def eval_model(name, clf, Xtr, ytr, Xte, yte):
    """Fit a classifier, score it on the held-out set, print a report.

    Args:
        name: Display name; also stored under the "Model" key of the result.
        clf: Estimator exposing ``fit`` and ``predict``. It is fitted in place.
        Xtr, ytr: Training features and labels.
        Xte, yte: Evaluation features and labels.

    Returns:
        dict with keys "Model", "Accuracy", "Precision", "Recall", "F1"
        (precision/recall/F1 are macro-averaged, with zero_division=0) and
        "clf" (the fitted estimator that was passed in).
    """
    clf.fit(Xtr, ytr)
    y_hat = clf.predict(Xte)

    # Same averaging options for the three macro metrics.
    macro = {"average": "macro", "zero_division": 0}
    accuracy = accuracy_score(yte, y_hat)
    macro_p = precision_score(yte, y_hat, **macro)
    macro_r = recall_score(yte, y_hat, **macro)
    macro_f1 = f1_score(yte, y_hat, **macro)

    # Header underlined to the width of the model name, then the numbers.
    print(f"\n{name}")
    print("-" * len(name))
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Precision: {macro_p:.3f}  Recall: {macro_r:.3f}  F1(macro): {macro_f1:.3f}")
    print(classification_report(yte, y_hat, digits=3))

    return {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": macro_p,
        "Recall": macro_r,
        "F1": macro_f1,
        "clf": clf,
    }

# One record per evaluated model is appended here by the cells below.
results = []


In [9]:
# -------------------------
# 5) Individual classifiers (you can add/remove freely)
#    Each model is declared first; the loop at the bottom fits and scores
#    them in the listed order and appends one record per model to `results`.
# -------------------------

# C1: linear SVM (strong on TF-IDF features).
lin_svc = LinearSVC(C=1.0, class_weight="balanced", random_state=RANDOM_STATE)

# C2: RBF-kernel SVM (slower, non-linear); fixed C and gamma, no grid search.
rbf_svc = SVC(
    kernel="rbf",
    C=2.0,
    gamma="scale",
    class_weight="balanced",
    probability=False,
    random_state=RANDOM_STATE,
)

# Logistic regression (strong, interpretable baseline).
lr = LogisticRegression(solver="saga", C=2.0, class_weight="balanced", max_iter=2000, n_jobs=-1)

# Naive Bayes, in both the multinomial and the complement flavour.
mnb = MultinomialNB(alpha=0.5)
cnb = ComplementNB(alpha=0.5)

# Passive-Aggressive (works well on sparse text).
pa = PassiveAggressiveClassifier(
    C=1.0, max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE
)

# Random forest (usually weaker than linear models on text; reused by the ensemble cell).
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Gradient boosting with library defaults.
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)

for model_name, model in [
    ("C1 LinearSVC", lin_svc),
    ("C2 RBF SVC", rbf_svc),
    ("LogisticRegression", lr),
    ("MultinomialNB", mnb),
    ("ComplementNB", cnb),
    ("PassiveAggressive", pa),
    ("RandomForest", rf),
    ("GradientBoosting", gb),
]:
    results.append(eval_model(model_name, model, X_train_vec, y_train, X_test_vec, y_test))




C1 LinearSVC
------------
Accuracy:  0.950
Precision: 0.874  Recall: 0.863  F1(macro): 0.869
              precision    recall  f1-score   support

           0      0.970     0.974     0.972      1677
           1      0.779     0.752     0.765       206

    accuracy                          0.950      1883
   macro avg      0.874     0.863     0.869      1883
weighted avg      0.949     0.950     0.949      1883


C2 RBF SVC
----------
Accuracy:  0.952
Precision: 0.926  Recall: 0.811  F1(macro): 0.857
              precision    recall  f1-score   support

           0      0.956     0.991     0.973      1677
           1      0.897     0.631     0.741       206

    accuracy                          0.952      1883
   macro avg      0.926     0.811     0.857      1883
weighted avg      0.950     0.952     0.948      1883






LogisticRegression
------------------
Accuracy:  0.937
Precision: 0.827  Recall: 0.875  F1(macro): 0.849
              precision    recall  f1-score   support

           0      0.974     0.954     0.964      1677
           1      0.680     0.796     0.734       206

    accuracy                          0.937      1883
   macro avg      0.827     0.875     0.849      1883
weighted avg      0.942     0.937     0.939      1883


MultinomialNB
-------------
Accuracy:  0.929
Precision: 0.903  Recall: 0.700  F1(macro): 0.759
              precision    recall  f1-score   support

           0      0.932     0.993     0.961      1677
           1      0.875     0.408     0.556       206

    accuracy                          0.929      1883
   macro avg      0.903     0.700     0.759      1883
weighted avg      0.926     0.929     0.917      1883


ComplementNB
------------
Accuracy:  0.924
Precision: 0.826  Recall: 0.742  F1(macro): 0.776
              precision    recall  f1-score   supp

In [11]:
# -------------------------
# 6) Probability-capable members for the soft-voting ensemble
#    LinearSVC has no predict_proba, so it is wrapped in a sigmoid (Platt)
#    calibrator fitted with 5-fold cross-validation.
#    NOTE(review): VotingClassifier is documented to fit clones of its
#    estimators, so the two standalone fits below look redundant for the
#    ensemble itself -- confirm before removing them.
# -------------------------
calibrated_lin_svc = CalibratedClassifierCV(
    estimator=LinearSVC(C=1.0, class_weight="balanced", random_state=RANDOM_STATE),
    method="sigmoid",
    cv=5,
).fit(X_train_vec, y_train)

# Second probability-capable linear model, same settings as the standalone LR above.
lr2 = LogisticRegression(solver="saga", C=2.0, class_weight="balanced", max_iter=2000, n_jobs=-1)
lr2.fit(X_train_vec, y_train)




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,2000


In [12]:
# -------------------------
# 7) Ensemble (soft voting) similar to C8
#    Members and weights are listed separately so either can be tweaked
#    based on the individual scores.
# -------------------------
ensemble_members = [
    ("cal_lin_svc", calibrated_lin_svc),
    ("lr", lr2),
    ("rf", rf),
]
ensemble_weights = [2, 2, 1]   # the two linear models count double

# Soft voting = weighted average of the members' predicted probabilities.
ensemble = VotingClassifier(
    estimators=ensemble_members,
    weights=ensemble_weights,
    voting="soft",
)
ensemble_record = eval_model(
    "C8 Ensemble (Calibrated LinearSVC + LR + RF)",
    ensemble,
    X_train_vec,
    y_train,
    X_test_vec,
    y_test,
)
results.append(ensemble_record)





C8 Ensemble (Calibrated LinearSVC + LR + RF)
--------------------------------------------
Accuracy:  0.953
Precision: 0.886  Recall: 0.865  F1(macro): 0.875
              precision    recall  f1-score   support

           0      0.970     0.977     0.974      1677
           1      0.803     0.752     0.777       206

    accuracy                          0.953      1883
   macro avg      0.886     0.865     0.875      1883
weighted avg      0.952     0.953     0.952      1883



In [18]:
# -------------------------
# 8) Summary table: print it sorted by F1, then export it to CSV
# -------------------------
# Every result record minus its fitted estimator, best macro-F1 first.
metric_rows = [{key: val for key, val in rec.items() if key != "clf"} for rec in results]
summary = pd.DataFrame(metric_rows).sort_values("F1", ascending=False)

print("\n=== Summary (sorted by F1 macro) ===")
print(summary.to_string(index=False))

# Exported variant: same row order, F1 column removed, values rounded to 3 decimals.
summary = summary.drop(columns=["F1"]).round(3)

file_path = "model_results_voting_ensemble_table.csv"

# First export creates the file with a header row; later exports append rows only.
is_new_file = not os.path.isfile(file_path)
summary.to_csv(
    file_path,
    index=False,
    mode="w" if is_new_file else "a",
    header=is_new_file,
    float_format="%.3f",
)

print("Summary appended to CSV successfully!")



=== Summary (sorted by F1 macro) ===
                                       Model  Accuracy  Precision   Recall       F1
C8 Ensemble (Calibrated LinearSVC + LR + RF)  0.952735   0.886466 0.864884 0.875255
                                C1 LinearSVC  0.949549   0.874305 0.863095 0.868583
                                  C2 RBF SVC  0.951673   0.926412 0.811062 0.857047
                           PassiveAggressive  0.946893   0.877380 0.838185 0.856332
                          LogisticRegression  0.936803   0.827460 0.875101 0.848963
                            GradientBoosting  0.932554   0.906037 0.719425 0.777441
                                ComplementNB  0.923526   0.825623 0.742034 0.775508
                               MultinomialNB  0.928837   0.903365 0.700306 0.758804
                                RandomForest  0.929368   0.911879 0.698475 0.758411
Summary appended to CSV successfully!


In [14]:
# -------------------------
# 9) Pick the best model, then save it + vectorizers (saving is optional)
# -------------------------
# The winner is chosen directly from `results` by macro-F1 instead of from the
# first row of `summary`: `summary` has had its F1 column dropped and only
# happens to still be in sorted order, and the selection must not depend on
# which cells were (re-)run. Iterating in reverse makes the most recently
# appended record win when scores tie (e.g. after re-running a training cell).
if not results:
    raise RuntimeError("No evaluated models in `results`; run the training cells first.")

best_result = max(reversed(results), key=lambda rec: rec["F1"])
best_name = best_result["Model"]
# Assigned outside the try-block on purpose: the inference helper below needs
# `best_clf` even when persisting the artifacts fails.
best_clf = best_result["clf"]

# Persisting is best-effort: a failure is reported, not raised.
try:
    import joblib
    os.makedirs("artifacts", exist_ok=True)
    joblib.dump(best_clf, "artifacts/best_model.joblib")
    joblib.dump(word_vectorizer, "artifacts/word_vectorizer.joblib")
    joblib.dump(char_vectorizer, "artifacts/char_vectorizer.joblib")
    print(f"\nSaved best model ({best_name}) and vectorizers to ./artifacts/")
except Exception as e:
    print(f"\nCould not save model/vectorizers: {e}")




Saved best model (C8 Ensemble (Calibrated LinearSVC + LR + RF)) and vectorizers to ./artifacts/


In [15]:
# -------------------------
# 10) Inference helper (paste a clause and see prediction)
# -------------------------
def _class_position(clf, pred):
    """Index of label `pred` within ``clf.classes_``.

    Falls back to ``int(pred)`` when the estimator exposes no ``classes_`` or
    the label is not found there, which matches the 0/1 labels used in this
    notebook.
    """
    classes = getattr(clf, "classes_", None)
    if classes is None:
        return int(pred)
    hits = np.flatnonzero(np.asarray(classes) == pred)
    return int(hits[0]) if hits.size else int(pred)


def _confidence_in_prediction(clf, Xv, pred):
    """Confidence of `clf` in its own prediction `pred` for the single row `Xv`.

    Returns a float in [0, 1], or None when the estimator offers neither
    ``predict_proba`` nor ``decision_function``.
    """
    if hasattr(clf, "predict_proba"):
        # Probability columns follow classes_, so look the label up by position.
        return float(clf.predict_proba(Xv)[0][_class_position(clf, pred)])

    if not hasattr(clf, "decision_function"):
        return None

    scores = np.asarray(clf.decision_function(Xv))
    if scores.ndim == 2 and scores.shape[1] >= 2:
        # One score per class -> softmax over the row for a rough probability.
        row = scores[0]
        exps = np.exp(row - row.max())
        return float(exps[_class_position(clf, pred)] / exps.sum())

    # Binary case: a single signed score whose sigmoid approximates the
    # probability of the second (positive) class. Report the complement when
    # the other class was predicted so the number always refers to `pred`.
    score = float(scores.ravel()[0])
    p_positive = 1.0 / (1.0 + np.exp(-score))
    if _class_position(clf, pred) == 1:
        return float(p_positive)
    return float(1.0 - p_positive)


def predict_clause(clause: str, clf=None):
    """Classify one clause and report how confident the model is.

    The clause goes through the same cleaning and the same word+char TF-IDF
    vectorizers (stacked in the same column order) that were used for training.

    Args:
        clause: Raw clause text.
        clf: Fitted classifier to use. Defaults to the notebook-level
            ``best_clf``.

    Returns:
        (label_name, prob): ``label_name`` is "UNFAIR (1)" when the predicted
        label equals 1 and "FAIR (0)" otherwise. ``prob`` is the confidence in
        the *predicted* class: taken from ``predict_proba`` when available,
        otherwise a sigmoid/softmax pseudo-probability derived from
        ``decision_function``, or None when neither exists.
    """
    model = best_clf if clf is None else clf
    cleaned = clean_text(clause)
    Xw = word_vectorizer.transform([cleaned])
    Xc = char_vectorizer.transform([cleaned])
    Xv = hstack([Xw, Xc]).tocsr()

    pred = model.predict(Xv)[0]
    prob = _confidence_in_prediction(model, Xv, pred)

    label_name = "UNFAIR (1)" if pred == 1 else "FAIR (0)"
    return label_name, prob


In [16]:
# quick demo: classify one hand-written clause and print label + confidence
demo_clause = "we may change these terms at any time without notifying you."
lbl, pr = predict_clause(demo_clause)
prob_text = "N/A" if pr is None else pr   # no probability available -> show N/A
print(f"\nDEMO: {demo_clause}\n → Prediction: {lbl}  Prob~{prob_text}")


DEMO: we may change these terms at any time without notifying you.
 → Prediction: UNFAIR (1)  Prob~0.9283801562481895
