In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    classification_report, confusion_matrix, precision_recall_curve, auc
)
import pandas as pd

In [11]:
data = pd.read_csv('Untitled.ipynb.csv');

In [12]:
sentences = data['text']

In [13]:
data.count()

text     9414
label    9414
dtype: int64

In [14]:
labels = data['label']

In [15]:
results = []

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
import os

# ---------- Utility: evaluation printer + file writer ----------
def evaluate_and_print(name, model, X_val, y_val, output_file="model_results_voting_ensemble.txt"):
    """
    Print classification report + PR AUC and confusion matrix.
    Store results in an external file (append mode).
    """
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"{name} Results:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("-" * 30)

    # Append results to global list
    results.append({
        "model": name,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    })
    # Open file in append mode
    with open(output_file, "a") as f:
        f.write(f"\n=== {name} ===\n")
        report = classification_report(y_val, y_pred, digits=4)
        f.write(report + "\n")

        # Confusion matrix
        cm = confusion_matrix(y_val, y_pred)
        f.write("Confusion matrix:\n" + str(cm) + "\n")

        # PR AUC
        try:
            if hasattr(model, "predict_proba"):
                y_scores = model.predict_proba(X_val)[:, 1]
            else:
                y_scores = model.decision_function(X_val)
            precision, recall, _ = precision_recall_curve(y_val, y_scores)
            pr_auc = auc(recall, precision)
            f.write(f"Precision-Recall AUC: {pr_auc:.4f}\n")
        except Exception as e:
            f.write(f"Could not compute PR-AUC: {e}\n")

    # Still print to console for convenience
    print(f"\n=== {name} ===")
    print(report)
    print("Confusion matrix:\n", cm)
    try:
        print(f"Precision-Recall AUC: {pr_auc:.4f}")
    except:
        print("Could not compute PR-AUC")


In [17]:
# Split
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2)

# Vectorize
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=50000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [18]:

# Train model
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
evaluate_and_print("Logistic_Regression_With_N_grams", clf, X_test_tfidf, y_test)

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1652
           1       0.97      0.39      0.55       231

    accuracy                           0.92      1883
   macro avg       0.94      0.69      0.75      1883
weighted avg       0.93      0.92      0.91      1883

Logistic_Regression_With_N_grams Results:
Accuracy: 0.9229952203929899
Precision: 0.967391304347826
Recall: 0.3852813852813853
------------------------------

=== Logistic_Regression_With_N_grams ===
              precision    recall  f1-score   support

           0     0.9207    0.9982    0.9579      1652
           1     0.9674    0.3853    0.5511       231

    accuracy                         0.9230      1883
   macro avg     0.9441    0.6917    0.7545      1883
weighted avg     0.9264    0.9230    0.9080      1883

Confusion matrix:
 [[1649    3]
 [ 142   89]]
Precision-Recall AUC: 0.8423


In [19]:
clf = LogisticRegression(class_weight="balanced", max_iter=200)
clf.fit(X_train_tfidf, y_train)
# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
evaluate_and_print("Balanced_Logistic_Regression_With_N_grams", clf, X_test_tfidf, y_test)

              precision    recall  f1-score   support

           0       0.98      0.95      0.96      1652
           1       0.70      0.85      0.76       231

    accuracy                           0.94      1883
   macro avg       0.84      0.90      0.86      1883
weighted avg       0.94      0.94      0.94      1883

Balanced_Logistic_Regression_With_N_grams Results:
Accuracy: 0.935740839086564
Precision: 0.6950354609929078
Recall: 0.8484848484848485
------------------------------

=== Balanced_Logistic_Regression_With_N_grams ===
              precision    recall  f1-score   support

           0     0.9781    0.9479    0.9628      1652
           1     0.6950    0.8485    0.7641       231

    accuracy                         0.9357      1883
   macro avg     0.8366    0.8982    0.8635      1883
weighted avg     0.9434    0.9357    0.9384      1883

Confusion matrix:
 [[1566   86]
 [  35  196]]
Precision-Recall AUC: 0.8497


In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

clf = LogisticRegression(max_iter=200)
clf.fit(X_resampled, y_resampled)
# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
evaluate_and_print("SMOTE_With_N_grams", clf, X_test_tfidf, y_test)

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1652
           1       0.76      0.82      0.79       231

    accuracy                           0.95      1883
   macro avg       0.87      0.89      0.88      1883
weighted avg       0.95      0.95      0.95      1883

SMOTE_With_N_grams Results:
Accuracy: 0.9453000531067446
Precision: 0.756
Recall: 0.8181818181818182
------------------------------

=== SMOTE_With_N_grams ===
              precision    recall  f1-score   support

           0     0.9743    0.9631    0.9686      1652
           1     0.7560    0.8182    0.7859       231

    accuracy                         0.9453      1883
   macro avg     0.8651    0.8906    0.8773      1883
weighted avg     0.9475    0.9453    0.9462      1883

Confusion matrix:
 [[1591   61]
 [  42  189]]
Precision-Recall AUC: 0.8517


In [21]:
y_probs = clf.predict_proba(X_test_tfidf)[:,1]
y_pred_adjusted = (y_probs >= 0.3).astype(int)  # threshold = 0.3
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       0.99      0.89      0.93      1652
           1       0.53      0.93      0.68       231

    accuracy                           0.89      1883
   macro avg       0.76      0.91      0.80      1883
weighted avg       0.93      0.89      0.90      1883



In [22]:
#storing the results back into the file
# storing the results back into the file (append mode)
df_results = pd.DataFrame(results)

# Rename columns to match your CSV headers
df_results.rename(columns={
    "model": "Model",
    "accuracy": "Accuracy",
    "precision": "Precision",
    "recall": "Recall"
}, inplace=True)


file_path = "model_results_voting_ensemble_table.csv"

if not os.path.isfile(file_path):
    # File does not exist -> write with header
    df_results.to_csv(file_path, index=False, mode="w")
else:
    # File exists -> append without header
    df_results.to_csv(file_path, index=False, mode="a", header=False)

print(df_results)


                                       Model  Accuracy  Precision    Recall
0           Logistic_Regression_With_N_grams  0.922995   0.967391  0.385281
1  Balanced_Logistic_Regression_With_N_grams  0.935741   0.695035  0.848485
2                         SMOTE_With_N_grams  0.945300   0.756000  0.818182


              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1667
           1       0.93      0.35      0.51       216

    accuracy                           0.92      1883
   macro avg       0.92      0.67      0.73      1883
weighted avg       0.92      0.92      0.91      1883

