In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    classification_report, confusion_matrix, precision_recall_curve, auc
)
import pandas as pd

In [2]:
data = pd.read_csv('Untitled.ipynb.csv');

In [3]:
sentences = data['text']

In [4]:
data.count()

text     9414
label    9414
dtype: int64

In [5]:
labels = data['label']

In [6]:
label[0]

NameError: name 'label' is not defined

In [7]:
#to store the results
results = []

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
import os

# ---------- Utility: evaluation printer + file writer ----------
def evaluate_and_print(name, model, X_val, y_val, output_file="model_results_voting_ensemble.txt"):
    """
    Print classification report + PR AUC and confusion matrix.
    Store results in an external file (append mode).
    """
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"{name} Results:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("-" * 30)

    # Append results to global list
    results.append({
        "model": name,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    })
    # Open file in append mode
    with open(output_file, "a") as f:
        f.write(f"\n=== {name} ===\n")
        report = classification_report(y_val, y_pred, digits=4)
        f.write(report + "\n")

        # Confusion matrix
        cm = confusion_matrix(y_val, y_pred)
        f.write("Confusion matrix:\n" + str(cm) + "\n")

        # PR AUC
        try:
            if hasattr(model, "predict_proba"):
                y_scores = model.predict_proba(X_val)[:, 1]
            else:
                y_scores = model.decision_function(X_val)
            precision, recall, _ = precision_recall_curve(y_val, y_scores)
            pr_auc = auc(recall, precision)
            f.write(f"Precision-Recall AUC: {pr_auc:.4f}\n")
        except Exception as e:
            f.write(f"Could not compute PR-AUC: {e}\n")

    # Still print to console for convenience
    print(f"\n=== {name} ===")
    print(report)
    print("Confusion matrix:\n", cm)
    try:
        print(f"Precision-Recall AUC: {pr_auc:.4f}")
    except:
        print("Could not compute PR-AUC")


In [9]:
# Split
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2)

# Vectorize
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:

# Train model
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Evaluate
# y_pred = clf.predict(X_test_tfidf)
# print(classification_report(y_test, y_pred))
evaluate_and_print("Logistic_Regression", clf, X_test_tfidf, y_test)

Logistic_Regression Results:
Accuracy: 0.935740839086564
Precision: 0.9425287356321839
Recall: 0.41414141414141414
------------------------------

=== Logistic_Regression ===
              precision    recall  f1-score   support

           0     0.9354    0.9970    0.9652      1685
           1     0.9425    0.4141    0.5754       198

    accuracy                         0.9357      1883
   macro avg     0.9390    0.7056    0.7703      1883
weighted avg     0.9362    0.9357    0.9243      1883

Confusion matrix:
 [[1680    5]
 [ 116   82]]
Precision-Recall AUC: 0.7537


In [12]:
clf = LogisticRegression(class_weight="balanced", max_iter=200)
clf.fit(X_train_tfidf, y_train)
# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
evaluate_and_print("Balanced_Logistic_Regression", clf, X_test_tfidf, y_test)

              precision    recall  f1-score   support

           0       0.98      0.92      0.95      1685
           1       0.54      0.81      0.65       198

    accuracy                           0.91      1883
   macro avg       0.76      0.87      0.80      1883
weighted avg       0.93      0.91      0.92      1883

Balanced_Logistic_Regression Results:
Accuracy: 0.9081253319171535
Precision: 0.5420875420875421
Recall: 0.8131313131313131
------------------------------

=== Balanced_Logistic_Regression ===
              precision    recall  f1-score   support

           0     0.9767    0.9193    0.9471      1685
           1     0.5421    0.8131    0.6505       198

    accuracy                         0.9081      1883
   macro avg     0.7594    0.8662    0.7988      1883
weighted avg     0.9310    0.9081    0.9159      1883

Confusion matrix:
 [[1549  136]
 [  37  161]]
Precision-Recall AUC: 0.7674


In [37]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train)

clf = LogisticRegression(max_iter=200)
clf.fit(X_resampled, y_resampled)
# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96      1667
           1       0.64      0.77      0.70       216

    accuracy                           0.92      1883
   macro avg       0.80      0.86      0.83      1883
weighted avg       0.93      0.92      0.93      1883



In [38]:
y_probs = clf.predict_proba(X_test_tfidf)[:,1]
y_pred_adjusted = (y_probs >= 0.3).astype(int)  # threshold = 0.3
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92      1667
           1       0.46      0.86      0.60       216

    accuracy                           0.87      1883
   macro avg       0.72      0.86      0.76      1883
weighted avg       0.92      0.87      0.89      1883



In [39]:
#using n grams

In [40]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), max_features=50000)

In [41]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [42]:
#logistic regression + ngrams
# Train model
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1667
           1       0.93      0.35      0.51       216

    accuracy                           0.92      1883
   macro avg       0.92      0.67      0.73      1883
weighted avg       0.92      0.92      0.91      1883



In [113]:
#storing the results back into the file
# storing the results back into the file (append mode)
df_results = pd.DataFrame(results)

# Rename columns to match your CSV headers
df_results.rename(columns={
    "model": "Model",
    "accuracy": "Accuracy",
    "precision": "Precision",
    "recall": "Recall"
}, inplace=True)


file_path = "model_results_voting_ensemble_table.csv"

if not os.path.isfile(file_path):
    # File does not exist -> write with header
    df_results.to_csv(file_path, index=False, mode="w")
else:
    # File exists -> append without header
    df_results.to_csv(file_path, index=False, mode="a", header=False)

print(df_results)


                 model  accuracy  precision    recall
0  Logistic_Regression   0.93043   0.916667  0.417062


In [40]:
results

[{'model': 'Logistic_Regression',
  'accuracy': 0.9309612320764737,
  'precision': 0.9156626506024096,
  'recall': 0.38190954773869346},
 {'model': 'Balanced_Logistic_Regression',
  'accuracy': 0.9134360063728093,
  'precision': 0.5616438356164384,
  'recall': 0.8241206030150754},
 {'model': 'SMOTE',
  'accuracy': 0.9283058948486458,
  'precision': 0.6311475409836066,
  'recall': 0.7738693467336684}]