In [None]:
"""
Training classical baselines with 5-fold cross-validation on an 80% training split,
followed by final evaluation on a 20% holdout test set.

Dataset (after preprocessing and splitting):
- 477 Offensive samples
- 500 Non-offensive samples

This script assumes you already have:
- train.csv (80% of data) and test.csv (20% holdout) combined into a single full_dataset.csv
- This is the same dataset used to fine-tune the transformers (PuoBERTa, Afro-XLM-R)

- The combined file is stored in a local ./data/ directory.
"""

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Directory where preprocessed CSV files are stored.
# For public code, we assume a local ./data/ folder in the repo.
from pathlib import Path  # local import: `Path` is used below but was never imported

DATA_DIR = Path("data")

# === Step 1: Load Dataset ===
# Build the path from DATA_DIR. The literal string '/DATA_DIR/...' never used the
# variable and pointed at a root-level directory named "DATA_DIR".
df = pd.read_csv(DATA_DIR / 'full_dataset.csv')
df = df[['TEXT', 'TARGET']].dropna()

# === Step 2: Encode Labels ===
# LabelEncoder maps the string labels in TARGET to integer codes; `le.classes_`
# is reused below to name the rows of the classification report.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['TARGET'])

# === Step 3: Split into 80% train_val and 20% test ===
# Stratified so both splits keep the class ratio; fixed seed for reproducibility.
train_val_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label_encoded'], random_state=42
)

# === Step 4: Vectorize ===
# NOTE(review): the vectorizer is fitted on the whole 80% split before the folds
# are drawn, so every fold's validation rows contribute to the vocabulary/IDF
# weights used in that fold. The holdout set is only transformed, never fitted on.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_val = vectorizer.fit_transform(train_val_df['TEXT'])
y_train_val = train_val_df['label_encoded']

X_test = vectorizer.transform(test_df['TEXT'])
y_test = test_df['label_encoded']

# === Step 5: K-Fold CV and Save Best Model ===
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
models = []
best_f1 = 0
best_model = None

print("Cross-validation performance (on 80% training data):")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val, y_train_val), 1):
    # Sparse matrix rows are sliced positionally; the label Series needs .iloc
    # because its index still carries the original DataFrame row labels.
    X_train, X_val = X_train_val[train_idx], X_train_val[val_idx]
    y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]

    clf = LogisticRegression(class_weight={0: 1, 1: 1}, max_iter=1000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='macro')
    f1_scores.append(f1)
    models.append(clf)

    print(f" Fold {fold}: F1 Macro = {f1:.4f}")

    # `best_model is None` guarantees a model is always selected, even in the
    # degenerate case where no fold scores above the initial best_f1 of 0.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = clf
        # A file is written each time a fold improves on the best score so far,
        # so earlier "best" fold files remain on disk next to the final one.
        joblib.dump(clf, DATA_DIR / f'best_logreg_model_fold{fold}.joblib')

print(f"\nBest F1 Macro from CV: {best_f1:.4f}")

# === Step 6: Use best model on 20% holdout ===
# NOTE(review): the recorded output of this cell also lists MCC and ROC-AUC,
# which nothing in this cell computes - confirm where those lines came from.
y_pred_test = best_model.predict(X_test)
print("\nFinal Evaluation on 20% Holdout Set:")
print(classification_report(y_test, y_pred_test, target_names=le.classes_))



Mounted at /content/drive
Cross-validation performance (on 80% training data):
 Fold 1: F1 Macro = 0.7715
 Fold 2: F1 Macro = 0.8181
 Fold 3: F1 Macro = 0.7723
 Fold 4: F1 Macro = 0.8191
 Fold 5: F1 Macro = 0.8175

Best F1 Macro from CV: 0.8191

Final Evaluation on 20% Holdout Set:
               precision    recall  f1-score   support

Non-offensive       0.80      0.83      0.81       100
    Offensive       0.82      0.78      0.80        96

     accuracy                           0.81       196
    macro avg       0.81      0.81      0.81       196
 weighted avg       0.81      0.81      0.81       196


Additional Metrics LogisticRegression:
Matthews Correlation Coefficient (MCC): 0.6427
ROC–AUC: 0.8945


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

# Directory where preprocessed CSV files are stored.
# For public code, we assume a local ./data/ folder in the repo.
from pathlib import Path  # local import: `Path` is used below but was never imported
from sklearn.metrics import matthews_corrcoef, roc_auc_score

DATA_DIR = Path("data")
# Single location for the persisted model so the dump and the load cannot drift apart.
NB_MODEL_PATH = DATA_DIR / "best_nb_model.joblib"

# Load dataset
# Build the path from DATA_DIR. The literal "/DATA_DIR/..." string never used the
# variable and pointed at a root-level directory named "DATA_DIR".
df = pd.read_csv(DATA_DIR / "full_dataset2.csv")
# Drop rows with a missing text or label, as the LogisticRegression cell does.
df = df[['TEXT', 'TARGET']].dropna()

# Encode labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['TARGET'])

# Split into 80% train+val and 20% holdout test set
train_val_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label_encoded'], random_state=42
)

# TF-IDF vectorization
# NOTE(review): fitted on the whole 80% split before the folds are drawn, so each
# fold's validation rows contribute to the vocabulary/IDF weights used in that fold.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_val = vectorizer.fit_transform(train_val_df['TEXT'])
y_train_val = train_val_df['label_encoded'].values
X_test = vectorizer.transform(test_df['TEXT'])
y_test = test_df['label_encoded'].values

# Cross-validation on 80% to select best model
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_f1 = 0
best_model = None

for fold, (train_index, val_index) in enumerate(kf.split(X_train_val, y_train_val)):
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val[train_index], y_train_val[val_index]

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_val_pred, average='macro')

    print(f"Fold {fold+1} F1 Score: {f1:.4f}")

    # `best_model is None` guarantees the model file exists for the load below,
    # even in the degenerate case where no fold scores above 0.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = model
        joblib.dump(best_model, NB_MODEL_PATH)

# Load best model and evaluate on final holdout test set
best_model = joblib.load(NB_MODEL_PATH)
y_pred = best_model.predict(X_test)

# Print evaluation
print("\nFinal Evaluation on 20% Holdout Test Set")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Additional metrics: MCC and ROC–AUC
# MCC must be computed from this cell's own predictions (`y_pred`). The name
# `y_test_pred` is not defined in this cell: on a fresh kernel it raises
# NameError, and with leftover kernel state it scores another model's predictions.
mcc = matthews_corrcoef(y_test, y_pred)

# Prefer decision_function when the model has one, otherwise fall back to the
# positive-class column of predict_proba.
y_test_score = None
if hasattr(best_model, "decision_function"):
    y_test_score = best_model.decision_function(X_test)
    # If decision_function returns (n_samples, n_classes), take the positive class column (1)
    if hasattr(y_test_score, "ndim") and y_test_score.ndim == 2:
        y_test_score = y_test_score[:, 1]
elif hasattr(best_model, "predict_proba"):
    y_test_score = best_model.predict_proba(X_test)[:, 1]

print("\nAdditional Metrics MultinomialNB:")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")

if y_test_score is not None:
    try:
        roc_auc = roc_auc_score(y_test, y_test_score)
        print(f"ROC–AUC: {roc_auc:.4f}")
    except ValueError as e:
        # Report the ValueError raised by roc_auc_score instead of aborting the cell.
        print(f"ROC–AUC not computed: {e}")
else:
    print("ROC–AUC not available (no decision_function/predict_proba for this model).")


Mounted at /content/drive
Fold 1 F1 Score: 0.8333
Fold 2 F1 Score: 0.8782
Fold 3 F1 Score: 0.8077
Fold 4 F1 Score: 0.8269
Fold 5 F1 Score: 0.7817

Final Evaluation on 20% Holdout Test Set
               precision    recall  f1-score   support

Non-offensive       0.90      0.82      0.86       100
    Offensive       0.83      0.91      0.87        96

     accuracy                           0.86       196
    macro avg       0.86      0.86      0.86       196
 weighted avg       0.87      0.86      0.86       196


Additional Metrics MultinomialNB:
Matthews Correlation Coefficient (MCC): 0.6427
ROC–AUC: 0.9179


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

# Directory where preprocessed CSV files are stored (a local ./data/ folder).
# Defined here so the cell does not depend on state left behind by earlier cells.
from pathlib import Path
from sklearn.metrics import matthews_corrcoef, roc_auc_score

DATA_DIR = Path("data")

# Load the dataset
# Build the path from DATA_DIR. The literal "/DATA_DIR/..." string never used the
# variable and pointed at a root-level directory named "DATA_DIR".
df = pd.read_csv(DATA_DIR / "full_dataset2.csv")
# Drop rows with a missing text or label; any other columns are left untouched.
df = df.dropna(subset=['TEXT', 'TARGET'])

# Encode string labels to numeric
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['TARGET'])  # e.g., Non-offensive → 0, Offensive → 1

# Split into train (80%) and holdout test set (20%)
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label_encoded'], random_state=42
)

# TF-IDF vectorization
# NOTE(review): fitted on the whole 80% split before the folds are drawn, so each
# fold's validation rows contribute to the vocabulary/IDF weights used in that fold.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['TEXT'])
y_train = train_df['label_encoded'].values
X_test = vectorizer.transform(test_df['TEXT'])
y_test = test_df['label_encoded'].values

# Cross-validation on 80% training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_f1 = 0
best_model = None

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    svm_clf = LinearSVC(class_weight='balanced', max_iter=10000)
    svm_clf.fit(X_tr, y_tr)
    y_val_pred = svm_clf.predict(X_val)
    f1 = f1_score(y_val, y_val_pred, average='macro')

    print(f"Fold {fold+1} F1 Macro: {f1:.4f}")
    # `best_model is None` guarantees the model file exists for the load below,
    # even in the degenerate case where no fold scores above 0.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = svm_clf
        # Written to the current working directory (not DATA_DIR); kept as-is so
        # anything that loads 'best_svm_model.joblib' by this name still works.
        joblib.dump(best_model, 'best_svm_model.joblib')  # save best model

# Load best model and evaluate on holdout set
print("\nEvaluating best SVM model on holdout test set...")
best_model_ = joblib.load('best_svm_model.joblib')
y_test_pred = best_model_.predict(X_test)
print(classification_report(y_test, y_test_pred, target_names=le.classes_))

# Additional metrics: MCC and ROC–AUC
mcc = matthews_corrcoef(y_test, y_test_pred)

# Use decision_function if available (LinearSVC); fallback to predict_proba if model supports it.
# Scores are taken from the reloaded `best_model_`, the same object that produced
# `y_test_pred`, so the predictions and the scores cannot come from different models.
y_test_score = None
if hasattr(best_model_, "decision_function"):
    y_test_score = best_model_.decision_function(X_test)
    # If decision_function returns (n_samples, n_classes), take the positive class column (1)
    if hasattr(y_test_score, "ndim") and y_test_score.ndim == 2:
        y_test_score = y_test_score[:, 1]
elif hasattr(best_model_, "predict_proba"):
    y_test_score = best_model_.predict_proba(X_test)[:, 1]

print("\nAdditional Metrics SVM:")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")

if y_test_score is not None:
    try:
        roc_auc = roc_auc_score(y_test, y_test_score)
        print(f"ROC–AUC: {roc_auc:.4f}")
    except ValueError as e:
        # Report the ValueError raised by roc_auc_score instead of aborting the cell.
        print(f"ROC–AUC not computed: {e}")
else:
    print("ROC–AUC not available (no decision_function/predict_proba for this model).")


def get_scores_for_auc(model, X):
    """Return the scores for ``X`` that are later passed to ``roc_auc_score``.

    ``predict_proba`` takes priority when the model has it: column 1 of its
    output is returned. Otherwise ``decision_function`` is used; a 2-D result
    is reduced to column 1, anything else is returned unchanged.

    Raises:
        ValueError: if the model exposes neither method.
    """
    if hasattr(model, "predict_proba"):
        # Naive Bayes / Logistic Regression
        positive_scores = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        # LinearSVC
        margins = model.decision_function(X)
        is_two_dimensional = getattr(margins, "ndim", 1) == 2  # rare multi-class path
        positive_scores = margins[:, 1] if is_two_dimensional else margins
    else:
        raise ValueError("Model has neither predict_proba nor decision_function.")
    return positive_scores
# Recompute the holdout metrics, this time obtaining the scores through
# get_scores_for_auc on the reloaded model.
y_test_score = get_scores_for_auc(best_model_, X_test)

mcc, auc = (
    matthews_corrcoef(y_test, y_test_pred),
    roc_auc_score(y_test, y_test_score),
)

# Raw, unformatted values, one per line.
print(mcc, auc, sep="\n")


Fold 1 F1 Macro: 0.8253
Fold 2 F1 Macro: 0.8708
Fold 3 F1 Macro: 0.8323
Fold 4 F1 Macro: 0.8649
Fold 5 F1 Macro: 0.8266

Evaluating best SVM model on holdout test set...
               precision    recall  f1-score   support

Non-offensive       0.82      0.83      0.83       100
    Offensive       0.82      0.81      0.82        96

     accuracy                           0.82       196
    macro avg       0.82      0.82      0.82       196
 weighted avg       0.82      0.82      0.82       196


Additional Metrics SVM:
Matthews Correlation Coefficient (MCC): 0.6427
ROC–AUC: 0.9032
0.6426673830951934
0.9032291666666667
