In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)

# =========================
# 1. LOAD & PREPARE DATA
# =========================
# Read the raw CSV, keep only the three columns this notebook works with and
# cast the target column to an integer dtype, all in one chain so that `df`
# is bound exactly once, already in its prepared form.
# NOTE: the integer cast raises if 'Trigger' holds missing or non-numeric
# values, so a successful run implies the target column is complete.
df = (
    pd.read_csv('/kaggle/input/memo-health-trigger-dataset/Memo_Dataset.csv')
    .loc[:, ['Question', 'Question_eng', 'Trigger']]
    .assign(Trigger=lambda frame: frame['Trigger'].astype(int))
)

# Balance dataset (sample up to 5000 per class)
# Every class is down-sampled to the same size: the size of the smallest
# class, capped at 5000 rows.
min_count = min(5000, df['Trigger'].value_counts().min())

# Sample each class separately and stitch the pieces back together.
# Iterating over the groupby (rather than calling `groupby(...).apply(...)`)
# avoids the pandas DeprecationWarning about `apply` operating on the grouping
# column, while drawing exactly the same rows: each class frame is sampled
# with the same fixed seed, and groups are visited in sorted label order.
df_balanced = pd.concat(
    [
        class_rows.sample(n=min_count, random_state=42)
        for _label, class_rows in df.groupby('Trigger')
    ]
).reset_index(drop=True)

# Keep only the question text and the target, under the generic column names
# the rest of the notebook uses. Renaming by name (instead of assigning to
# `.columns` positionally) cannot silently mislabel a column if the selection
# above is ever reordered.
df_balanced = df_balanced[['Question', 'Trigger']].rename(
    columns={'Question': 'text', 'Trigger': 'label'}
)

# Train-test split
# Hold out 20% of the balanced rows for evaluation. Stratifying on the label
# keeps the class ratio the same in both partitions, and the fixed seed makes
# the partition repeatable across runs.
train_df, test_df = train_test_split(
    df_balanced,
    stratify=df_balanced['label'],
    random_state=42,
    test_size=0.2,
)

# Quick sanity check on the partition sizes.
print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")

# =========================
# 2. TF-IDF VECTORIZATION
# =========================
# Features are unigrams and bigrams that appear in at least 3 training
# documents, with the vocabulary capped at 50,000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_features=50000)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto that vocabulary, so no information
# from the held-out set leaks into the features.
X_train = tfidf.fit_transform(train_df["text"])
y_train = train_df["label"].to_numpy()

X_test = tfidf.transform(test_df["text"])
y_test = test_df["label"].to_numpy()

# =========================
# 3. SVM CLASSIFIER
# =========================
# Linear SVM trained on the sparse TF-IDF features. The solver shuffles the
# data internally when it runs in its dual formulation, so it is seeded with
# the same seed used for sampling and splitting; without it, re-running the
# notebook on a fresh kernel can yield a slightly different model and scores.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

# =========================
# 4. PREDICT
# =========================
# Hard class predictions for the held-out test rows.
preds = svm.predict(X_test)

# =========================
# 5. METRICS
# =========================
# Headline scores on the held-out predictions. Precision, recall and F1 are
# called with scikit-learn's defaults, i.e. binary averaging with label 1 as
# the positive class.
acc, precision, recall, f1 = (
    score_fn(y_test, preds)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, preds)

# A linear SVM does not output probabilities. As a rough, unnormalised
# stand-in for confidence, use the magnitude of the signed decision score:
# the further a sample lies from the separating hyperplane, the larger it is.
# These values are not bounded to [0, 1] and are only comparable within this
# one fitted model.
decision_scores = svm.decision_function(X_test)
conf = np.abs(decision_scores)

# Average that magnitude separately over hits and misses. A group can be
# empty (e.g. no misclassified samples); report NaN for it instead of letting
# NumPy emit a "mean of empty slice" RuntimeWarning.
is_correct = preds == y_test
correct_conf = conf[is_correct].mean() if is_correct.any() else float("nan")
incorrect_conf = conf[~is_correct].mean() if not is_correct.all() else float("nan")

# ---- Test-set composition -------------------------------------------------
# Class counts are taken with a vectorised NumPy count rather than Python's
# built-in sum(), which would iterate over the array element by element.
print("\nDATASET INFO")
print(f"Total test samples: {len(y_test)}")
print(f"Class 0 samples: {np.count_nonzero(y_test == 0)}")
print(f"Class 1 samples: {np.count_nonzero(y_test == 1)}")

# ---- Headline scores (precision / recall / F1 refer to class 1) -----------
print("\nMETRICS")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# ---- Error breakdown: rows = true class, columns = predicted class --------
print("\nCONFUSION MATRIX")
print(cm)

# ---- Per-class precision / recall / F1 with support -----------------------
print("\nCLASSIFICATION REPORT")
print(classification_report(y_test, preds, target_names=["Non-trigger", "Trigger"]))

# ---- Mean decision-score magnitude for hits vs. misses --------------------
# These are unnormalised SVM margins, not probabilities.
print("\nCONFIDENCE ANALYSIS")
print(f"Avg confidence (correct):   {correct_conf:.4f}")
print(f"Avg confidence (incorrect): {incorrect_conf:.4f}")


  df_balanced = df.groupby('Trigger', group_keys=False).apply(


Training samples: 8000, Test samples: 2000

DATASET INFO
Total test samples: 2000
Class 0 samples: 1000
Class 1 samples: 1000

METRICS
Accuracy:  0.6450
Precision: 0.6462
Recall:    0.6410
F1 Score:  0.6436

CONFUSION MATRIX
[[649 351]
 [359 641]]

CLASSIFICATION REPORT
              precision    recall  f1-score   support

 Non-trigger       0.64      0.65      0.65      1000
     Trigger       0.65      0.64      0.64      1000

    accuracy                           0.65      2000
   macro avg       0.65      0.65      0.64      2000
weighted avg       0.65      0.65      0.64      2000


CONFIDENCE ANALYSIS
Avg confidence (correct):   0.6112
Avg confidence (incorrect): 0.4235
