In [12]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#########################################
# Load Data
#########################################
# Read the train and test splits in one pass; both files are expected to
# provide the 'sbert_embedding' column used further down.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train_sbert.csv', 'df_test_sbert.csv')
)

def parse_embedding(embedding):
    """Convert one stored embedding value into a NumPy array.

    A string is treated as a bracketed list of numbers separated by
    whitespace (including newlines) and/or commas, e.g. ``"[0.1 0.2]"`` or
    ``"[0.1, 0.2]"``. Any other value is passed to ``np.array`` unchanged.

    Args:
      embedding: A string-encoded vector or an already numeric sequence/array.

    Returns:
      A NumPy array holding the embedding values (float64 for string input).

    Raises:
      ValueError: If a token of a string embedding is not a valid float.
    """
    if isinstance(embedding, str):
        # Drop the brackets and turn commas into spaces so that both NumPy's
        # print format and a Python list repr split into clean number tokens.
        cleaned = embedding.replace("[", "").replace("]", "").replace(",", " ")
        return np.array([float(token) for token in cleaned.split()])
    return np.array(embedding)  # Already numeric: just wrap it

# Convert all embeddings properly
# (string cells are parsed into float arrays by parse_embedding; other values are wrapped with np.array)
df_train["sbert_embedding"] = df_train["sbert_embedding"].apply(parse_embedding)
df_test["sbert_embedding"] = df_test["sbert_embedding"].apply(parse_embedding)

#########################################
# Step 1: Train Binary Classifier (XGB + Logistic)
#########################################
# Convert embeddings to NumPy arrays
# Each row's vector is cast to float32 and the rows are stacked into one 2-D matrix per split.
X_train_bin = np.vstack(df_train['sbert_embedding'].apply(lambda x: np.array(x, dtype=np.float32)))
X_test_bin  = np.vstack(df_test['sbert_embedding'].apply(lambda x: np.array(x, dtype=np.float32)))
# Binary targets come straight from the 'label' column (1 is treated as in-topic below).
y_train_bin = df_train['label'].values
y_test_bin  = df_test['label'].values

# Train XGBoost binary classifier
xgb_bin = XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    eval_metric="logloss",
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
    n_jobs=-1
)
xgb_bin.fit(X_train_bin, y_train_bin)
print("✅ XGB binary model trained.")

# Train logistic regression for probability calibration
# The positive-class probability from XGBoost is the only feature of the second-stage model.
# NOTE(review): y_prob_train_bin is computed on the same rows xgb_bin was fitted on, so the
# logistic stage learns from in-sample probabilities — consider out-of-fold predictions.
y_prob_train_bin = xgb_bin.predict_proba(X_train_bin)[:, 1].reshape(-1, 1)
# y_prob_test_bin is computed here but not used again in this cell.
y_prob_test_bin  = xgb_bin.predict_proba(X_test_bin)[:, 1].reshape(-1, 1)

# Standardise the single probability feature, then fit a class-balanced logistic regression on it.
log_reg_bin = make_pipeline(StandardScaler(), LogisticRegression(class_weight="balanced", max_iter=1000))
log_reg_bin.fit(y_prob_train_bin, y_train_bin)
print("✅ Logistic regression calibration trained.")

def binary_ensemble_predict(X):
    """Classify each row of X as in-topic (1) or out-of-topic (0) with the two-stage ensemble.

    XGBoost's positive-class probability is fed, as a single feature column,
    to the logistic-regression pipeline, whose predicted labels are returned.
    """
    positive_proba = xgb_bin.predict_proba(X)[:, 1]
    # The pipeline expects a 2-D feature matrix: one column, one row per sample.
    stage_two_input = positive_proba.reshape(-1, 1)
    return log_reg_bin.predict(stage_two_input)

#########################################
# Step 2: Prepare Multiclass Dataset (Only True & False Positives)
#########################################
# Run the binary stage on both splits; only rows it flags as in-topic are
# passed on to the multiclass stage.
binary_preds_train = binary_ensemble_predict(X_train_bin)
binary_preds_test  = binary_ensemble_predict(X_test_bin)

# Get indices of in-topic predictions (true or false positives)
train_in_topic_indices = np.where(binary_preds_train == 1)[0]
test_in_topic_indices  = np.where(binary_preds_test == 1)[0]

print(f"🔹 Train: {len(train_in_topic_indices)} in-topic predictions")
print(f"🔹 Test: {len(test_in_topic_indices)} in-topic predictions")

# Subset both frames to the rows predicted in-topic (positional indexing)
df_train_multi = df_train.iloc[train_in_topic_indices].copy()
df_test_multi  = df_test.iloc[test_in_topic_indices].copy()

# Set false positives to '0' in topic_id.
# Vectorised instead of a row-wise DataFrame.apply: same values, much faster,
# and it also works when a subset happens to be empty.
df_train_multi['topic_id'] = np.where(
    df_train_multi['label'].to_numpy() == 1, df_train_multi['topic_id'].to_numpy(), 0.0
)
df_test_multi['topic_id'] = np.where(
    df_test_multi['label'].to_numpy() == 1, df_test_multi['topic_id'].to_numpy(), 0.0
)

# Embeddings: reuse the rows of the float32 matrices built for the binary
# stage rather than re-stacking the per-row arrays a second time.
X_train_multi = X_train_bin[train_in_topic_indices]
X_test_multi  = X_test_bin[test_in_topic_indices]

# Map topic_id to contiguous integers (np.unique already returns sorted values)
unique_labels = np.unique(df_train_multi['topic_id'])
topic_mapping = {label: idx for idx, label in enumerate(unique_labels)}
y_train_multi = np.array([topic_mapping[label] for label in df_train_multi['topic_id']])
# A test topic_id that never occurs in the training subset raises KeyError here.
y_test_multi  = np.array([topic_mapping[label] for label in df_test_multi['topic_id']])
num_classes = len(unique_labels)

print("Multiclass: Unique labels:", unique_labels)
print("Number of classes:", num_classes)

#########################################
# Step 3: Train Multiclass Classifier (SVM)
#########################################
# Only a stratified 20% of the multiclass training rows is kept for the grid
# search; the remaining 80% (the "test" part of the split) is discarded via `_`.
subset_fraction = 0.2
X_train_multi_subset, _, y_train_multi_subset, _ = train_test_split(
    X_train_multi, y_train_multi, test_size=1 - subset_fraction, random_state=42, stratify=y_train_multi
)
print("Reduced multiclass training set shape:", X_train_multi_subset.shape)

# 2 values of C x 2 values of gamma = 4 RBF candidates.
param_grid = [
    {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.01]}
]

svm = SVC(random_state=42)

# 3-fold cross-validation scored by macro F1, parallelised over all cores.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train_multi_subset, y_train_multi_subset)
print("Best SVM parameters:", grid_search.best_params_)
print("Best SVM macro F1 score:", grid_search.best_score_)

# NOTE(review): with GridSearchCV's default refit, best_estimator_ is fitted on the data
# passed to fit(), i.e. the 20% subset only — confirm this is intended.
best_svm = grid_search.best_estimator_

# Evaluate on full test set
# (X_test_multi holds only the test rows that the binary stage predicted as in-topic.)
y_pred_multi = best_svm.predict(X_test_multi)
print("Multiclass classification report:")
print(classification_report(y_test_multi, y_pred_multi))
print("Multiclass confusion matrix:")
print(confusion_matrix(y_test_multi, y_pred_multi))

#########################################
# Step 4: Hierarchical Prediction
#########################################
def hierarchical_predict(X):
    """
    Hierarchical prediction:
      1. Use the binary ensemble to decide if an instance is in-topic (1) or out-of-topic (0).
      2. For in-topic instances, use the best SVM to predict a topic (multiclass).
      3. For out-of-topic instances, assign "out-of-topic".

    All in-topic rows are sent to the SVM in a single batched call instead of
    one ``predict`` call per row.

    Args:
      X: 2-D array of input embeddings (the same features are used by the
         binary and the multiclass stage).

    Returns:
      final_preds: Object-dtype array with one entry per row of X, holding
        either the SVM's integer class index or the string "out-of-topic".
        Object dtype keeps the integers as integers; a plain ``np.array`` over
        mixed values would turn every entry into a string.

    Note:
      The integers are the contiguous class indices produced by
      ``topic_mapping``, not the original ``topic_id`` values. The class that
      stands for ``topic_id == 0.0`` (binary false positives in the training
      data) is returned as its index, not as "out-of-topic".
    """
    X = np.asarray(X)
    binary_preds = np.asarray(binary_ensemble_predict(X))

    # Start from "everything is out-of-topic" and overwrite the in-topic rows.
    final_preds = np.full(len(binary_preds), "out-of-topic", dtype=object)
    in_topic_mask = binary_preds == 1
    if in_topic_mask.any():  # SVC.predict rejects an empty batch
        final_preds[in_topic_mask] = best_svm.predict(X[in_topic_mask])
    return final_preds

# Run the two-stage pipeline on every test embedding and show the first ten results.
final_predictions = hierarchical_predict(X_test_bin)
print("Hierarchical predictions sample:", final_predictions[:10])


✅ XGB binary model trained.
✅ Logistic regression calibration trained.
🔹 Train: 17252 in-topic predictions
🔹 Test: 4171 in-topic predictions
Multiclass: Unique labels: [  0. 543. 544. 546. 547. 550. 552. 554. 556. 600. 602.]
Number of classes: 11
Reduced multiclass training set shape: (3450, 384)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best SVM parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best SVM macro F1 score: 0.5633535425810002
Multiclass classification report:
              precision    recall  f1-score   support

           0       0.46      0.25      0.33      1174
           1       0.52      0.71      0.60       520
           2       0.35      0.34      0.35       430
           3       0.60      0.78      0.68       407
           4       0.80      0.66      0.72       371
           5       0.53      0.73      0.62       425
           6       0.58      0.31      0.40        58
           7       0.34      0.31      0.33        48
           8  

We can observe that the binary stage produces too many false negatives (i.e., in-topic samples predicted as out-of-topic), which leads to underperformance in the downstream multiclass classification. So in the following step, I switch to logistic regression as the first-stage classifier, since it has the highest recall on label True (i.e., it should produce more false positives than false negatives).

In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

############################################
# Load Data
############################################
# Read the train and test splits in one pass; both files are expected to
# provide the 'sbert_embedding' column used further down.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train_sbert.csv', 'df_test_sbert.csv')
)

def parse_embedding(embedding):
    """Convert one stored embedding value into a NumPy array.

    A string is treated as a bracketed list of numbers separated by
    whitespace (including newlines) and/or commas, e.g. ``"[0.1 0.2]"`` or
    ``"[0.1, 0.2]"``. Any other value is passed to ``np.array`` unchanged.

    Args:
      embedding: A string-encoded vector or an already numeric sequence/array.

    Returns:
      A NumPy array holding the embedding values (float64 for string input).

    Raises:
      ValueError: If a token of a string embedding is not a valid float.
    """
    if isinstance(embedding, str):
        # Drop the brackets and turn commas into spaces so that both NumPy's
        # print format and a Python list repr split into clean number tokens.
        cleaned = embedding.replace("[", "").replace("]", "").replace(",", " ")
        return np.array([float(token) for token in cleaned.split()])
    return np.array(embedding)  # Already numeric: just wrap it

# Convert all embeddings properly
# (string cells are parsed into float arrays by parse_embedding; other values are wrapped with np.array)
df_train["sbert_embedding"] = df_train["sbert_embedding"].apply(parse_embedding)
df_test["sbert_embedding"] = df_test["sbert_embedding"].apply(parse_embedding)

############################################
# Step 1: Train Binary Classifier (Logistic Regression)
############################################
# Each row's vector is cast to float32 and the rows are stacked into one 2-D matrix per split.
X_train_bin = np.vstack(df_train["sbert_embedding"].apply(lambda x: np.array(x, dtype=np.float32)))
X_test_bin  = np.vstack(df_test["sbert_embedding"].apply(lambda x: np.array(x, dtype=np.float32)))
# Binary targets come straight from the 'label' column (1 is treated as in-topic below).
y_train_bin = df_train["label"].values
y_test_bin  = df_test["label"].values

# Train Logistic Regression for binary classification
# class_weight="balanced" reweights the classes inversely to their frequency in y_train_bin.
binary_model = LogisticRegression(C=1, max_iter=1000, class_weight="balanced")
binary_model.fit(X_train_bin, y_train_bin)
print("✅ Binary classifier (Logistic Regression) trained.")

def binary_predict(X):
    """Label each row of X with the logistic-regression binary model (1: in-topic, 0: out-of-topic)."""
    predicted_labels = binary_model.predict(X)
    return predicted_labels

############################################
# Step 2: Prepare Multiclass Dataset (Only True & False Positives)
############################################

# Run the binary stage on both splits; only rows it flags as in-topic are
# passed on to the multiclass stage.
binary_preds_train = binary_predict(X_train_bin)
binary_preds_test  = binary_predict(X_test_bin)

# Get indices of in-topic predictions (true or false positives)
train_in_topic_indices = np.where(binary_preds_train == 1)[0]
test_in_topic_indices  = np.where(binary_preds_test == 1)[0]

print(f"🔹 Train: {len(train_in_topic_indices)} in-topic predictions")
print(f"🔹 Test: {len(test_in_topic_indices)} in-topic predictions")

# Subset both frames to the rows predicted in-topic (positional indexing)
df_train_multi = df_train.iloc[train_in_topic_indices].copy()
df_test_multi  = df_test.iloc[test_in_topic_indices].copy()

# Set false positives to '0' in topic_id.
# Vectorised instead of a row-wise DataFrame.apply: same values, much faster,
# and it also works when a subset happens to be empty.
df_train_multi['topic_id'] = np.where(
    df_train_multi['label'].to_numpy() == 1, df_train_multi['topic_id'].to_numpy(), 0.0
)
df_test_multi['topic_id'] = np.where(
    df_test_multi['label'].to_numpy() == 1, df_test_multi['topic_id'].to_numpy(), 0.0
)

# Embeddings: reuse the rows of the float32 matrices built for the binary
# stage rather than re-stacking the per-row arrays a second time.
X_train_multi = X_train_bin[train_in_topic_indices]
X_test_multi  = X_test_bin[test_in_topic_indices]

# Map topic_id to contiguous integers (np.unique already returns sorted values)
unique_labels = np.unique(df_train_multi['topic_id'])
topic_mapping = {label: idx for idx, label in enumerate(unique_labels)}
y_train_multi = np.array([topic_mapping[label] for label in df_train_multi['topic_id']])
# A test topic_id that never occurs in the training subset raises KeyError here.
y_test_multi  = np.array([topic_mapping[label] for label in df_test_multi['topic_id']])
num_classes = len(unique_labels)

print("Multiclass: Unique labels:", unique_labels)
print("Number of classes:", num_classes)

############################################
# Step 3: Train Multiclass Classifier (SVM)
############################################
# Only a stratified 20% of the multiclass training rows is kept for the grid
# search; the remaining 80% (the "test" part of the split) is discarded via `_`.
subset_fraction = 0.2
X_train_multi_subset, _, y_train_multi_subset, _ = train_test_split(
    X_train_multi, y_train_multi, test_size=1 - subset_fraction, random_state=42, stratify=y_train_multi
)
print("Reduced multiclass training set shape:", X_train_multi_subset.shape)

# 2 values of C x 2 values of gamma = 4 RBF candidates.
param_grid = [
    {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.01]}
]

svm = SVC(random_state=42)

# 3-fold cross-validation scored by macro F1, parallelised over all cores.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train_multi_subset, y_train_multi_subset)
print("✅ Best SVM parameters:", grid_search.best_params_)
print("✅ Best SVM macro F1 score:", grid_search.best_score_)

# NOTE(review): with GridSearchCV's default refit, best_estimator_ is fitted on the data
# passed to fit(), i.e. the 20% subset only — confirm this is intended.
best_svm = grid_search.best_estimator_

# Evaluate on full test set
# (X_test_multi holds only the test rows that the binary stage predicted as in-topic.)
y_pred_multi = best_svm.predict(X_test_multi)
print("Multiclass classification report:")
print(classification_report(y_test_multi, y_pred_multi))
print("Multiclass confusion matrix:")
print(confusion_matrix(y_test_multi, y_pred_multi))

#########################################
# Step 4: Hierarchical Prediction
#########################################
def hierarchical_predict(X):
    """
    Hierarchical prediction:
      1. Use the logistic-regression binary model (via ``binary_predict``) to
         decide if an instance is in-topic (1) or out-of-topic (0).
      2. For in-topic instances, use the best SVM to predict a topic (multiclass).
      3. For out-of-topic instances, assign "out-of-topic".

    The binary stage must be ``binary_predict``: ``binary_ensemble_predict``
    is not defined in this cell and, if still present from an earlier run,
    refers to a different (XGBoost + logistic) model than the one the SVM's
    training subset was selected with.

    All in-topic rows are sent to the SVM in a single batched call instead of
    one ``predict`` call per row.

    Args:
      X: 2-D array of input embeddings (the same features are used by the
         binary and the multiclass stage).

    Returns:
      final_preds: Object-dtype array with one entry per row of X, holding
        either the SVM's integer class index or the string "out-of-topic".
        Object dtype keeps the integers as integers; a plain ``np.array`` over
        mixed values would turn every entry into a string.

    Note:
      The integers are the contiguous class indices produced by
      ``topic_mapping``, not the original ``topic_id`` values. The class that
      stands for ``topic_id == 0.0`` (binary false positives in the training
      data) is returned as its index, not as "out-of-topic".
    """
    X = np.asarray(X)
    binary_preds = np.asarray(binary_predict(X))

    # Start from "everything is out-of-topic" and overwrite the in-topic rows.
    final_preds = np.full(len(binary_preds), "out-of-topic", dtype=object)
    in_topic_mask = binary_preds == 1
    if in_topic_mask.any():  # SVC.predict rejects an empty batch
        final_preds[in_topic_mask] = best_svm.predict(X[in_topic_mask])
    return final_preds

# Run the two-stage pipeline on every test embedding and show the first ten results.
final_predictions = hierarchical_predict(X_test_bin)
print("Hierarchical predictions sample:", final_predictions[:10])


✅ Binary classifier (Logistic Regression) trained.
🔹 Train: 23818 in-topic predictions
🔹 Test: 5997 in-topic predictions
Multiclass: Unique labels: [  0. 543. 544. 546. 547. 550. 552. 554. 556. 600. 602.]
Number of classes: 11
Reduced multiclass training set shape: (4763, 384)
Fitting 3 folds for each of 4 candidates, totalling 12 fits
✅ Best SVM parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
✅ Best SVM macro F1 score: 0.43514134383724556
Multiclass classification report:
              precision    recall  f1-score   support

           0       0.59      0.81      0.68      2530
           1       0.58      0.57      0.58       559
           2       0.46      0.18      0.25       513
           3       0.67      0.48      0.56       492
           4       0.86      0.57      0.69       413
           5       0.62      0.52      0.57       501
           6       0.55      0.09      0.15        67
           7       1.00      0.03      0.06        70
           8       0.86      

When you tune the binary classifier to maximize recall, it tends to pass many samples as "in-topic" even if they're actually out-of-topic. This increases the number of false positives that the multiclass classifier has to process. As a result, the multiclass stage ends up receiving a mix of genuine in-topic samples and many false positives, which can confuse it and lead to poorer performance overall (lower precision and lower F1 scores for the in-topic classes).

In this case, the input to the downstream task (multiclass classification) is noisy. Maybe a boosting method is a more robust choice than an SVM?

In [19]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

############################################
# Load Data
############################################
# Read the train and test splits in one pass; both files are expected to
# provide the 'sbert_embedding' column used further down.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train_sbert.csv', 'df_test_sbert.csv')
)

def parse_embedding(embedding):
    """Convert one stored embedding value into a NumPy array.

    A string is treated as a bracketed list of numbers separated by
    whitespace (including newlines) and/or commas, e.g. ``"[0.1 0.2]"`` or
    ``"[0.1, 0.2]"``. Any other value is passed to ``np.array`` unchanged.

    Args:
      embedding: A string-encoded vector or an already numeric sequence/array.

    Returns:
      A NumPy array holding the embedding values (float64 for string input).

    Raises:
      ValueError: If a token of a string embedding is not a valid float.
    """
    if isinstance(embedding, str):
        # Drop the brackets and turn commas into spaces so that both NumPy's
        # print format and a Python list repr split into clean number tokens.
        cleaned = embedding.replace("[", "").replace("]", "").replace(",", " ")
        return np.array([float(token) for token in cleaned.split()])
    return np.array(embedding)  # Already numeric: just wrap it

# Convert all embeddings properly
# (string cells are parsed into float arrays by parse_embedding; other values are wrapped with np.array)
df_train["sbert_embedding"] = df_train["sbert_embedding"].apply(parse_embedding)
df_test["sbert_embedding"] = df_test["sbert_embedding"].apply(parse_embedding)

############################################
# Stage 1: Train Binary Classifier (Logistic Regression)
############################################
# Each row's vector is cast to float32 and the rows are stacked into one 2-D matrix per split.
X_train_bin = np.vstack(df_train["sbert_embedding"].apply(lambda x: np.array(x, dtype=np.float32)))
X_test_bin  = np.vstack(df_test["sbert_embedding"].apply(lambda x: np.array(x, dtype=np.float32)))
# Binary targets come straight from the 'label' column (1 is treated as in-topic below).
y_train_bin = df_train["label"].values
y_test_bin  = df_test["label"].values

# Train Logistic Regression for binary classification
# class_weight="balanced" reweights the classes inversely to their frequency in y_train_bin.
binary_model = LogisticRegression(C=1, max_iter=1000, class_weight="balanced")
binary_model.fit(X_train_bin, y_train_bin)
print("✅ Binary classifier (Logistic Regression) trained.")

def binary_predict(X):
    """Label each row of X with the logistic-regression binary model (1: in-topic, 0: out-of-topic)."""
    predicted_labels = binary_model.predict(X)
    return predicted_labels

############################################
# Stage 2: Prepare Multiclass Dataset (Only True & False Positives)
############################################
# Run the binary stage on both splits; only rows it flags as in-topic are
# passed on to the multiclass stage.
binary_preds_train = binary_predict(X_train_bin)
binary_preds_test  = binary_predict(X_test_bin)

# Get indices of in-topic predictions (true or false positives)
train_in_topic_indices = np.where(binary_preds_train == 1)[0]
test_in_topic_indices  = np.where(binary_preds_test == 1)[0]

print(f"🔹 Train: {len(train_in_topic_indices)} in-topic predictions")
print(f"🔹 Test: {len(test_in_topic_indices)} in-topic predictions")

# Subset both frames to the rows predicted in-topic (positional indexing)
df_train_multi = df_train.iloc[train_in_topic_indices].copy()
df_test_multi  = df_test.iloc[test_in_topic_indices].copy()

# Set false positives to "0" in topic_id.
# Vectorised instead of a row-wise DataFrame.apply: same values, much faster,
# and it also works when a subset happens to be empty.
df_train_multi['topic_id'] = np.where(
    df_train_multi['label'].to_numpy() == 1, df_train_multi['topic_id'].to_numpy(), 0.0
)
df_test_multi['topic_id'] = np.where(
    df_test_multi['label'].to_numpy() == 1, df_test_multi['topic_id'].to_numpy(), 0.0
)

# Embeddings: reuse the rows of the float32 matrices built for the binary
# stage rather than re-stacking the per-row arrays a second time.
X_train_multi = X_train_bin[train_in_topic_indices]
X_test_multi  = X_test_bin[test_in_topic_indices]

# Map topic_id to contiguous integers (np.unique already returns sorted values)
unique_labels = np.unique(df_train_multi["topic_id"])
topic_mapping = {label: idx for idx, label in enumerate(unique_labels)}
y_train_multi = np.array([topic_mapping[label] for label in df_train_multi["topic_id"]])
# A test topic_id that never occurs in the training subset raises KeyError here.
y_test_multi  = np.array([topic_mapping[label] for label in df_test_multi["topic_id"]])
num_classes = len(unique_labels)

print("Multiclass: Unique labels:", unique_labels)
print("Number of classes:", num_classes)

############################################
# Stage 3: Train Multiclass Classifier (XGBoost)
############################################
# Use best params from previous tuning
# NOTE(review): the tuning run itself is not shown in this file; learning_rate, max_depth and
# n_estimators match the binary XGBoost settings used earlier — confirm they were tuned for this task.
multi_model = XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    eval_metric='mlogloss',
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
# Unlike the SVM experiments, the model is fitted on all of X_train_multi (no 20% subsample).
multi_model.fit(X_train_multi, y_train_multi)
print("✅ Multiclass XGBoost model trained.")

# Evaluate on full test set
# (X_test_multi holds only the test rows that the binary stage predicted as in-topic.)
y_pred_multi = multi_model.predict(X_test_multi)
print("Multiclass classification report:")
print(classification_report(y_test_multi, y_pred_multi))
print("Multiclass confusion matrix:")
print(confusion_matrix(y_test_multi, y_pred_multi))

#########################################
# Step 4: Hierarchical Prediction
#########################################
def hierarchical_predict(X):
    """
    Hierarchical prediction:
      1. Use the logistic-regression binary model (via ``binary_predict``) to
         decide if an instance is in-topic (1) or out-of-topic (0).
      2. For in-topic instances, use the XGBoost multiclass model
         (``multi_model``) to predict a topic.
      3. For out-of-topic instances, assign "out-of-topic".

    This cell trains ``binary_model`` and ``multi_model`` only;
    ``binary_ensemble_predict`` and ``best_svm`` are not defined here and, if
    still present from earlier runs, are different models than the ones
    evaluated above.

    All in-topic rows are sent to the multiclass model in a single batched
    call instead of one ``predict`` call per row.

    Args:
      X: 2-D array of input embeddings (the same features are used by the
         binary and the multiclass stage).

    Returns:
      final_preds: Object-dtype array with one entry per row of X, holding
        either the model's integer class index or the string "out-of-topic".
        Object dtype keeps the integers as integers; a plain ``np.array`` over
        mixed values would turn every entry into a string.

    Note:
      The integers are the contiguous class indices produced by
      ``topic_mapping``, not the original ``topic_id`` values. The class that
      stands for ``topic_id == 0.0`` (binary false positives in the training
      data) is returned as its index, not as "out-of-topic".
    """
    X = np.asarray(X)
    binary_preds = np.asarray(binary_predict(X))

    # Start from "everything is out-of-topic" and overwrite the in-topic rows.
    final_preds = np.full(len(binary_preds), "out-of-topic", dtype=object)
    in_topic_mask = binary_preds == 1
    if in_topic_mask.any():  # skip the multiclass call when nothing is in-topic
        final_preds[in_topic_mask] = multi_model.predict(X[in_topic_mask])
    return final_preds

# Run the two-stage pipeline on every test embedding and show the first ten results.
final_predictions = hierarchical_predict(X_test_bin)
print("Hierarchical predictions sample:", final_predictions[:10])


✅ Binary classifier (Logistic Regression) trained.
🔹 Train: 23818 in-topic predictions
🔹 Test: 5997 in-topic predictions
Multiclass: Unique labels: [  0. 543. 544. 546. 547. 550. 552. 554. 556. 600. 602.]
Number of classes: 11
✅ Multiclass XGBoost model trained.
Multiclass classification report:
              precision    recall  f1-score   support

           0       0.59      0.84      0.69      2530
           1       0.58      0.54      0.56       559
           2       0.38      0.19      0.25       513
           3       0.67      0.48      0.56       492
           4       0.85      0.53      0.65       413
           5       0.61      0.48      0.54       501
           6       0.73      0.12      0.21        67
           7       0.42      0.07      0.12        70
           8       0.50      0.15      0.23        53
           9       0.80      0.55      0.65       207
          10       0.72      0.57      0.64       592

    accuracy                           0.61      5997

In your setup, the SVM for the multiclass stage appears to perform better than XGBoost. The SVM yields higher macro F1 scores and overall better classification performance on the in-topic classes, making it a preferable choice in the hierarchical pipeline.

If the binary classifier is already tuned for high recall (even if it passes on more noise), using SVM for the multiclass stage seems to be a better option to mitigate that noise compared to XGBoost.