In [2]:
import pandas as pd
# Load the pre-split train/test frames. Later cells use the columns
# 'processed_text', 'label', 'topic_id' and 'sbert_embedding'.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train_sbert.csv', 'df_test_sbert.csv')
)

In [None]:
pip install sentence_transformers

In [3]:
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report

# ---- Step 1: Load Pretrained SBERT ----
# Checkpoint id of the sentence-transformers (SBERT) MiniLM model. The same
# name is reused further down to load the classification models.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# ---- Step 2: Prepare the Data for Hugging Face Trainer ----
def preprocess_function(examples):
    """Tokenize one batch of examples.

    Reads ``examples["processed_text"]``, converts every entry with ``str()``
    (presumably to guard against non-string values), and returns the
    tokenizer output padded and truncated to exactly 128 tokens.
    """
    texts = list(map(str, examples["processed_text"]))
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)


# Build Hugging Face Datasets straight from the DataFrames and tokenize them batch-wise
train_dataset = Dataset.from_pandas(df_train).map(preprocess_function, batched=True)
test_dataset = Dataset.from_pandas(df_test).map(preprocess_function, batched=True)

# Return PyTorch tensors, restricted to the three columns listed here
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# ---- Step 3: Load SBERT with a Classification Head ----
# Two output labels -> binary classifier on the same checkpoint as the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# ---- Step 4: Training Arguments ----
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results_binary_sbert",
    logging_dir="./logs_binary_sbert",
    logging_steps=10,
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # selected by the "accuracy" metric returned from compute_metrics
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# ---- Step 5: Define Evaluation Metrics ----
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the arg-max over the last logits axis. Returns ``{"accuracy": value}``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(references, predicted_classes)}

# ---- Step 6: Initialize Trainer and Fine-Tune SBERT ----
# NOTE(review): eval_dataset is the test split and the training arguments set
# load_best_model_at_end=True, so the checkpoint is selected on the same data
# that is reported below — consider a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune SBERT for binary classification
trainer.train()

# ---- Step 7: Evaluate SBERT Binary Model ----
# evaluate() without arguments scores the eval_dataset passed to the Trainer
eval_results = trainer.evaluate()
print("SBERT Binary Classification Evaluation Results:", eval_results)



Map:   0%|          | 0/88911 [00:00<?, ? examples/s]

Map:   0%|          | 0/22228 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2068,0.142347,0.943675
2,0.1311,0.129338,0.948039
3,0.0405,0.146082,0.949793


SBERT Binary Classification Evaluation Results: {'eval_loss': 0.146081805229187, 'eval_accuracy': 0.9497930538060104, 'eval_runtime': 57.6213, 'eval_samples_per_second': 385.76, 'eval_steps_per_second': 12.062, 'epoch': 3.0}


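**Overfitting here: training loss keeps falling (0.21 → 0.04) while validation loss rises in epoch 3 (0.129 → 0.146). The optimization hyperparameters should be tuned.**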

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Run the fine-tuned binary model over the test split. The output carries the
# raw logits (.predictions) and the reference labels (.label_ids).
predictions_output = trainer.predict(test_dataset)
true_labels = predictions_output.label_ids
# Arg-max over the class axis turns the logits into hard label predictions
predicted_labels = predictions_output.predictions.argmax(axis=1)

print("Classification Report:", classification_report(true_labels, predicted_labels), sep="\n")
print("Confusion Matrix:", confusion_matrix(true_labels, predicted_labels), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     18301
           1       0.83      0.90      0.86      3927

    accuracy                           0.95     22228
   macro avg       0.90      0.93      0.92     22228
weighted avg       0.95      0.95      0.95     22228

Confusion Matrix:
[[17582   719]
 [  397  3530]]


In [4]:
import numpy as np
import pandas as pd

# ---- Step 1: Extract Predicted Positives from Binary Classifier ----
from datasets import Dataset  # also imported in the binary-training cell; re-importing is harmless

# Sentinel "topic" given to binary false positives (predicted in-topic, true label 0).
# NOTE(review): assumes no genuine topic_id equals 0.0 — confirm against the data.
OUT_OF_TOPIC_ID = 0.0


def build_downstream_frame(df, binary_preds):
    """Select the rows the binary classifier predicted as in-topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. Must contain the columns 'label' and 'topic_id' and be
        in the same row order as the dataset the predictions were made on.
    binary_preds : array-like
        One 0/1 prediction per row of ``df``.

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        Positional indices of the predicted positives, and a copy of those
        rows with an added 'downstream_topic' column: the original
        'topic_id' where the true 'label' is 1, OUT_OF_TOPIC_ID otherwise.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows.
    """
    binary_preds = np.asarray(binary_preds)
    if len(binary_preds) != len(df):
        raise ValueError(
            f"Got {len(binary_preds)} predictions for {len(df)} rows; "
            "predictions must align one-to-one with the DataFrame."
        )
    positive_idx = np.where(binary_preds == 1)[0]
    frame = df.iloc[positive_idx].copy()
    # Vectorised replacement for a row-wise apply. Unlike apply(axis=1) it also
    # yields a proper (empty) column when no row was predicted positive.
    frame['downstream_topic'] = frame['topic_id'].where(frame['label'] == 1, OUT_OF_TOPIC_ID)
    return positive_idx, frame


# Not used anywhere in the visible notebook; kept so existing references keep working.
X_train_bin = df_train["sbert_embedding"]
y_train_bin = df_train["label"]

# Binary predictions on the training set, using the Trainer fitted in the upstream cell.
# NOTE(review): these predictions are made on the data the binary model was trained on,
# so false positives are presumably rarer here than on unseen data, and the out-of-topic
# class may be under-represented relative to the downstream test set — confirm.
binary_preds_output = trainer.predict(train_dataset)
binary_preds_train = np.argmax(binary_preds_output.predictions, axis=1)  # array of 0s and 1s

# ---- Step 2: Build a New Training Set for the Downstream Classifier ----
# Keep only the rows predicted in-topic. True positives keep their 'topic_id';
# false positives get OUT_OF_TOPIC_ID as their 'downstream_topic'.
positive_indices, df_downstream = build_downstream_frame(df_train, binary_preds_train)

print("Number of samples predicted as in-topic:", len(positive_indices))
print("Downstream training set shape:", df_downstream.shape)
print("Value counts for downstream topics:")
print(df_downstream['downstream_topic'].value_counts())

# ---- Step 3: Prepare Labels for Downstream Fine-Tuning ----
# Remap 'downstream_topic' to contiguous integers (sorted order, so the
# OUT_OF_TOPIC_ID sentinel gets index 0 when it is the smallest value).
unique_topics = np.sort(df_downstream['downstream_topic'].unique())
topic_mapping = {topic: idx for idx, topic in enumerate(unique_topics)}
df_downstream['mapped_topic'] = df_downstream['downstream_topic'].map(topic_mapping)

print("Unique downstream topics mapping:", topic_mapping)

# Hugging Face Dataset for the downstream training set.
downstream_dataset = Dataset.from_pandas(df_downstream)

# ---- Step 4: Build the Downstream Test Set the Same Way ----
binary_predictions_output = trainer.predict(test_dataset)  # 'trainer' is the binary model Trainer
binary_preds_test = np.argmax(binary_predictions_output.predictions, axis=1)

positive_indices_test, df_downstream_test = build_downstream_frame(df_test, binary_preds_test)
print("Number of test samples predicted as in-topic:", len(positive_indices_test))

# Reuse the mapping learned on the training subset. A test topic that is absent
# from it would become a NaN label, so fail loudly here instead of later in training.
mapped_test = df_downstream_test['downstream_topic'].map(topic_mapping)
unmapped = mapped_test.isna()
if unmapped.any():
    unseen_topics = df_downstream_test.loc[unmapped, 'downstream_topic'].unique().tolist()
    raise ValueError(
        f"{int(unmapped.sum())} downstream test rows have topics missing from "
        f"topic_mapping: {unseen_topics}"
    )
df_downstream_test['mapped_topic'] = mapped_test.astype('int64')

# Hugging Face Dataset for the downstream test set.
downstream_dataset_test = Dataset.from_pandas(df_downstream_test)


Number of samples predicted as in-topic: 16523
Downstream training set shape: (16523, 14)
Value counts for downstream topics:
downstream_topic
602.0    2646
543.0    2331
546.0    2261
544.0    2149
550.0    2085
0.0      1818
547.0    1498
600.0     875
554.0     367
556.0     255
552.0     238
Name: count, dtype: int64
Unique downstream topics mapping: {0.0: 0, 543.0: 1, 544.0: 2, 546.0: 3, 547.0: 4, 550.0: 5, 552.0: 6, 554.0: 7, 556.0: 8, 600.0: 9, 602.0: 10}


Number of test samples predicted as in-topic: 4249


In [5]:
# ---- Step 1: Convert Multiclass Data to Hugging Face Dataset ----
# Build each split in one go: DataFrame -> Dataset -> tokenized Dataset
downstream_dataset = Dataset.from_pandas(df_downstream).map(preprocess_function, batched=True)
downstream_test_dataset = Dataset.from_pandas(df_downstream_test).map(preprocess_function, batched=True)

# Return PyTorch tensors for the model inputs and the mapped topic label
downstream_dataset.set_format("torch", columns=["input_ids", "attention_mask", "mapped_topic"])
downstream_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "mapped_topic"])

# Hugging Face Trainer compatibility: the target column is renamed to "labels"
downstream_dataset = downstream_dataset.rename_column("mapped_topic", "labels")
downstream_test_dataset = downstream_test_dataset.rename_column("mapped_topic", "labels")

# ---- Step 2: Load SBERT with Classification Head ----
# One output unit per distinct mapped topic in the downstream training frame
num_classes = np.unique(df_downstream["mapped_topic"].to_numpy()).size
model_downstream = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# ---- Step 3: Training Arguments for Multiclass ----
training_args_multi = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results_multiclass_sbert",
    logging_dir="./logs_multiclass_sbert",
    logging_steps=10,
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # selected by the "accuracy" metric returned from compute_metrics
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# ---- Step 4: Initialize Trainer and Fine-Tune SBERT for Multiclass ----
trainer_multi = Trainer(
    args=training_args_multi,
    model=model_downstream,
    compute_metrics=compute_metrics,
    train_dataset=downstream_dataset,
    eval_dataset=downstream_test_dataset,
)

# Fine-tune SBERT for multiclass classification
trainer_multi.train()

# ---- Step 5: Evaluate SBERT Multiclass Model ----
# evaluate() without arguments scores the eval_dataset passed to the Trainer
eval_results_multi = trainer_multi.evaluate()
print("SBERT Multiclass Classification Evaluation Results:", eval_results_multi)


Map:   0%|          | 0/16523 [00:00<?, ? examples/s]

Map:   0%|          | 0/4249 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8101,0.888143,0.728877
2,0.6899,0.774424,0.747705
3,0.6186,0.765631,0.749823


SBERT Multiclass Classification Evaluation Results: {'eval_loss': 0.7656312584877014, 'eval_accuracy': 0.749823487879501, 'eval_runtime': 9.9046, 'eval_samples_per_second': 428.991, 'eval_steps_per_second': 13.428, 'epoch': 3.0}


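**Does not seem to have converged yet (validation loss is still decreasing after 3 epochs: 0.888 → 0.774 → 0.766); the optimization hyperparameters need tuning here as well.**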

In [7]:
# Score the downstream (multiclass) model on the downstream test split.
# Labels in the report are the contiguous 'mapped_topic' indices, not raw topic ids.
predictions_output = trainer_multi.predict(downstream_test_dataset)
true_labels = predictions_output.label_ids
# Arg-max over the class axis turns the logits into predicted class indices
predicted_labels = predictions_output.predictions.argmax(axis=1)

print("Downstream Classification Report:", classification_report(true_labels, predicted_labels), sep="\n")
print("Downstream Confusion Matrix:", confusion_matrix(true_labels, predicted_labels), sep="\n")

Downstream Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.04      0.08       719
           1       0.75      0.93      0.83       555
           2       0.66      0.56      0.61       489
           3       0.76      0.94      0.84       529
           4       0.89      0.96      0.92       400
           5       0.68      0.92      0.78       515
           6       0.75      0.79      0.77        53
           7       0.72      0.92      0.81        76
           8       0.81      1.00      0.89        46
           9       0.86      0.98      0.92       213
          10       0.75      0.98      0.85       654

    accuracy                           0.75      4249
   macro avg       0.74      0.82      0.76      4249
weighted avg       0.71      0.75      0.69      4249

Downstream Confusion Matrix:
[[ 32  67 103 118  32 127  10  22   8  30 170]
 [  8 515   8   1   4   2   0   0   0   1  16]
 [ 15  90 274  22   7  70   0 

SBERT seems to underperform BERT here. Maybe a different embedding model should be chosen for each task? However, BERT's advantage is marginal and SBERT is much faster. SBERT also seems to suffer more from the untuned choice of optimization hyperparameters, so a bit of tuning may make it work better.

# 📌 Possible Improvements for Model Training

## 🔹 1. Binary Classification Model (Upstream)

The binary classifier achieves **95% accuracy**, but signs of **overfitting** appear in the final epoch. The focus should be on improving **generalization** and **stability**.

### ✅ Suggested Improvements:
- **Early Stopping**  
  - Stop training when validation loss stops improving to prevent overfitting.  
  - Can be implemented using `EarlyStoppingCallback`.  

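- **Lower Learning Rate in Later Epochs**  
  - Reduce the learning rate **after epoch 2** to slow down overfitting.  
  - Note that `Trainer` already applies a linear decay schedule by default (`lr_scheduler_type="linear"`), so in practice this means lowering the initial `learning_rate` or adding warmup (`warmup_ratio`) in `TrainingArguments`.  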

- **Regularization (Dropout & Weight Decay)**  
  - Tune the **dropout (0.1-0.3)** in front of the classifier head, e.g. via the model config's `classifier_dropout`.  
  - Apply **weight decay** (`weight_decay=0.01` in `TrainingArguments`).  

- **Adjust Decision Threshold for Upstream Classification**  
  - Instead of the default **0.5 threshold**, increase it to **0.6 or 0.7** to reduce false positives.  

---

## 🔹 2. Multi-Class Classification Model (Downstream)

The multi-class classifier reaches **75% accuracy**, and validation loss is **still decreasing, though slowly**, after 3 epochs (0.888 → 0.774 → 0.766), suggesting that improvements can be made in **training strategy, data filtering, and model complexity**.

### ✅ Suggested Improvements:
- **Extend Training with Lower Learning Rate**  
  - Since validation loss is still decreasing, increase training to **5 epochs** while lowering the learning rate.  

- **Class-Balanced Loss Function**  
  - Some classes are underrepresented, leading to imbalance.  
  - Use **weighted cross-entropy loss** to address this.  

- **Filter Low-Confidence Samples from Upstream Model**  
  - Some false positives from the binary classifier introduce noise.  
  - Remove samples with low upstream confidence scores (**probability < 0.7**).  

- **Data Augmentation for Rare Classes**  
  - Use **paraphrasing techniques** (e.g., `nlpaug`) to generate more examples for underrepresented topics.  

- **Increase Model Complexity**  
  - Instead of using a single-layer MLP, explore **multi-layer MLPs or attention-based classifiers** for better generalization.  

---

## 🚀 Next Steps
- ✅ Implement **early stopping & regularization** for the binary model.  
- ✅ Introduce **confidence-based filtering & class weighting** for the multi-class model.  
- ✅ Experiment with **extended training & alternative classifiers**.  

---
