In [2]:
import pandas as pd
# --- Mount Google Drive so the dataset under MyDrive is readable ---
from google.colab import drive
drive.mount('/content/drive')

# --- Dataset location on Drive ---
BASE_DIR = "/content/drive/MyDrive/cbdc-type"
FILE_PATH = f"{BASE_DIR}/cbdc_type_training.csv"

# --- Read the labelled sentences ---
df = pd.read_csv(FILE_PATH)

# --- Rows per label (value_counts sorts by frequency, most common first) ---
label_counts = df["label"].value_counts()

# Share of all rows in the file held by each label, in percent (2 decimals)
total_rows = len(df)
label_share_pct = (label_counts / total_rows * 100).round(2)

print("=== Sentence Count per Label ===")
print(label_counts)

# Optional: percentage distribution
print("\n=== Percentage Distribution ===")
print(label_share_pct)


Mounted at /content/drive
=== Sentence Count per Label ===
label
General/Unspecified    545
Retail CBDC            543
Wholesale CBDC         329
Name: count, dtype: int64

=== Percentage Distribution ===
label
General/Unspecified    38.46
Retail CBDC            38.32
Wholesale CBDC         23.22
Name: count, dtype: float64


In [None]:
# ========================
# 0) Setup & Install
# ========================
# !pip -q install -U transformers datasets evaluate accelerate scikit-learn

import os, json, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import torch

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    EarlyStoppingCallback,
)

# Colab Drive: mount Google Drive at /content/drive so that the DATA_PATH and
# OUTPUT_DIR locations under MyDrive (defined in section 1 below) are reachable.
from google.colab import drive
drive.mount('/content/drive')

# ========================
# 1) Paths & Params
# ========================
# Input CSV and the directory that receives checkpoints, the final model and reports
DATA_PATH  = "/content/drive/MyDrive/cbdc-type/cbdc_type_training.csv"
OUTPUT_DIR = "/content/drive/MyDrive/cbdc-type/cb-bert-cbdc-type"

# Base checkpoint that gets fine-tuned
MODEL_NAME = "bilalzafar/CentralBank-BERT"   # domain-adapted BERT

# Reproducibility and tokenization
SEED    = 42
MAX_LEN = 192   # max tokens per sentence; longer inputs are truncated by tok_fn

# Optimisation hyperparameters
EPOCHS      = 5
BATCH_TRAIN = 8
BATCH_EVAL  = 16
LR          = 2e-5
WEIGHT_DEC  = 0.01
WARMUP      = 0.1   # handed to TrainingArguments as warmup_ratio

# Label schema: a label's position in label_list is its integer class id
label_list = ["Retail CBDC", "Wholesale CBDC", "General/Unspecified"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Reproducibility
def set_seed(seed=SEED):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed; defaults to the notebook-wide SEED.

    CUDA generators are seeded as well, but only when a CUDA device is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed()

# ========================
# 2) Load & Split
# ========================
df = pd.read_csv(DATA_PATH)

# basic cleaning: keep the two columns used below and drop rows missing either one
df = df[["sentence", "label"]].dropna().copy()
# Vectorised strip. astype(str) guards against non-string label cells (e.g. a
# numeric code), on which a plain `x.strip()` would raise AttributeError.
df["label"] = df["label"].astype(str).str.strip()

# keep only expected labels -- and say so when rows are discarded, so typos in
# the label column do not silently shrink the dataset
known_label_mask = df["label"].isin(label_list)
n_unexpected = int((~known_label_mask).sum())
if n_unexpected:
    print(f"WARNING: dropping {n_unexpected} row(s) with unexpected labels:")
    print(df.loc[~known_label_mask, "label"].value_counts())
df = df[known_label_mask].reset_index(drop=True)
if df.empty:
    raise ValueError(
        f"No rows in {DATA_PATH} carry one of the expected labels {label_list}"
    )

# map labels to ids
df["labels"] = df["label"].map(label2id)

# stratified 80/10/10 split on label ids:
# first 80% train vs 20% held out, then the held-out part is halved into val/test
train_df, temp_df = train_test_split(
    df, test_size=0.20, stratify=df["labels"], random_state=SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["labels"], random_state=SEED
)

print("Counts by label (overall):")
print(df["label"].value_counts())
print("\nSplit sizes:", len(train_df), len(val_df), len(test_df))

# ========================
# 3) Hugging Face Datasets
# ========================
# One Dataset per split, each carrying only the text and the integer label id
split_frames = {"train": train_df, "validation": val_df, "test": test_df}
ds = DatasetDict({
    split_name: Dataset.from_pandas(frame[["sentence", "labels"]])
    for split_name, frame in split_frames.items()
})

# ========================
# 4) Tokenizer & Tokenization
# ========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def tok_fn(batch):
    """Tokenize the "sentence" column of a batch.

    Sequences are truncated to MAX_LEN tokens and left unpadded; padding is
    applied per batch by the data collator created below.
    """
    encoded = tokenizer(
        batch["sentence"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )
    return encoded


# Apply the tokenizer to every split in batched mode
tokenized = ds.map(tok_fn, batched=True)

# Pads each batch at collation time, rounding the padded length up to a multiple of 8
collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

# ========================
# 5) Class Weights (from TRAIN ONLY)
# ========================
# Per-class example counts in the training split, indexed by class id;
# ids with no training example are filled in with a count of 0.
train_counts = train_df["labels"].value_counts().reindex(range(len(label_list)), fill_value=0)
num_classes = len(label_list)
N = len(train_df)

# A zero count would make the inverse-frequency formula divide by zero and
# yield an infinite weight, so such classes get a neutral weight instead.
absent_ids = [i for i, cnt in train_counts.items() if cnt == 0]
if absent_ids:
    print(
        f"WARNING: no training examples for class id(s) {absent_ids}; "
        "using a neutral weight of 1.0 for them."
    )

# Inverse frequency: total/(num_classes * count)
weights = {
    i: (N / (num_classes * cnt)) if cnt > 0 else 1.0
    for i, cnt in train_counts.items()
}
class_weights_tensor = torch.tensor([weights[i] for i in range(num_classes)], dtype=torch.float)

print("\nClass weights (by id):")
for i in range(num_classes):
    print(f"  {i} ({id2label[i]}): {class_weights_tensor[i]:.3f}")

# ========================
# 6) Model & Config
# ========================
# Attach the label schema to the checkpoint's config so the saved model
# carries the id <-> label-name mapping.
label_schema_kwargs = dict(
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)
config = AutoConfig.from_pretrained(MODEL_NAME, **label_schema_kwargs)

# Sequence-classification model built from the checkpoint with the config above
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# ========================
# 7) Metrics
# ========================
def compute_metrics(eval_pred):
    """Compute accuracy and macro / weighted F1 for one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        dict with the keys "accuracy", "f1_macro" and "f1_weighted".
    """
    logits, gold_ids = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(gold_ids, predicted_ids),
        "f1_macro": f1_score(gold_ids, predicted_ids, average="macro"),
        "f1_weighted": f1_score(gold_ids, predicted_ids, average="weighted"),
    }

# ========================
# 8) TrainingArguments
# ========================
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    gradient_accumulation_steps=1,
    learning_rate=LR,
    weight_decay=WEIGHT_DEC,
    warmup_ratio=WARMUP,
    num_train_epochs=EPOCHS,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Model selection: reload the checkpoint with the highest validation macro-F1
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=50,
    report_to="none",
    # fp16 mixed precision is only usable on a CUDA GPU; enable it when one is
    # present instead of hard-coding True, so the cell also runs on CPU runtimes.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    seed=SEED,
)

# ========================
# 9) Weighted Trainer
# ========================
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: 1-D float tensor with one weight per class id, or None
            for an unweighted cross-entropy loss. All other positional and
            keyword arguments are forwarded to `Trainer` unchanged.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    # Accepts the new kwarg `num_items_in_batch` to be compatible with latest transformers
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without its labels and compute the weighted loss here.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; the "labels" entry is held back from the
                forward pass and used only for the loss.
            return_outputs: When True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: Accepted for signature compatibility; not used.

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits

        # The constructor allows class_weights=None, so do not call .to() on it
        # unconditionally; None simply means a plain, unweighted loss.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=torch.float32)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        # Compute the loss in float32 so the weights and logits share a dtype
        # even when the logits come out of a half-precision forward pass.
        loss = loss_fct(logits.float(), labels)
        return (loss, outputs) if return_outputs else loss

# Stop when validation macro-F1 (metric_for_best_model) has not improved by
# more than 1e-4 for 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=1e-4,
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    # data
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=collator,
    processing_class=tokenizer,  # replaces deprecated `tokenizer=` argument
    # evaluation, loss weighting and stopping rule
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor,
    callbacks=[early_stopping],
)

# ========================
# 10) Train
# ========================
train_result = trainer.train()
# Persist the trained model into OUTPUT_DIR
trainer.save_model(OUTPUT_DIR)

# Save label maps for downstream use
# (JSON object keys are strings, so the integer ids in id2label are written as "0", "1", ...)
label_maps = {"label2id": label2id, "id2label": id2label}
label_map_path = os.path.join(OUTPUT_DIR, "label_mapping.json")
with open(label_map_path, "w") as f:
    f.write(json.dumps(label_maps, indent=2))

# ========================
# 11) Evaluate on Test
# ========================
test_out = trainer.predict(tokenized["test"])
test_preds = test_out.predictions.argmax(axis=-1)
# Gold ids as returned by predict(), so they are row-aligned with test_preds
test_true = test_out.label_ids

# Pin the class order explicitly. Without `labels=`, scikit-learn infers the
# classes from the data, so a class missing from the test split would make
# target_names mismatch the inferred classes and shrink the confusion matrix.
class_ids = list(range(num_classes))

print("\nTest metrics:", test_out.metrics)
report = classification_report(
    test_true,
    test_preds,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
)
cm = confusion_matrix(test_true, test_preds, labels=class_ids)

print("\nClassification report:\n", report)
print("\nConfusion matrix (rows=true, cols=pred):\n", cm)

# Persist reports next to the saved model
with open(os.path.join(OUTPUT_DIR, "test_classification_report.txt"), "w") as f:
    f.write(report)
np.savetxt(os.path.join(OUTPUT_DIR, "test_confusion_matrix.csv"), cm, fmt="%d", delimiter=",")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Counts by label (overall):
label
General/Unspecified    545
Retail CBDC            543
Wholesale CBDC         329
Name: count, dtype: int64

Split sizes: 1133 142 142


Map:   0%|          | 0/1133 [00:00<?, ? examples/s]

Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Map:   0%|          | 0/142 [00:00<?, ? examples/s]


Class weights (by id):
  0 (Retail CBDC): 0.870
  1 (Wholesale CBDC): 1.436
  2 (General/Unspecified): 0.866


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bilalzafar/CentralBank-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.744,0.299283,0.873239,0.881949,0.872889
2,0.2769,0.281721,0.880282,0.88944,0.879349
3,0.2052,0.325982,0.887324,0.896186,0.88718
4,0.0444,0.392861,0.894366,0.904835,0.895135
5,0.0262,0.382513,0.901408,0.910438,0.901408



Test metrics: {'test_loss': 0.33563169836997986, 'test_accuracy': 0.8873239436619719, 'test_f1_macro': 0.8981249728913281, 'test_f1_weighted': 0.887306155701575, 'test_runtime': 0.5351, 'test_samples_per_second': 265.385, 'test_steps_per_second': 16.82}

Classification report:
                      precision    recall  f1-score   support

        Retail CBDC       0.86      0.87      0.86        55
     Wholesale CBDC       0.97      0.97      0.97        33
General/Unspecified       0.87      0.85      0.86        54

           accuracy                           0.89       142
          macro avg       0.90      0.90      0.90       142
       weighted avg       0.89      0.89      0.89       142


Confusion matrix (rows=true, cols=pred):
 [[48  0  7]
 [ 1 32  0]
 [ 7  1 46]]


In [3]:
from transformers import pipeline

# Directory the fine-tuned model (and its tokenizer) was saved to by the training cell
model_dir = "/content/drive/MyDrive/cbdc-type/cb-bert-cbdc-type"

# Truncate at the same length the classifier was fine-tuned with (MAX_LEN = 192
# in the training cell). The previous value of 128 would cut long inputs
# shorter at inference time than during training.
INFER_MAX_LEN = 192

clf = pipeline("text-classification", model=model_dir, tokenizer=model_dir,
               truncation=True, max_length=INFER_MAX_LEN)

# Hand-written probe sentences, three per expected class
sentences = [
    # General/Unspecified
    "CBDCs can modernize national payment systems.",
    "The central bank is exploring various CBDC models without committing to one.",
    "We will continue research on CBDCs to balance innovation and stability.",
    # Retail CBDC
    "The digital euro would be available to all citizens for everyday use.",
    "Retail CBDCs can improve financial inclusion in underserved regions.",
    "We are designing a digital currency wallet for public transactions under strict privacy rules.",
    # Wholesale CBDC
    "Wholesale CBDCs could streamline interbank settlements.",
    "Cross-border payments between central banks may use a wholesale CBDC.",
    "We are piloting a wholesale CBDC for large-value securities transactions."
]

# Classify the whole list in one pipeline call; the result holds one
# {"label", "score"} dict per input sentence, in input order.
predictions = clf(sentences)
for s, pred in zip(sentences, predictions):
    print(f"{pred['label']} {pred['score']:.4f}  | {s}")

Device set to use cuda:0


General/Unspecified 0.9972  | CBDCs can modernize national payment systems.
General/Unspecified 0.9983  | The central bank is exploring various CBDC models without committing to one.
General/Unspecified 0.9981  | We will continue research on CBDCs to balance innovation and stability.
Retail CBDC 0.9985  | The digital euro would be available to all citizens for everyday use.
Retail CBDC 0.9988  | Retail CBDCs can improve financial inclusion in underserved regions.
Retail CBDC 0.9979  | We are designing a digital currency wallet for public transactions under strict privacy rules.
Wholesale CBDC 0.9990  | Wholesale CBDCs could streamline interbank settlements.
Wholesale CBDC 0.9990  | Cross-border payments between central banks may use a wholesale CBDC.
Wholesale CBDC 0.9990  | We are piloting a wholesale CBDC for large-value securities transactions.
