# SafeAeroBERT — All‑Factor Multi‑Label Classifier ⚙️✈️

This Colab notebook fine‑tunes **SafeAeroBERT** with a single LoRA adapter that predicts **all four contributing‑factor labels** (*aircraft, procedure, weather, human‑factors*) in one forward pass.

*Steps*  
1. Install libraries (Transformers, Datasets, PEFT, evaluate, W&B)  
2. Load & prep NASA ASRS dataset (multi‑hot labels)  
3. Build LoRA‑wrapped `AutoModelForSequenceClassification` with `num_labels=4`  
4. Train for a few epochs, validating every 200 steps  
5. Evaluate on a held‑out test split and print metrics in the final cell

In [None]:
!pip -q install --upgrade evaluate datasets transformers peft wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m103.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.3/472.3 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from pathlib import Path
import os, random, json
import numpy as np, pandas as pd, torch

# Reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

print("🌱 Seeds set; using", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

🌱 Seeds set; using NVIDIA A100-SXM4-40GB


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ─── 1. Load & rename ─────────────────────────────────────────────────────────
DATA_PATH = '/content/drive/MyDrive/2025 REU/NASA ASRS BIG CSV/NASA_ASRS.csv'
assert os.path.exists(DATA_PATH)

df = pd.read_csv(DATA_PATH)

df = df.rename(columns={
    'Reports': 'text',
    'Primary Factors': 'primary_factor'
})

# ─── 2. Keep only the four top-level labels ───────────────────────────────────
label_map = {
    'Aircraft':       'aircraft',
    'Procedure':      'procedure',
    'Weather':        'weather',
    'Human Factors':  'human_factors'
}

df = df[df['primary_factor'].isin(label_map.keys())].copy()
df['primary_factor'] = df['primary_factor'].map(label_map)

# ─── 3. One-hot encode ────────────────────────────────────────────────────────
label_cols = ["aircraft", "procedure", "weather", "human_factors"]

# zero-fill then set 1 where the row’s factor matches
for col in label_cols:
    df[col] = (df['primary_factor'] == col).astype(int)

# ─── 4. House-keeping & sanity check ──────────────────────────────────────────
df = df.dropna(subset=['text']).reset_index(drop=True)

print("Dataset shape:", df.shape)
print("Positive counts per label:")
print(df[label_cols].sum())

Dataset shape: (185368, 9)
Positive counts per label:
aircraft          47888
procedure         25084
weather            8854
human_factors    103542
dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED, stratify=df[label_cols])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=RANDOM_SEED, stratify=temp_df[label_cols])

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train: 129757, Val: 27805, Test: 27806


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
import evaluate

BASE_MODEL = "NASA-AIML/MIKA_SafeAeroBERT"
tok = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize(batch):
    return tok(batch["text"], truncation=True, padding="max_length")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def build_lora_model():
    base = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=4)
    config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["query", "value"],
        lora_dropout=0.05,
        bias="none"
    )
    return get_peft_model(base, config)

model = build_lora_model()
model.config.problem_type = "multi_label_classification" # Tell HF we're doing multi-label, not multi-class
print(model.print_trainable_parameters())

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NASA-AIML/MIKA_SafeAeroBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 294,912 || all params: 109,780,228 || trainable%: 0.2686
None


In [None]:
train_ds = Dataset.from_pandas(train_df[["text"] + label_cols]).map(tokenize, batched=True)
val_ds   = Dataset.from_pandas(val_df[["text"] + label_cols]).map(tokenize, batched=True)
test_ds  = Dataset.from_pandas(test_df[["text"] + label_cols]).map(tokenize, batched=True)

def to_labels(example):
    example["labels"] = [float(example[c]) for c in label_cols]     # cast each 0/1 to float so Trainer -> collator -> torch.tensor(..., dtype=torch.float32)
    return example


train_ds = train_ds.map(to_labels, remove_columns=["text"] + label_cols)
val_ds   = val_ds.map(to_labels, remove_columns=["text"] + label_cols)
test_ds  = test_ds.map(to_labels, remove_columns=["text"] + label_cols)

print(train_ds)

Map:   0%|          | 0/129757 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/27805 [00:00<?, ? examples/s]

Map:   0%|          | 0/27806 [00:00<?, ? examples/s]

Map:   0%|          | 0/129757 [00:00<?, ? examples/s]

Map:   0%|          | 0/27805 [00:00<?, ? examples/s]

Map:   0%|          | 0/27806 [00:00<?, ? examples/s]

Dataset({
    features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 129757
})


In [None]:
import torch

metric_f1 = evaluate.load("f1", average="weighted")
metric_acc = evaluate.load("accuracy")

from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs  = torch.sigmoid(torch.tensor(logits))
    y_pred = (probs > 0.5).int().numpy()
    y_true = labels

    # “subset” accuracy = exact match of all 4 labels
    subset_acc = (y_pred == y_true).all(axis=1).mean()

    # micro / macro F1
    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)

    return {
        "subset_accuracy": subset_acc,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:

# --- Weights & Biases setup and epoch‑level checkpointing ---
import datetime, os, wandb
from transformers import TrainerCallback

# Log in to W&B (assumes environment variable WANDB_API_KEY is set, otherwise anonymous logging)
try:
    wandb.login()
except wandb.errors.UsageError:
    wandb.login(anonymous="allow")

run = wandb.init(project="SafeAeroBERT_AllFactor",
                 name=f"run-{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}",
                 reinit=True)

# Directory where we'll store local checkpoints
os.makedirs("checkpoints", exist_ok=True)

class WandbModelSaverCallback(TrainerCallback):
    """Save model weights at the end of **every** epoch and log them to Weights & Biases."""
    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        # Grab the integer epoch number
        epoch = int(state.epoch)
        # Save model locally
        save_dir = f"checkpoints/epoch_{epoch}"
        model.save_pretrained(save_dir)

        # Log as a W&B Artifact so you can retrieve it later
        artifact = wandb.Artifact(name=f"SafeAeroBERT-epoch-{epoch}", type="model")
        artifact.add_dir(save_dir)
        run.log_artifact(artifact)
        print(f"✅ Saved and logged model for epoch {epoch} → {save_dir}")
        return control


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mar103789[0m ([33mar103789-worcester-polytechnic-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
args = TrainingArguments(
    output_dir="checkpoints",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="epoch",
    logging_steps=100,
    weight_decay=0.01,
    seed=RANDOM_SEED,
    report_to="wandb",
    label_names=["labels"]

)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[WandbModelSaverCallback],
)

In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Subset Accuracy,F1 Micro,F1 Macro
200,0.5554,0.4909,0.55285,0.55612,0.178959
400,0.4718,0.462034,0.558137,0.558697,0.179404
600,0.4699,0.457241,0.555943,0.559213,0.179759
800,0.4623,0.453095,0.556339,0.559878,0.179996
1000,0.466,0.448103,0.514188,0.573129,0.190093
1200,0.4406,0.436326,0.529042,0.57858,0.191178
1400,0.4298,0.419669,0.521237,0.60112,0.256731
1600,0.422,0.407067,0.563891,0.62511,0.291615
1800,0.4069,0.397445,0.610897,0.663619,0.355708
2000,0.4141,0.391229,0.619493,0.669803,0.358376


[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/epoch_1)... Done. 0.0s


✅ Saved and logged model for epoch 1 → checkpoints/epoch_1


[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/epoch_2)... Done. 0.0s


✅ Saved and logged model for epoch 2 → checkpoints/epoch_2


[34m[1mwandb[0m: Adding directory to artifact (./checkpoints/epoch_3)... Done. 0.0s


✅ Saved and logged model for epoch 3 → checkpoints/epoch_3


TrainOutput(global_step=48660, training_loss=0.33068271520154446, metrics={'train_runtime': 58939.9652, 'train_samples_per_second': 6.605, 'train_steps_per_second': 0.826, 'total_flos': 1.0277601055082496e+17, 'train_loss': 0.33068271520154446, 'epoch': 3.0})

In [None]:
test_metrics = trainer.evaluate(test_ds)
print("\n✅ Test metrics (multi‑label):")
for k, v in test_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")


✅ Test metrics (multi‑label):
eval_loss: 0.2988
eval_subset_accuracy: 0.7239
eval_f1_micro: 0.7627
eval_f1_macro: 0.6152
eval_runtime: 212.8532
eval_samples_per_second: 130.6350
eval_steps_per_second: 16.3310
epoch: 3.0000


In [None]:
# ─── Extra: per-factor breakdown on the test set ──────────────────────────────
import numpy as np, pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1) Run the (fine-tuned) model on the test set  ➜ logits & labels
pred_out      = trainer.predict(test_ds)
logits        = pred_out.predictions            # shape: (N, 4)
y_true_onehot = pred_out.label_ids              # shape: (N, 4)

# 2) Collapse one-hot → single class index
y_true_idx = y_true_onehot.argmax(axis=1)       # gold primary factor (0-3)
y_pred_idx = logits.argmax(axis=1)              # model’s top-logit factor

# 3) Compute metrics “this factor vs rest” (binary view) for each class
rows = []
for i, name in enumerate(label_cols):           # label_cols = ["aircraft", …]
    # TP / FP / FN / TN derived implicitly by sklearn
    acc  = accuracy_score(y_true_idx == i, y_pred_idx == i)
    prec = precision_score(y_true_idx, y_pred_idx, labels=[i],
                           average="macro", zero_division=0)
    rec  = recall_score   (y_true_idx, y_pred_idx, labels=[i],
                           average="macro", zero_division=0)
    f1   = f1_score       (y_true_idx, y_pred_idx, labels=[i],
                           average="macro", zero_division=0)
    rows.append([name.replace('_', ' ').title(), acc, prec, rec, f1])

# 4) Pretty-print like the paper/table
col_order = ["Accuracy", "Precision", "Recall", "F-1"]
cat_df = (pd.DataFrame(rows,
                       columns=["Contributing Factor"] + col_order)
            .set_index("Contributing Factor")
            .round(3))

print(cat_df.to_markdown())        # ↩︎ nice markdown table
cat_df                             # ↩︎ and the nicer Colab HTML view


| Contributing Factor   |   Accuracy |   Precision |   Recall |   F-1 |
|:----------------------|-----------:|------------:|---------:|------:|
| Aircraft              |      0.89  |       0.782 |    0.799 | 0.79  |
| Procedure             |      0.881 |       0.579 |    0.432 | 0.495 |
| Weather               |      0.958 |       0.603 |    0.384 | 0.469 |
| Human Factors         |      0.81  |       0.805 |    0.871 | 0.837 |


Unnamed: 0_level_0,Accuracy,Precision,Recall,F-1
Contributing Factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aircraft,0.89,0.782,0.799,0.79
Procedure,0.881,0.579,0.432,0.495
Weather,0.958,0.603,0.384,0.469
Human Factors,0.81,0.805,0.871,0.837
