In [1]:
from sklearn.metrics import fbeta_score
import pandas as pd

def calculate_micro_f2(data, label, pos_label=1):
    """
    Score one label with the F-beta measure at beta = 2.

    The ground truth is read from ``data["<label>_true"]`` and the predictions
    from ``data["<label>_pred"]``. Both are handed to ``fbeta_score`` with
    ``average="binary"`` and ``zero_division=0``.

    Args:
        data (pd.DataFrame): Frame holding the ``<label>_true`` and
            ``<label>_pred`` columns.
        label (str): Base column name; the ``_true`` / ``_pred`` suffixes are
            appended inside this function.
        pos_label (int): The value treated as the positive class (default is 1).

    Returns:
        float: F2 score for the given label.
    """
    # Look up the ground-truth column first, then the predicted one.
    y_true, y_pred = (data[f"{label}_{suffix}"] for suffix in ("true", "pred"))

    scoring_options = {
        "beta": 2.0,
        "pos_label": pos_label,
        "average": "binary",
        "zero_division": 0,
    }
    return fbeta_score(y_true, y_pred, **scoring_options)

def evaluate_micro_f2(labels, predictions):
    """
    Evaluate F2 scores for all target labels.

    ``labels`` and ``predictions`` are joined on ``id`` and ``sentence_index``;
    columns present in both frames receive the ``_true`` / ``_pred`` suffixes.
    The "Overall" score pools every cell of the seven target-label columns
    into a single binary problem, and the "Inclusion" score is the F2 of the
    ``omittable`` column with 0 as the positive class.

    Args:
        labels (pd.DataFrame): True labels for the dataset.
        predictions (pd.DataFrame): Predicted labels for the dataset.

    Returns:
        dict: Maps "Overall micro F2.0", "Inclusion micro F2.0" and
            "<Label> micro F2.0" (one entry per target label) to its score.

    Warns:
        UserWarning: If the merged frame does not have the same number of rows
            as ``labels``, i.e. some label rows were dropped or some keys were
            matched more than once.
    """
    import warnings  # function-scope import: only the coverage check needs it

    # Merge the true labels and predictions on id and sentence_index
    merged = pd.merge(labels, predictions, on=["id", "sentence_index"], suffixes=("_true", "_pred"))

    # pd.merge performs an inner join by default: label rows without a matching
    # prediction disappear, and duplicated keys multiply rows. Both change the
    # scores without raising, so surface them instead of staying silent.
    n_labels, n_merged = len(labels), len(merged)
    if n_merged < n_labels:
        warnings.warn(
            f"Merged frame has {n_merged} rows but labels has {n_labels}; "
            "label rows without a matching prediction are excluded from the scores.",
            stacklevel=2,
        )
    elif n_merged > n_labels:
        warnings.warn(
            f"Merged frame has {n_merged} rows but labels has {n_labels}; "
            "duplicated (id, sentence_index) keys are scored more than once.",
            stacklevel=2,
        )

    # Define the target labels
    target_labels = [
        "measure", "extension", "atelectasis", "satellite",
        "lymphadenopathy", "pleural", "distant"
    ]

    # Calculate Overall micro F2.0 (Measure ~ Distant): flatten all target
    # columns so every (row, label) cell counts as one binary decision.
    overall_true = merged[[f"{label}_true" for label in target_labels]].values.ravel()
    overall_pred = merged[[f"{label}_pred" for label in target_labels]].values.ravel()
    overall_f2 = fbeta_score(overall_true, overall_pred, beta=2.0, pos_label=1, average="binary", zero_division=0)

    # Calculate Inclusion micro F2.0 (Omittable with pos_label=0)
    inclusion_f2 = calculate_micro_f2(merged, "omittable", pos_label=0)

    # Initialize a dictionary to store results
    results = {"Overall micro F2.0": overall_f2, "Inclusion micro F2.0": inclusion_f2}

    # Calculate F2 scores for individual labels
    for label in target_labels:
        results[f"{label.capitalize()} micro F2.0"] = calculate_micro_f2(merged, label)

    return results

# Example usage:
# metrics = evaluate_micro_f2(labels, predictions)
# print(metrics)




In [9]:
# Read the reference labels stored under the `val` directory and the gpt-4o
# sentence classifications that are scored against them.
label = pd.read_csv('../radnlp_2024_train_val_20240731/ja/sub_task/val/label.csv')
submission = pd.read_csv('../sentence_classifications_gpt-4o-2024-05-13.csv')

# Trailing expression: the notebook displays the returned value as the cell output.
evaluate_micro_f2(label, submission)

{'Overall micro F2.0': 0.7467330429371499,
 'Inclusion micro F2.0': 0.8404255319148936,
 'Measure micro F2.0': 0.579064587973274,
 'Extension micro F2.0': 0.8547008547008546,
 'Atelectasis micro F2.0': 0.8602150537634409,
 'Satellite micro F2.0': 0.8,
 'Lymphadenopathy micro F2.0': 0.8552631578947368,
 'Pleural micro F2.0': 0.8590308370044053,
 'Distant micro F2.0': 0.6637168141592921}

In [2]:
# Read the reference labels stored under the `train` directory and the
# o1-preview sentence classifications that are scored against them.
# This rebinds the same `label` / `submission` names used by the val / gpt-4o cell.
# NOTE(review): the later `results, diff = evaluate_micro_f2(label, submission)`
# cell reads whatever these names hold when it runs — presumably the values
# assigned here; confirm with Restart & Run All.
label = pd.read_csv('../radnlp_2024_train_val_20240731/ja/sub_task/train/label.csv')
submission = pd.read_csv('../sentence_classifications_o1-preview-2024-09-12.csv')

# Trailing expression: the notebook displays the returned value as the cell output.
evaluate_micro_f2(label, submission)

{'Overall micro F2.0': 0.8040752351097179,
 'Inclusion micro F2.0': 0.9038461538461539,
 'Measure micro F2.0': 0.5889281507656066,
 'Extension micro F2.0': 0.85667215815486,
 'Atelectasis micro F2.0': 0.8895705521472391,
 'Satellite micro F2.0': 0.8399999999999999,
 'Lymphadenopathy micro F2.0': 0.9595070422535211,
 'Pleural micro F2.0': 0.8783783783783784,
 'Distant micro F2.0': 0.8541666666666666}

## 最終submission（提出）への準備

In [3]:
from sklearn.metrics import fbeta_score
import pandas as pd

def calculate_micro_f2(data, label, pos_label=1):
    """
    Return the F-beta score (beta = 2) of one label's true/predicted columns.

    Args:
        data (pd.DataFrame): Frame holding the ``<label>_true`` and
            ``<label>_pred`` columns.
        label (str): Base column name; ``_true`` / ``_pred`` are appended here.
        pos_label (int): The value treated as the positive class (default is 1).

    Returns:
        float: Result of ``fbeta_score`` called with ``average="binary"`` and
            ``zero_division=0``.
    """
    truth_column = f"{label}_true"
    prediction_column = f"{label}_pred"

    y_true = data[truth_column]
    y_pred = data[prediction_column]
    return fbeta_score(y_true, y_pred, beta=2.0, pos_label=pos_label, average="binary", zero_division=0)

def evaluate_micro_f2(labels, predictions, include_omittable=False):
    """
    Evaluate F2 scores for all target labels.
    Additionally, return rows where predictions differ from labels.

    ``labels`` and ``predictions`` are joined on ``id`` and ``sentence_index``;
    columns present in both frames receive the ``_true`` / ``_pred`` suffixes.
    The "Overall" score pools every cell of the seven target-label columns
    into a single binary problem, and the "Inclusion" score is the F2 of the
    ``omittable`` column with 0 as the positive class.

    Args:
        labels (pd.DataFrame): True labels for the dataset.
        predictions (pd.DataFrame): Predicted labels for the dataset.
        include_omittable (bool): When True, a disagreement in the
            ``omittable`` column also marks a row as mismatched. The default
            (False) compares only the seven target labels.

    Returns:
        tuple: ``(results, diff_rows)``. ``results`` is a dict mapping
            "Overall micro F2.0", "Inclusion micro F2.0" and
            "<Label> micro F2.0" to its score. ``diff_rows`` is a copy of the
            merged rows in which at least one compared column differs between
            its ``_true`` and ``_pred`` version.

    Warns:
        UserWarning: If the merged frame does not have the same number of rows
            as ``labels``, i.e. some label rows were dropped or some keys were
            matched more than once.
    """
    import warnings  # function-scope import: only the coverage check needs it

    # Merge the true labels and predictions on id and sentence_index
    merged = pd.merge(labels, predictions, on=["id", "sentence_index"], suffixes=("_true", "_pred"))

    # pd.merge performs an inner join by default: label rows without a matching
    # prediction disappear, and duplicated keys multiply rows. Both change the
    # scores without raising, so surface them instead of staying silent.
    n_labels, n_merged = len(labels), len(merged)
    if n_merged < n_labels:
        warnings.warn(
            f"Merged frame has {n_merged} rows but labels has {n_labels}; "
            "label rows without a matching prediction are excluded from the scores.",
            stacklevel=2,
        )
    elif n_merged > n_labels:
        warnings.warn(
            f"Merged frame has {n_merged} rows but labels has {n_labels}; "
            "duplicated (id, sentence_index) keys are scored more than once.",
            stacklevel=2,
        )

    # Define the target labels
    target_labels = [
        "measure", "extension", "atelectasis", "satellite",
        "lymphadenopathy", "pleural", "distant"
    ]

    # Column names of the true / predicted version of every target label;
    # shared by the pooled score and the mismatch extraction below.
    target_true_cols = [f"{lbl}_true" for lbl in target_labels]
    target_pred_cols = [f"{lbl}_pred" for lbl in target_labels]

    # Calculate Overall micro F2.0 (Measure ~ Distant): flatten all target
    # columns so every (row, label) cell counts as one binary decision.
    overall_true = merged[target_true_cols].values.ravel()
    overall_pred = merged[target_pred_cols].values.ravel()
    overall_f2 = fbeta_score(overall_true, overall_pred, beta=2.0, pos_label=1, average="binary", zero_division=0)

    # Calculate Inclusion micro F2.0 (Omittable with pos_label=0)
    inclusion_f2 = calculate_micro_f2(merged, "omittable", pos_label=0)

    # Initialize a dictionary to store results
    results = {"Overall micro F2.0": overall_f2, "Inclusion micro F2.0": inclusion_f2}

    # Calculate F2 scores for individual labels
    for label in target_labels:
        results[f"{label.capitalize()} micro F2.0"] = calculate_micro_f2(merged, label)

    # --------------------------------------------------------------
    # Extract only the rows where the truth and the prediction differ
    # --------------------------------------------------------------
    # Work on copies so the optional omittable columns never leak into the
    # lists used for the pooled score above.
    compare_true_cols = list(target_true_cols)
    compare_pred_cols = list(target_pred_cols)
    if include_omittable:
        compare_true_cols.append("omittable_true")
        compare_pred_cols.append("omittable_pred")

    # Keep a row when at least one compared true/pred column pair disagrees.
    mismatch_mask = (merged[compare_true_cols].values != merged[compare_pred_cols].values).any(axis=1)
    diff_rows = merged[mismatch_mask].copy()

    return results, diff_rows


In [4]:
# Unpack the (metrics dict, mismatched rows) pair returned by the
# two-value version of evaluate_micro_f2.
# NOTE(review): `label` and `submission` are whatever an earlier cell last
# assigned — confirm which split / model output is being evaluated here.
results, diff = evaluate_micro_f2(label, submission)

In [10]:
# Save the mismatched rows held in `diff` for manual inspection. `index` is not
# passed, so to_csv also writes the DataFrame index as the first column.
diff.to_csv('../model_outputs/o1preview_diff_sentences.csv')