# F1macro Implementation

- code is based on https://www.kaggle.com/cpmpml/faster-metric-computation
- fixed bugs with hungarian algorithm

In [1]:
import pandas as pd
import numpy as np

from scipy.optimize import linear_sum_assignment

## Hungarian mapping

In [229]:
def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.

    Args:
        row: object exposing ``predictionstring_pred`` and
            ``predictionstring_gt`` attributes, each a whitespace-separated
            string of word indices.

    Returns:
        ``[overlap_1, overlap_2]`` where ``overlap_1`` is the share of
        ground-truth words that also appear in the prediction and
        ``overlap_2`` is the share of predicted words that also appear in
        the ground truth. A side that contains no words contributes ``0.0``.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace. split(" ") would turn those into an empty-string "word",
    # which inflates the word counts and makes two blank strings look like a
    # perfect match.
    set_pred = set(row.predictionstring_pred.split())
    set_gt = set(row.predictionstring_gt.split())

    inter = len(set_gt & set_pred)

    # A blank string now yields an empty set, so guard the divisions.
    overlap_1 = inter / len(set_gt) if set_gt else 0.0
    overlap_2 = inter / len(set_pred) if set_pred else 0.0

    return [overlap_1, overlap_2]


def score_feedback_comp_micro(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: DataFrame with columns ``id``, ``class`` and
            ``predictionstring``.
        gt_df: DataFrame with columns ``id``, ``discourse_type`` and
            ``predictionstring``.

    Returns:
        The F1 score ``TP / (TP + 0.5 * (FP + FN))`` as a float, or ``0.0``
        when both frames are empty.

    Side effects:
        Prints the TP / FP / FN counts.
    """
    gt_df = (
        gt_df[["id", "discourse_type", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    gt_df["gt_id"] = gt_df.index

    # The ids are the freshly reset indexes, so they are unique per row.
    gt_n = len(gt_df)
    pred_n = len(pred_df)

    TP = 0
    # With no predictions or no ground truths there is nothing to match.
    if gt_n and pred_n:
        # Step 1. all ground truths and predictions for a given class are compared.
        joined = pred_df.merge(
            gt_df,
            left_on=["id", "class"],
            right_on=["id", "discourse_type"],
            how="outer",
            suffixes=("_pred", "_gt"),
        )
        # Rows that exist on one side only get a blank placeholder string.
        joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
        joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")

        overlaps = joined.apply(calc_overlap, axis=1)

        # 2. If the overlap between the ground truth and prediction is >= 0.5,
        # and the overlap between the prediction and the ground truth >= 0.5,
        # the prediction is a match and considered a true positive.
        # If multiple matches exist, the match with the highest pair of overlaps is taken.
        # Index the returned pairs directly; no eval() round-trip is needed.
        joined["overlap1"] = [pair[0] for pair in overlaps]
        joined["overlap2"] = [pair[1] for pair in overlaps]

        joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
        joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)

        # 3. hungarian algorithm
        candidates = joined[joined["potential_TP"]]
        if not candidates.empty:
            hungarian_df = pd.pivot_table(candidates,
                                          index='gt_id',
                                          columns='pred_id',
                                          values='max_overlap')
            # Negate so that minimising the cost maximises the total overlap.
            hungarian_matrix = hungarian_df.fillna(0).values * -1.
            gt_index, pred_index = linear_sum_assignment(hungarian_matrix)

            # linear_sum_assignment always returns min(n_rows, n_cols) pairs,
            # so some of them may sit on a zero-cost cell, i.e. a
            # (ground truth, prediction) pair that was never a candidate.
            # Only pairs with a real (negative) cost are true positives.
            TP = int((hungarian_matrix[gt_index, pred_index] < 0).sum())

    # Get numbers of each type
    FP = pred_n - TP
    FN = gt_n - TP

    print(f'TP: {TP}   FP : {FP}   FN : {FN}')

    denominator = TP + 0.5 * (FP + FN)
    if denominator == 0:
        # Both frames are empty: there is nothing to score.
        return 0.0

    return TP / denominator


def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    """
    Average the per-class scores of ``score_feedback_comp_micro``.

    The ground truth is grouped by ``discourse_type``; for every group the
    predictions whose ``class`` equals that type are scored against it, and
    the unweighted mean of those scores is returned.

    Args:
        pred_df: DataFrame with columns ``id``, ``class`` and
            ``predictionstring``.
        gt_df: DataFrame with at least a ``discourse_type`` column plus the
            columns required by ``score_feedback_comp_micro``.
        return_class_scores: when truthy, also return the per-class scores.

    Returns:
        The mean score, or ``(mean score, {discourse_type: score})`` when
        ``return_class_scores`` is truthy.
    """
    predictions = (
        pred_df[["id", "class", "predictionstring"]]
        .reset_index(drop=True)
        .copy()
    )

    per_class = {}
    for label, truth_rows in gt_df.groupby("discourse_type"):
        # Only predictions of the current class compete for this group.
        same_class = predictions["class"] == label
        class_preds = predictions.loc[same_class].reset_index(drop=True).copy()
        per_class[label] = score_feedback_comp_micro(class_preds, truth_rows)

    macro_f1 = np.mean(list(per_class.values()))

    return (macro_f1, per_class) if return_class_scores else macro_f1

## Test cases

In [230]:
# Three ground truths, two predictions. '1 2' covers only 2 of the 5 words of
# '1 2 3 4 5' (below the 0.5 threshold), '6 7 8' is an exact match, and the
# third ground truth has no prediction at all.
gt_df = pd.DataFrame({'discourse_type': [1, 1, 1], 'id': [1, 1, 1], 'predictionstring': ['1 2 3 4 5', '6 7 8', '21 22 23 24 25']})
pred_df = pd.DataFrame({'class': [1, 1], 'id': [1, 1], 'predictionstring': ['1 2', '6 7 8']})
# The truthy third argument also returns the per-class scores.
score_feedback_comp(pred_df, gt_df, 1)

TP: 1   FP : 1   FN : 2


(0.4, {1: 0.4})

In [231]:
# One prediction spans two ground truths. Each ground truth is fully covered
# and makes up half of the prediction, so both are candidates, but a
# prediction can be matched only once.
gt_df = pd.DataFrame({'discourse_type': [1, 1], 'id': [1, 1], 'predictionstring': ['0 1', '2 3']})
pred_df = pd.DataFrame({'class': [1,], 'id': [1], 'predictionstring': ['0 1 2 3']})
score_feedback_comp(pred_df, gt_df, 1)

TP: 1   FP : 0   FN : 1


(0.6666666666666666, {1: 0.6666666666666666})

In [232]:
# Mirror of the previous case: two predictions each cover half of a single
# ground truth, so both are candidates, but a ground truth can be matched
# only once.
gt_df = pd.DataFrame({'discourse_type': [1], 'id': [1], 'predictionstring': ['0 1 2 3']})
pred_df = pd.DataFrame({'class': [1, 1], 'id': [1, 1], 'predictionstring': ['0 1', '2 3']})
score_feedback_comp(pred_df, gt_df, 1)

TP: 1   FP : 1   FN : 0


(0.6666666666666666, {1: 0.6666666666666666})

In [233]:
# One six-word prediction spans three two-word ground truths. Each ground
# truth covers only 2 of the 6 predicted words (below 0.5), so nothing matches.
gt_df = pd.DataFrame({'discourse_type': [1, 1, 1], 'id': [1, 1, 1], 'predictionstring': ['0 1', '2 3', '4 5']})
pred_df = pd.DataFrame({'class': [1,], 'id': [1], 'predictionstring': ['0 1 2 3 4 5']})
score_feedback_comp(pred_df, gt_df, 1)

TP: 0   FP : 1   FN : 3


(0.0, {1: 0.0})

In [234]:
# One six-word prediction against three ground truths. Only '2 3 4' reaches
# the threshold (3 of 6 predicted words); '0 1' covers 2 of 6 and '6' does
# not overlap the prediction.
gt_df = pd.DataFrame({'discourse_type': [1, 1, 1], 'id': [1, 1, 1], 'predictionstring': ['0 1', '2 3 4', '6']})
pred_df = pd.DataFrame({'class': [1,], 'id': [1], 'predictionstring': ['0 1 2 3 4 5']})
score_feedback_comp(pred_df, gt_df, 1)

TP: 1   FP : 0   FN : 2


(0.5, {1: 0.5})

In [235]:
# '0 1 2 3' is a candidate for both ground truths, while '3 4' is a candidate
# only for '2 3' (1 shared word out of 2 on each side). The assignment has to
# pair '0 1 2 3' with '0 1' and '3 4' with '2 3' to match both.
gt_df = pd.DataFrame({'discourse_type': [1, 1], 'id': [1, 1], 'predictionstring': ['0 1', '2 3']})
pred_df = pd.DataFrame({'class': [1, 1], 'id': [1, 1], 'predictionstring': ['0 1 2 3', '3 4']})
score_feedback_comp(pred_df, gt_df, 1)

TP: 2   FP : 0   FN : 0


(1.0, {1: 1.0})

In [236]:
# Same as the previous case plus a prediction '6' that overlaps no ground
# truth and therefore counts as a false positive.
gt_df = pd.DataFrame({'discourse_type': [1, 1], 'id': [1, 1], 'predictionstring': ['0 1', '2 3']})
pred_df = pd.DataFrame({'class': [1, 1, 1], 'id': [1, 1, 1], 'predictionstring': ['0 1 2 3', '3 4', '6']})
score_feedback_comp(pred_df, gt_df, 1)

TP: 2   FP : 1   FN : 0


(0.8, {1: 0.8})