In [10]:
class CONFIG:
    """Notebook-wide switches, read as class attributes (never instantiated here)."""

    # Gates the external-data processing cell further down the notebook.
    external_data: bool = False

In [6]:
# download the datasets

import os
import opendatasets as od
import pandas as pd
import json

# Directory where the competition files are expected. The trailing slash is
# kept so any code that concatenates onto `data_path` keeps working.
data_path = "./pii-detection-removal-from-educational-data/"

# Build each file path once and reuse it for both the existence check and the load.
train_path = os.path.join(data_path, "train.json")
test_path = os.path.join(data_path, "test.json")

# download the data from kaggle
if not os.path.exists(data_path):
    print("Dataset not found, downloading from Kaggle")
    dataset = "https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/data"
    od.download(dataset)
else:
    print("Dataset found in disk")

# check for the files present there
assert os.path.exists(train_path), "train.json file missing"
assert os.path.exists(test_path), "test.json file missing"

# Hand pandas the path instead of an `open()` handle: pandas then opens and
# closes the file itself (the bare `open()` handle was never closed) and
# decodes it as UTF-8 rather than with the platform's default encoding.
train_df = pd.read_json(train_path)
print("train_df loaded")

test_df = pd.read_json(test_path)
print("test_df loaded")

Dataset found in disk
train_df loaded
test_df loaded


In [11]:
# include external data

# Gated by CONFIG.external_data. The branch is currently a placeholder that
# does nothing, so turning the flag on has no effect yet.
if CONFIG.external_data:
    # code to process external data
    pass

In [12]:
# scoring
from sklearn.metrics import fbeta_score

def fbeta_score(pred_df: pd.DataFrame, orig_df: pd.DataFrame, beta=5) -> float:
    '''
    Token-level F-beta score between predicted and original PII labels.

    The two frames are outer-joined on ("document", "token"); each must also
    carry a "label" column. Every joined row is then classified as:
        TP - both labels present and equal
        FP - a predicted label with no original label
        FN - an original label with no predicted label, or both labels
             present but different

    NOTE: this function shadows `sklearn.metrics.fbeta_score`, which is
    imported under the same name just above this definition.

    PARAMS:
        pred_df - Dataframe consisting of predicted PII labels
        orig_df - Dataframe consisting of original PII labels
        beta    - Hyperparameter controlling the tradeoff between precision and recall (default 5)

    RETURNS:
        score   - calculated fbeta score as a Python float; 0.0 when there are
                  no rows to score (TP = FP = FN = 0)
    '''

    merged = pred_df.merge(orig_df, how="outer", on=["document", "token"], suffixes=("_pred", "_orig"))

    has_pred = merged["label_pred"].notna()
    has_orig = merged["label_orig"].notna()
    both_present = has_pred & has_orig
    labels_agree = merged["label_pred"] == merged["label_orig"]

    # matched labels in both dataframes are True Positives
    TP = int((both_present & labels_agree).sum())

    # a prediction with no original label is a False Positive
    FP = int((has_pred & ~has_orig).sum())

    # a missing prediction, or a prediction that disagrees with the original
    # label, is a False Negative
    # NOTE(review): a mismatched label is counted as FN only, not also as FP -
    # confirm this matches the intended metric definition.
    FN = int((~has_pred | (both_present & ~labels_agree)).sum())

    beta_sq = beta ** 2
    denominator = (1 + beta_sq) * TP + beta_sq * FN + FP

    # nothing to score: avoid a 0/0 division that would produce NaN
    if denominator == 0:
        return 0.0

    return float((1 + beta_sq) * TP / denominator)

