In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

In [2]:
# Outcome-type suffixes used for the boa-/bpp- judgment columns.
# Only "all" is enabled; the data also has "mortality", "objective" and
# "subjective" variants that can be added to this list.
OUTCOME_TYPES = ["all"]

# Document-level judgment columns analysed in this notebook: the two
# suffix-free columns first, then every boa- column, then every bpp- column.
DOC_OUTCOMES = ["ac-doc-judgment", "rsg-doc-judgment"] + [
    template.format(outcome_type)
    for template in ("boa-doc-judgment-{0}", "bpp-doc-judgment-{0}")
    for outcome_type in OUTCOME_TYPES
]

In [3]:
# Load the pre-computed train and dev splits (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv("data/splits/train-df.csv")
dev_df = pd.read_csv("data/splits/dev-df.csv")

In [4]:
# Show the available columns of the training split.
train_df.columns

Index(['Unnamed: 0', 'ac-doc-judgment', 'ac-rationale', 'boa-doc-judgment-all',
       'boa-doc-judgment-mortality', 'boa-doc-judgment-objective',
       'boa-doc-judgment-subjective', 'boa-rationale-all',
       'boa-rationale-mortality', 'boa-rationale-objective',
       'boa-rationale-subjective', 'bpp-doc-judgment-all',
       'bpp-doc-judgment-mortality', 'bpp-doc-judgment-objective',
       'bpp-doc-judgment-subjective', 'bpp-rationale-all',
       'bpp-rationale-mortality', 'bpp-rationale-objective',
       'bpp-rationale-subjective', 'doc_id', 'doi', 'pmid', 'rsg-doc-judgment',
       'rsg-rationale', 'sentence'],
      dtype='object')

Group the `high` and `unclear` judgment labels into a single `high/unclear` class

In [5]:
def grouper(x):
    """Map "high" and "unclear" to "high/unclear"; return any other value unchanged."""
    if x in ("high", "unclear"):
        return "high/unclear"
    return x


# Add a "<outcome>-grouped" column to both splits for every document outcome.
for outcome in DOC_OUTCOMES:
    grouped_col = '{0}-grouped'.format(outcome)
    for split_df in (train_df, dev_df):
        split_df[grouped_col] = split_df[outcome].apply(grouper)

In [6]:
# Sanity check: distinct labels present in the first outcome's grouped column.
first_grouped_col = '{0}-grouped'.format(DOC_OUTCOMES[0])
train_df[first_grouped_col].unique()

array(['low', 'high/unclear', 'unk'], dtype=object)

In [8]:
def _doc_level_labels(grouped_rows, outcomes):
    """Build a ``{doc_id: {outcome: label}}`` mapping from grouped rows.

    Parameters
    ----------
    grouped_rows : iterable of (doc_id, DataFrame)
        Pairs as produced by iterating ``df.groupby("doc_id")``.
    outcomes : iterable of str
        Outcome names; the label is read from the ``"<outcome>-grouped"`` column.

    Returns
    -------
    dict
        For each doc_id, a dict mapping every outcome to the grouped label
        found in the FIRST row of that document's group.
    """
    # Only the first row is consulted; this assumes the document-level label
    # is the same on every row of a document -- TODO confirm.
    return {
        doc_id: {
            outcome: doc_rows['{0}-grouped'.format(outcome)].values[0]
            for outcome in outcomes
        }
        for doc_id, doc_rows in grouped_rows
    }


# NOTE: `data_by_pmid` is keyed by `doc_id` despite its name; the variable is
# kept (and left holding the dev groups afterwards) for backward compatibility.
data_by_pmid = list(train_df.groupby("doc_id"))
doc_ids_2_doc_lbls = _doc_level_labels(data_by_pmid, DOC_OUTCOMES)

data_by_pmid = list(dev_df.groupby("doc_id"))
dev_doc_ids_2_doc_lbls = _doc_level_labels(data_by_pmid, DOC_OUTCOMES)
    

In [None]:
# Preview the dev split. The original cell held only the undefined name `dev`,
# which raises NameError and halts a Restart & Run All.
dev_df.head()

In [9]:
# One row per training document, one column per outcome: build the frame from
# the nested dict (doc ids become columns) and flip it.
doc_lbl_df = pd.DataFrame(doc_ids_2_doc_lbls).T
doc_lbl_df.head()

Unnamed: 0,ac-doc-judgment,boa-doc-judgment-all,bpp-doc-judgment-all,rsg-doc-judgment
10.1002/acp.808,high/unclear,unk,unk,unk
10.1002/ajim.10209,high/unclear,low,unk,high/unclear
10.1002/ajim.10254,unk,unk,unk,unk
10.1002/ajim.20103,high/unclear,unk,unk,high/unclear
10.1002/ajim.20192,high/unclear,unk,unk,unk


In [10]:
# Same layout for the dev split: one row per document, one column per outcome.
dev_doc_lbl_df = pd.DataFrame(dev_doc_ids_2_doc_lbls).T
dev_doc_lbl_df.head()

Unnamed: 0,ac-doc-judgment,boa-doc-judgment-all,bpp-doc-judgment-all,rsg-doc-judgment
10.1002/ajim.20634,high/unclear,high/unclear,unk,high/unclear
10.1002/ana.410190204,unk,unk,unk,unk
10.1002/art.1780110506,unk,unk,unk,unk
10.1002/art.1780310503,high/unclear,high/unclear,unk,high/unclear
10.1002/art.1790030409,high/unclear,unk,unk,unk


In [11]:
# Per-outcome label counts over documents (despite the "accs" name, these are
# raw value counts, not accuracies).
baseline_accs = {
    outcome: doc_lbl_df[outcome].value_counts() for outcome in DOC_OUTCOMES
}
dev_baseline_accs = {
    outcome: dev_doc_lbl_df[outcome].value_counts() for outcome in DOC_OUTCOMES
}

# Rows are labels, columns are outcomes.
baseline_df = pd.DataFrame(baseline_accs)
dev_baseline_df = pd.DataFrame(dev_baseline_accs)

In [12]:
# Remove the "unk" row from the train label counts.
# errors="ignore" keeps this self-reassigning cell re-runnable: without it a
# second execution (or a split with no "unk" label) raises KeyError.
baseline_df = baseline_df.drop(["unk"], errors="ignore")
baseline_df

Unnamed: 0,ac-doc-judgment,boa-doc-judgment-all,bpp-doc-judgment-all,rsg-doc-judgment
high/unclear,9529,3770,3403,6101
low,5714,2503,1823,6239


In [13]:
# Remove the "unk" row from the dev label counts.
# errors="ignore" keeps this self-reassigning cell re-runnable: without it a
# second execution (or a split with no "unk" label) raises KeyError.
dev_baseline_df = dev_baseline_df.drop(["unk"], errors="ignore")
dev_baseline_df

Unnamed: 0,ac-doc-judgment,boa-doc-judgment-all,bpp-doc-judgment-all,rsg-doc-judgment
high/unclear,2923,1170,1071,1911
low,1877,821,582,1979


In [14]:
# Share of "high/unclear" among the remaining labels, per outcome (train).
# NOTE(review): this is the high/unclear share, not necessarily the
# majority-class share -- confirm that is the intended baseline.
baseline_dict = {}
for domain in baseline_df.columns:
    domain_counts = baseline_df[domain]
    baseline_dict[domain] = domain_counts["high/unclear"] / domain_counts.sum()
baseline_dict

{'ac-doc-judgment': 0.62513940825296854,
 'boa-doc-judgment-all': 0.60098836282480472,
 'bpp-doc-judgment-all': 0.65116724071947951,
 'rsg-doc-judgment': 0.49440842787682332}

In [15]:
# Share of "high/unclear" among the remaining labels, per outcome (dev).
# NOTE(review): this is the high/unclear share, not necessarily the
# majority-class share -- confirm that is the intended baseline.
dev_baseline_dict = {}
for domain in dev_baseline_df.columns:
    domain_counts = dev_baseline_df[domain]
    dev_baseline_dict[domain] = domain_counts["high/unclear"] / domain_counts.sum()
dev_baseline_dict


{'ac-doc-judgment': 0.60895833333333338,
 'boa-doc-judgment-all': 0.58764439979909588,
 'bpp-doc-judgment-all': 0.64791288566243199,
 'rsg-doc-judgment': 0.49125964010282774}