In [1]:
import itertools as itt
import biomappings
from collections import Counter
import pandas as pd

In [17]:
# Load the positive mappings through biomappings and report how many there are.
positive_mappings = biomappings.load_mappings()
print(f"There are {len(positive_mappings):,} positive mappings")

There are 8,560 positive mappings


In [18]:
# Load the negative (false) mappings through biomappings and report how many there are.
negative_mappings = biomappings.load_false_mappings()
print(f"There are {len(negative_mappings):,} negative mappings")

There are 1,122 negative mappings


In [20]:
# Load the predicted mappings through biomappings and report how many there are.
predicted_mappings = biomappings.load_predictions()
print(f"There are {len(predicted_mappings):,} predicted mappings")

There are 41,178 predicted mappings


In [26]:
def source_target_counter(mappings) -> Counter:
    """Count mappings per unordered pair of prefixes.

    The ``"source prefix"`` and ``"target prefix"`` of every mapping are
    sorted before counting, so a mapping from A to B and a mapping from
    B to A are tallied under the same key.

    :param mappings: Iterable of records that can be indexed with the keys
        ``"source prefix"`` and ``"target prefix"``.
    :returns: A counter keyed by the 2-tuple of sorted prefixes.
    """
    pair_counts = Counter()
    for record in mappings:
        first, second = sorted([record["source prefix"], record["target prefix"]])
        pair_counts[first, second] += 1
    return pair_counts

In [28]:
# Label each collection once. The per-prefix tables below build one column
# per label, in this order: positive, negative, predicted.
keyed_mappings = [
    ("positive", positive_mappings),
    ("negative", negative_mappings),
    ("predicted", predicted_mappings),
]

# Biased Measurement of Precision

There are two kinds of mappings we'll consider:

1. Manually curated positive (i.e., correct) mappings. These correspond to true positives.
2. Manually curated negative (i.e., incorrect) mappings. These correspond to false positives.

Most of these mappings originated as predictions from Gilda. A small number were entered directly by curators; that number is small enough that we disregard the distinction here.

Unfortunately, it's hard to accurately count false negatives and true negatives. The space of potential negative mappings is massive, and it can be defined in several ways depending on, for example, whether you assume that mappings must be one-to-one rather than one-to-many, many-to-one, or many-to-many.

## Caveats

The metrics reported in this section are **heavily biased** because curation was not done at random:

1. Curation was driven by task-based need, meaning that certain resources were included and others were excluded.
2. Curation was prioritized by prediction confidence in order to maximize the throughput of positive mappings per unit of curator effort.

In [4]:
# Curated positive mappings count as true positives and curated negative
# mappings as false positives.
tp, fp = len(positive_mappings), len(negative_mappings)
precision = tp / (tp + fp)
print(f"The overall unweighed precision is {precision:.2%}.")

The overall unweighed precision is 88.41%.


## Precision by Source

Because mappings are first predicted on the basis of source, the overall precision can be split based on the source prefix of each mapping.

In [29]:
# Count, for each collection (positive / negative / predicted), how many
# mappings have each source prefix.
pairs = {
    key: Counter(mapping["source prefix"] for mapping in mappings)
    for key, mappings in keyed_mappings
}

# Build the table in a single chain. The original filtered ``df`` and then
# assigned a new column to the filtered frame, which triggers pandas'
# SettingWithCopyWarning; ``assign`` works on a fresh copy instead.
df = (
    pd.DataFrame(pairs)
    # A prefix missing from one collection has zero mappings there.
    .fillna(0)
    .astype(int)
    .rename_axis("source")
    # Keep only sources with more than 20 curated (positive + negative) mappings.
    .loc[lambda counts: counts["positive"] + counts["negative"] > 20]
    .assign(
        precision=lambda counts: (
            counts["positive"] / (counts["positive"] + counts["negative"])
        ).round(2)
    )
    .sort_values("precision", ascending=False)
)
df

Unnamed: 0_level_0,positive,negative,predicted,precision
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
uniprot.chain,27,0,0,1.0
chebi,1585,11,9883,0.99
uberon,131,3,68,0.98
mondo,207,7,78,0.97
ccle,625,40,105,0.94
mesh,4301,316,26922,0.93
pr,80,9,0,0.9
wikipathways,467,60,1685,0.89
agrovoc,142,20,0,0.88
doid,390,87,2417,0.82


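A more granular view is calculated by stratifying by both the source and target prefix of the mappings. Each pair is treated as unordered (the two prefixes are sorted alphabetically before counting), so the "source" and "target" labels in the table below denote the alphabetically first and second prefix rather than the direction of the mapping.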

In [31]:
# Count, for each collection (positive / negative / predicted), how many
# mappings fall in each prefix pair. source_target_counter sorts the two
# prefixes, so the pairs are unordered and the "source"/"target" index labels
# below are simply the alphabetically first/second prefix.
pairs = {
    key: source_target_counter(mappings)
    for key, mappings in keyed_mappings
}

# TODO(review): the original author noted that when a pair has no remaining
# predictions, the false negatives could be taken as zero (sensitivity = 1),
# which would allow metrics needing TP, FP and FN. This reasoning has not
# been verified and nothing below relies on it.

# Build the table in a single chain. The original filtered ``df`` and then
# assigned a new column to the filtered frame, which triggers pandas'
# SettingWithCopyWarning; ``assign`` works on a fresh copy instead.
df = (
    pd.DataFrame(pairs)
    # A pair missing from one collection has zero mappings there.
    .fillna(0)
    .astype(int)
    .rename_axis(["source", "target"])
    # Keep only pairs with more than 20 curated (positive + negative) mappings.
    .loc[lambda counts: counts["positive"] + counts["negative"] > 20]
    .assign(
        precision=lambda counts: (
            counts["positive"] / (counts["positive"] + counts["negative"])
        ).round(2)
    )
    .sort_values("precision", ascending=False)
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,positive,negative,predicted,precision
source,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mesh,uniprot,378,0,10970,1.0
go,wikipathways,366,0,0,1.0
ncbiprotein,uniprot.chain,26,0,0,1.0
kegg.pathway,wikipathways,70,0,0,1.0
kegg.pathway,reactome,61,0,0,1.0
chebi,mesh,2666,21,12096,0.99
reactome,wikipathways,73,1,0,0.99
mesh,ncit,888,14,11751,0.98
ccle,efo,516,9,58,0.98
mesh,uberon,131,3,68,0.98


# Remaining Curation Effort

In [24]:
# Predicted mappings still awaiting curation, counted per unordered prefix
# pair and listed from the least common pair to the most common.
pd.DataFrame(
    [
        (first_prefix, second_prefix, count)
        for (first_prefix, second_prefix), count
        in source_target_counter(predicted_mappings).most_common()[::-1]
    ],
    columns=["source", "target", "count"],
)

Unnamed: 0,source,target,count
0,ccle,depmap,3
1,go,reactome,20
2,go,mesh,44
3,ccle,cellosaurus,44
4,doid,mesh,58
5,ccle,efo,58
6,mesh,uberon,68
7,doid,efo,72
8,mesh,mondo,78
9,efo,mesh,166
