In [1]:
import pandas as pd
# Per-target count table, indexed by the CSV's first column; get_data() looks
# each target up in it with n_df.loc[...].
n_df = pd.read_csv("../data/raw_data/LIT-PCBA/AVE_unbiased/n.csv", index_col=0)
# Pre-filtered results table (filtered.feather); scored as `VS` further down.
full_df_filtered = pd.read_feather("../data/gnina_processed/LIT-PCBA/filtered.feather")

In [2]:
# Full results table (full.feather); a later cell ranks and de-duplicates it
# to build full_df_filtered_affinity.
full_df = pd.read_feather("../data/gnina_processed/LIT-PCBA/full.feather")

In [3]:
# Name of the full_df column used to rank rows (descending) before
# de-duplication when full_df_filtered_affinity is built.
sort_according_to = "CNNaffinity"

In [4]:
# Rank every row by the chosen score (highest first) and keep only the
# top-ranked row for each (_Name, directory, ligands) combination.
# A row is labelled active when its `ligands` value contains "_active_".
# The match is a literal substring test (regex=False) and missing values count
# as inactive (na=False), so the column is always boolean. Building the column
# with .assign() keeps the whole step a single expression and avoids item
# assignment on a frame derived from another frame.
full_df_filtered_affinity = (
    full_df
    .sort_values(sort_according_to, ascending=False)
    .drop_duplicates(["_Name", "directory", "ligands"])
    .assign(
        active_groundtruth=lambda ranked: ranked["ligands"].str.contains(
            "_active_", regex=False, na=False
        )
    )
)
# Disabled export step kept for reference; `join` and `inputdir` are not
# defined in the cells shown here.
# full_df_filtered.reset_index().rename(columns = {"level_0": "unfiltered_index"}).to_feather(join(inputdir, "filtered.feather"))

In [5]:
def get_subset(df, dir):
    """Return the rows of ``df`` whose ``directory`` value equals ``dir``.

    Row order and the original index labels are preserved.
    """
    matches_target = df["directory"] == dir
    return df.loc[matches_target]
def get_data(df, counts=None):
    """Compute per-target top-1% enrichment statistics.

    For every distinct value of ``df["directory"]`` the first
    ``int(n_rows * 0.01) + 1`` rows of that target are taken as its "top 1%".
    The rows are used in the order they appear in ``df``, so the caller is
    responsible for having ranked ``df`` best-first beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``directory`` column (target name) and a boolean
        ``active_groundtruth`` column.
    counts : pandas.DataFrame, optional
        Table indexed by target name whose rows unpack to
        ``(n_true_actives, n_true_total)``, in that order. Defaults to the
        notebook-level ``n_df``.

    Returns
    -------
    pandas.DataFrame
        One row per target (in order of first appearance) with the columns
        ``EF1``, ``NEF1``, ``n_actives_found``, ``1% is equal to``,
        ``gt_active_proportion``, ``total_succeeded`` and ``total_ligands``.
        ``EF1``/``NEF1`` are NaN for a target whose ground-truth active
        proportion is not positive. An empty ``df`` yields an empty frame
        that still carries these columns.

    Raises
    ------
    KeyError
        If a target present in ``df`` is missing from ``counts``.
    """
    if counts is None:
        # Fall back to the table loaded at the top of the notebook.
        counts = n_df

    column_order = [
        "EF1",
        "NEF1",
        "n_actives_found",
        "length",
        "gt_active_proportion",
        "total_succeeded",
        "total_ligands",
    ]
    nan = float("nan")
    targets = []
    records = []

    # sort=False keeps targets in order of first appearance; rows inside each
    # group keep their original (ranked) order. observed=True avoids empty
    # groups if `directory` is stored as a categorical.
    for target, subset in df.groupby("directory", sort=False, observed=True):
        one_percent = int(len(subset) * 0.01) + 1
        top1 = subset.iloc[:one_percent]
        n_true_actives, n_true_total = counts.loc[target]
        n_found = top1["active_groundtruth"].sum()

        base_rate = n_true_actives / n_true_total if n_true_total > 0 else nan
        if base_rate > 0:
            ef1 = (n_found / len(top1)) / base_rate
            # Best EF1 achievable: every top-1% slot filled with an active, or
            # all actives retrieved when there are fewer actives than slots.
            max_ef1 = (min(one_percent, n_true_actives) / one_percent) / base_rate
            nef1 = ef1 / max_ef1
        else:
            # No actives (or no ligands) for this target: enrichment is undefined.
            ef1 = nef1 = nan

        targets.append(target)
        records.append(
            dict(
                EF1=ef1,
                NEF1=nef1,
                n_actives_found=n_found,
                length=one_percent,
                gt_active_proportion=base_rate,
                total_succeeded=len(subset),
                total_ligands=n_true_total,
            )
        )

    # Passing `columns` explicitly fixes the column order and keeps the schema
    # intact when there are no records at all.
    out_df = pd.DataFrame(records, index=targets, columns=column_order)
    return out_df.astype(
        {"length": int, "n_actives_found": int, "total_succeeded": int, "total_ligands": int}
    ).rename(columns={"length": r"1% is equal to"})

In [6]:
# Score both rankings with the same routine and order each result by target
# name: `VS` comes from the pre-filtered table, `affinity` from the table
# re-ranked by `sort_according_to`.
VS, affinity = (
    get_data(ranked_table).sort_index()
    for ranked_table in (full_df_filtered, full_df_filtered_affinity)
)

In [7]:
# Show the VS table with the target names as an ordinary "target" column.
VS.rename_axis("target").reset_index()

Unnamed: 0,target,EF1,NEF1,n_actives_found,1% is equal to,gt_active_proportion,total_succeeded,total_ligands
0,ADRB2,13.337431,0.133333,2,3124,4.8e-05,312319,312496
1,ALDH1,1.321735,0.065257,71,1088,0.049373,108700,108846
2,ESR1_ago,9.077922,0.090909,1,56,0.001967,5595,5592
3,ESR1_ant,10.883529,0.215686,11,51,0.019818,5049,5046
4,FEN1,2.453468,0.024523,9,3556,0.001032,355552,355767
5,IDH1,3.60444,0.027027,1,2715,0.000102,271422,362084
6,KAT2A,1.563567,0.015625,3,3485,0.000551,348486,348738
7,MAPK1,2.94272,0.029412,9,629,0.004862,62815,62933
8,MTORC1,2.103037,0.021053,2,331,0.002873,33043,33065
9,OPRK1,0.0,0.0,0,1852,8.2e-05,185171,269836


In [9]:
# Median EF1 over all targets in the VS table.
print(VS.loc[:, "EF1"].median())


2.278252401678487


In [26]:
# Mean NEF1 across targets, one value per line: VS first, then affinity.
print(*(table["NEF1"].mean() for table in (VS, affinity)), sep="\n")


0.052248044846281476
0.060791471896805234


In [308]:
# `out_df` only exists as a local inside get_data(), so this cell used to rely
# on a leftover kernel variable. The recorded EF1 median equals the VS median
# printed earlier, so the cell reads from `VS` directly.
print(VS["EF1"].median())
print(VS["NEF1"].median())

2.278252401678487
0.025775093894984898


In [6]:
# NOTE(review): `out_df` is only assigned inside get_data() in the cells shown
# here, so this cell raises NameError on a fresh kernel. Confirm which table
# (VS or affinity) was meant to be displayed.
out_df

Unnamed: 0,EF1,NEF1,n_actives_found,length,gt_active_proportion,total_succeeded,total_ligands
ADRB2,0.0,0.0,0,3124,4.8e-05,312319,312496
VDR,1.059398,0.01059,7,2671,0.002474,267002,267200
ESR1_ago,18.155844,0.181818,2,56,0.001967,5595,5592
PKM2,0.919361,0.009191,5,2460,0.002211,245901,246065
ALDH1,2.047758,0.101103,110,1088,0.049373,108700,108846
IDH1,46.857717,0.351351,13,2715,0.000102,271422,362084
MAPK1,1.961813,0.019608,6,629,0.004862,62815,62933
ESR1_ant,4.947059,0.098039,5,51,0.019818,5049,5046
PPARG,0.0,0.0,0,53,0.004776,5237,5234
MTORC1,1.051519,0.010526,1,331,0.002873,33043,33065


In [260]:
# Per-target gap between the ligand count from the count table and the number
# of rows present in the scored table.
# NOTE(review): `out_df` is only assigned inside get_data() in the cells shown
# here, so this cell raises NameError on a fresh kernel — confirm which table
# (VS or affinity) it should read.
out_df["total_ligands"] - out_df["total_succeeded"]

VDR           198
MAPK1         118
ESR1_ago       -3
ESR1_ant       -3
ADRB2         177
ALDH1         146
PKM2          164
MTORC1         22
IDH1          151
OPRK1       84665
PPARG          -3
KAT2A         252
FEN1          215
TP53           -4
dtype: int64