In [1]:
import pandas as pd
import numpy as np

# Tab-separated table of filtered candidate variants (name suggests de novo calls).
IN_TSV = "/content/filtered_CHOP/CDL-068-99.filtered_denovo.tsv"

# Read the whole table; column dtypes are inferred by pandas at this point.
df = pd.read_csv(
    IN_TSV,
    sep="\t",
)
print("Total filtered variants:", len(df))

# Preview the first two rows as the cell's rich output.
df.head(2)


Total filtered variants: 88


Unnamed: 0,CHROM,POS,REF,ALT,TYPE,QUAL,FILTER,VQSLOD,QD,FS,...,FATHER_GT,FATHER_DP,FATHER_GQ,FATHER_AD,FATHER_VAF,PROBAND_GT,PROBAND_DP,PROBAND_GQ,PROBAND_AD,PROBAND_VAF
0,chr1,9999315,G,A,SNV,464.03,PASS,16.33,12.21,4.815,...,0/0,41,99,410,0,0/1,38,99,1919,0.5
1,chr1,90679453,T,C,SNV,500.03,PASS,16.68,13.16,3.053,...,0/0,35,99,350,0,0/1,38,99,1919,0.5


In [2]:
# Columns that are expected to hold numeric values, grouped by origin:
# site-level annotations first, then per-sample depth / GQ / VAF.
num_cols = [
    # site-level metrics
    "QUAL", "VQSLOD", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum",
    # mother
    "MOTHER_DP", "MOTHER_GQ", "MOTHER_VAF",
    # father
    "FATHER_DP", "FATHER_GQ", "FATHER_VAF",
    # proband
    "PROBAND_DP", "PROBAND_GQ", "PROBAND_VAF",
]

# Coerce only the columns actually present in the table; anything that cannot
# be parsed as a number becomes NaN (errors="coerce") instead of raising.
for col in (name for name in num_cols if name in df.columns):
    df[col] = pd.to_numeric(df[col], errors="coerce")


In [3]:
def clip01(x):
    """
    Clamp x to the closed interval [0, 1].

    Values below 0 become 0, values above 1 become 1, everything in between
    is passed through. NaN stays NaN. Accepts whatever np.clip accepts
    (scalars or array-likes).
    """
    lower, upper = 0.0, 1.0
    return np.clip(x, lower, upper)

def norm_high_is_good(x, lo, hi):
    """
    Map x linearly onto [0, 1] where larger values are better.

    x <= lo gives 0, x >= hi gives 1, values in between are interpolated
    linearly. The result is clamped with clip01, so NaN input stays NaN.
    """
    span = hi - lo
    fraction = (x - lo) / span
    return clip01(fraction)

def norm_low_is_good(x, lo, hi):
    """
    Map x linearly onto [0, 1] where smaller values are better.

    x <= lo gives 1, x >= hi gives 0, values in between are interpolated
    linearly. The clamped fraction comes from clip01, so NaN input stays NaN.
    """
    span = hi - lo
    penalty = clip01((x - lo) / span)
    return 1.0 - penalty

def vaf_het_score(vaf, center=0.5, half_width=0.15):
    """
    Score how close a variant allele fraction is to the expected value.

    Max score (1.0) at `center`, linearly decays to 0 at
    `center - half_width` or `center + half_width`. With the defaults this is
    1.0 at 0.5 and 0 at 0.35 or 0.65.

    Parameters
    ----------
    vaf : float or None
        Observed allele fraction. None / NaN is treated as uninformative and
        scores 0.0.
    center : float, default 0.5
        Allele fraction that receives the maximum score.
    half_width : float, default 0.15
        Distance from `center` at which the score reaches 0. Must be > 0.

    Returns
    -------
    float
        Score in [0, 1].

    Raises
    ------
    ValueError
        If `half_width` is not strictly positive.
    """
    if half_width <= 0:
        raise ValueError("half_width must be > 0")
    if pd.isna(vaf):
        return 0.0
    dist = abs(vaf - center)
    # dist >= 0, so the expression never exceeds 1; only the lower bound
    # needs clamping.
    return float(max(0.0, 1.0 - dist / half_width))

def parent_clean_score(vaf, max_ok=0.01, zero_at=0.05):
    """
    Score how free of the alternate allele a parent sample is.

    1.0 if vaf <= max_ok, decays linearly to 0 by vaf = zero_at.

    Parameters
    ----------
    vaf : float or None
        Parent allele fraction. None / NaN is treated as unknown and scores
        a neutral 0.5.
    max_ok : float, default 0.01
        Largest allele fraction still considered fully clean.
    zero_at : float, default 0.05
        Allele fraction at (and beyond) which the score is 0. If
        zero_at <= max_ok there is no room for a ramp, so the function
        behaves as a hard cutoff: anything above max_ok scores 0.0.

    Returns
    -------
    float
        Score in [0, 1].
    """
    if pd.isna(vaf):
        return 0.5
    if vaf <= max_ok:
        return 1.0
    ramp = zero_at - max_ok
    if ramp <= 0:
        # Previously the upper bound was fixed at 0.05, so max_ok >= 0.05
        # either divided by zero or inverted the ramp and scored a clearly
        # contaminated parent as 1.0. Treat it as a step function instead.
        return 0.0
    # vaf > max_ok here, so the fraction is positive and the result cannot
    # exceed 1; only the lower bound needs clamping.
    return float(max(0.0, 1.0 - (vaf - max_ok) / ramp))


In [4]:
def compute_tp_score(row):
    """
    Compute a heuristic true-positive score for one variant row.

    Each metric is normalised to [0, 1] and combined as a weighted sum, then
    multiplied by a type factor (1.0 for SNV, 0.85 for anything else).

    Note: the weights add up to 112, not 100, so a perfect SNV scores 112.

    Parameters
    ----------
    row : pandas.Series or mapping with a ``.get`` method
        One row of the variant table. Any metric column that is absent, None
        or NaN contributes a neutral 0.5 instead of raising. Missing
        PROBAND_VAF scores 0.0 and missing parental VAF scores 0.5, as
        defined by vaf_het_score / parent_clean_score.

    Returns
    -------
    float
        The weighted score.
    """
    NEUTRAL = 0.5

    def metric(column, lo, hi, high_is_good=True):
        # row.get() returns None when the column does not exist; arithmetic
        # on None would raise TypeError, so short-circuit to the neutral
        # value before normalising.
        value = row.get(column)
        if value is None or pd.isna(value):
            return NEUTRAL
        normalise = norm_high_is_good if high_is_good else norm_low_is_good
        scaled = normalise(value, lo, hi)
        return NEUTRAL if pd.isna(scaled) else scaled

    # Variant-level metrics
    qual   = metric("QUAL", 30, 200)
    vqslod = metric("VQSLOD", 0, 10)
    qd     = metric("QD", 2, 20)
    mq     = metric("MQ", 40, 60)
    fs     = metric("FS", 0, 60, high_is_good=False)
    sor    = metric("SOR", 0, 3, high_is_good=False)
    mqr    = metric("MQRankSum", -12.5, 2)
    rprs   = metric("ReadPosRankSum", -8, 2)

    # Trio genotype confidence
    p_dp = metric("PROBAND_DP", 15, 60)
    p_gq = metric("PROBAND_GQ", 30, 99)
    m_dp = metric("MOTHER_DP", 15, 60)
    m_gq = metric("MOTHER_GQ", 30, 99)
    f_dp = metric("FATHER_DP", 15, 60)
    f_gq = metric("FATHER_GQ", 30, 99)

    # VAF patterns
    p_vaf = vaf_het_score(row.get("PROBAND_VAF"))
    m_vaf = parent_clean_score(row.get("MOTHER_VAF"))
    f_vaf = parent_clean_score(row.get("FATHER_VAF"))

    # SNV preference: non-SNV rows (or rows without TYPE) are scaled by 0.85
    type_bonus = 1.0 if str(row.get("TYPE")).upper() == "SNV" else 0.85

    # Term order is kept fixed so floating-point results stay reproducible.
    score = (
        12*qual +
        10*vqslod +
        10*qd +
        12*mq +
        8*fs +
        6*sor +
        4*mqr +
        4*rprs +
        8*(p_dp + p_gq)/2 +
        6*(m_dp + m_gq)/2 +
        6*(f_dp + f_gq)/2 +
        14*p_vaf +
        12*(m_vaf + f_vaf)/2
    ) * type_bonus

    return float(score)

# Score each row (axis="columns" is the named form of axis=1) and store the
# result in a new TP_SCORE column.
df["TP_SCORE"] = df.apply(
    compute_tp_score,
    axis="columns",
)


In [5]:
# Rank every variant by descending score and keep the ten best.
top10 = (
    df
    .sort_values(by="TP_SCORE", ascending=False)
    .head(10)
)

# Columns shown in the summary view: locus, score, site metrics, then the
# per-sample VAF / depth / GQ triplets.
top10_cols = [
    "CHROM", "POS", "REF", "ALT", "TYPE", "TP_SCORE",
    "QUAL", "VQSLOD", "QD", "FS", "SOR", "MQ",
    "PROBAND_VAF", "MOTHER_VAF", "FATHER_VAF",
    "PROBAND_DP", "MOTHER_DP", "FATHER_DP",
    "PROBAND_GQ", "MOTHER_GQ", "FATHER_GQ",
]

top10.loc[:, top10_cols]


Unnamed: 0,CHROM,POS,REF,ALT,TYPE,TP_SCORE,QUAL,VQSLOD,QD,FS,...,MQ,PROBAND_VAF,MOTHER_VAF,FATHER_VAF,PROBAND_DP,MOTHER_DP,FATHER_DP,PROBAND_GQ,MOTHER_GQ,FATHER_GQ
26,chr5,110094319,G,T,SNV,102.074051,787.03,16.03,13.57,2.226,...,60.0,0.5,0,0,58,40,34,99,99,99
51,chr10,95296731,G,A,SNV,101.384326,568.03,15.61,14.2,0.0,...,60.0,0.5,0,0,40,36,38,99,99,99
28,chr6,11271313,A,C,SNV,99.81068,639.03,16.64,12.53,10.318,...,60.0,0.4902,0,0,51,39,34,99,99,99
69,chr14,41468025,G,A,SNV,99.671547,500.03,15.72,13.16,2.864,...,60.0,0.5,0,0,38,33,34,99,93,99
40,chr8,43348633,G,C,SNV,99.665447,517.03,16.07,11.75,4.388,...,60.0,0.5,0,0,44,39,39,99,99,99
0,chr1,9999315,G,A,SNV,99.661103,464.03,16.33,12.21,4.815,...,60.0,0.5,0,0,38,42,41,99,99,99
43,chr8,125008170,A,G,SNV,99.645615,747.03,15.46,13.58,2.31,...,60.0,0.4909,0,0,55,35,35,99,99,99
1,chr1,90679453,T,C,SNV,99.406407,500.03,16.68,13.16,3.053,...,60.0,0.5,0,0,38,26,35,99,72,99
4,chr1,229575608,A,C,SNV,99.076371,701.03,15.03,12.75,1.026,...,60.0,0.5091,0,0,55,31,30,99,90,87
73,chr16,7723068,A,C,SNV,98.834287,355.03,15.76,12.68,1.51,...,60.0,0.5,0,0,28,34,38,99,99,99


In [6]:
# Destination file for the ten top-scoring rows.
OUT_TSV = "/content/filtered_CHOP/CDL-068-99.top10_true_positive_likelihood.tsv"

# Write all columns of top10 (not just the display subset) as TSV, without
# the DataFrame index.
top10.to_csv(
    OUT_TSV,
    sep="\t",
    index=False,
)

# Echo the path as the cell output.
OUT_TSV


'/content/filtered_CHOP/CDL-068-99.top10_true_positive_likelihood.tsv'