# SG10K CYP2D6 Caller Consensus

In [71]:
import pandas as pd
import json
import numpy as np
import itertools
import collections

from pathlib import Path
from callers import Cyp2d6CallerOutput
from diplotype import Diplotype, Haplotype, StarAllele

## Load caller output files

In [104]:
# Root folder with one sub-directory per sample; each sample directory holds
# one sub-directory per caller.
caller_output_dir = Path.cwd() / "../../_data/caller-outputs"

# samples: {sample name -> {caller name -> caller output directory}}.
# Only directories are kept at both levels, so stray files (e.g. .DS_Store)
# neither make iterdir() raise NotADirectoryError nor show up as bogus
# samples or callers.
samples = {
    sample_path.stem: {
        caller_path.stem: caller_path
        for caller_path in sample_path.iterdir()
        if caller_path.is_dir()
    }
    for sample_path in caller_output_dir.resolve().iterdir()
    if sample_path.is_dir()
}

{'stellarpgx': PosixPath('/Users/torojr/projects/nalagenetics/SG10K-CYP2D6/_data/caller-outputs/WHB4244/stellarpgx'), 'cyrius': PosixPath('/Users/torojr/projects/nalagenetics/SG10K-CYP2D6/_data/caller-outputs/WHB4244/cyrius'), 'aldy': PosixPath('/Users/torojr/projects/nalagenetics/SG10K-CYP2D6/_data/caller-outputs/WHB4244/aldy')}


In [73]:
# Number of entries in `samples` (one per item found in caller_output_dir).
len(samples)

1852

In [74]:
# Glob pattern (relative to a caller's output directory) that locates the
# caller's result file; "{sample}" is replaced by the sample name.
caller_file_patterns = {
    "aldy": "**/*.aldy",
    "cyrius": "**/{sample}.json",
    "stellarpgx": "**/*.alleles",
}

# outputs: list of (sample name, caller name, parsed caller output) tuples.
outputs = []
for sample, callers in samples.items():
    for caller, caller_path in callers.items():
        if caller not in caller_file_patterns:
            # Fail loudly instead of reusing a stale/unbound caller_file.
            raise ValueError(f"Unsupported caller {caller!r} for sample {sample!r}")

        pattern = caller_file_patterns[caller].format(sample=sample)
        # First match only, as before; a missing file gets an explicit error
        # rather than a bare IndexError.
        caller_file = next(caller_path.glob(pattern), None)
        if caller_file is None:
            raise FileNotFoundError(
                f"No file matching {pattern!r} under {caller_path} for sample {sample!r}"
            )

        # Dispatch to the constructor named after the caller without eval(),
        # so paths or sample names containing quotes/backslashes cannot break
        # (or inject into) the evaluated expression.
        parse_output = getattr(Cyp2d6CallerOutput, caller)
        caller_output = parse_output(str(caller_file), sample_name=sample)
        outputs.append((sample, caller, caller_output))



In [75]:
# Column labels of the combined caller table, in output order.
index = ["sample", "caller", "diplotype_raw_string", "diplotype", "haplotypes", "star_alleles", "filter", "is_novel"]


def _diplotype_entry(sample, caller, diplotype):
    """Build one table row (a Series labelled by ``index``) for one diplotype call."""
    raw = diplotype.raw
    parsed = diplotype.parsed
    # Parsed haplotype strings, in the haplotypes' own sort order.
    parsed_haplotypes = [hap.parsed for hap in sorted(diplotype.haplotypes)]
    # Flatten every (sorted) star allele into its as_list() components.
    flat_star_alleles = [sa for star_allele in sorted(diplotype.star_alleles) for sa in star_allele.as_list()]
    row = [sample, caller, raw, parsed, parsed_haplotypes, flat_star_alleles, diplotype.filt, diplotype.is_novel]
    return pd.Series(row, index=index)


# One row per diplotype reported by each caller for each sample.
entries = [
    _diplotype_entry(sample, caller, diplotype)
    for sample, caller, caller_output in outputs
    for diplotype in caller_output.diplotypes
]

caller_outputs_df = pd.DataFrame(entries)
caller_outputs_df = caller_outputs_df.sort_values(["sample", "caller"]).reset_index(drop=True)
for categorical_column in ("caller", "filter"):
    caller_outputs_df[categorical_column] = caller_outputs_df[categorical_column].astype("category")
caller_outputs_df

Unnamed: 0,sample,caller,diplotype_raw_string,diplotype,haplotypes,star_alleles,filter,is_novel
0,WHB3374,aldy,*1/*10,*1/*10,"[*1, *10]","[*1, *10]",,False
1,WHB3374,cyrius,*1/*10,*1/*10,"[*1, *10]","[*1, *10]",PASS,False
2,WHB3374,stellarpgx,*1/*10,*1/*10,"[*1, *10]","[*1, *10]",,False
3,WHB3375,aldy,*10/*36.ALDY,*10/*36,"[*10, *36]","[*10, *36]",,False
4,WHB3375,cyrius,*5/*36+*10,*5/*10+*36,"[*5, *10+*36]","[*5, *10, *36]",PASS,False
...,...,...,...,...,...,...,...,...
5571,WHB5468,cyrius,*1/*1,*1/*1,"[*1, *1]","[*1, *1]",PASS,False
5572,WHB5468,stellarpgx,*1/*1,*1/*1,"[*1, *1]","[*1, *1]",,False
5573,WHB5469,aldy,*1/*36.ALDY+*10,*1/*10+*36,"[*1, *10+*36]","[*1, *10, *36]",,False
5574,WHB5469,cyrius,*1/*36+*10,*1/*10+*36,"[*1, *10+*36]","[*1, *10, *36]",PASS,False


In [76]:
# Per-column summary of the combined caller table.
caller_outputs_df.describe()

Unnamed: 0,sample,caller,diplotype_raw_string,diplotype,haplotypes,star_alleles,filter,is_novel
count,5576,5576,5576,5576,5576,5576,1649,5576
unique,1852,3,444,349,348,310,2,2
top,WHB4932,aldy,*1/*36+*10,*1/*10+*36,"[*1, *10+*36]","[*1, *10, *36]",PASS,False
freq,5,1867,468,710,710,710,1641,5442


## Consensus

In [77]:
def diplotype_consensus(diplotypes, return_agreement=False):
    """Return the majority diplotype of a group, or how strongly it is supported.

    Every entry of ``diplotypes`` counts as one vote; missing values are
    counted as votes too (``dropna=False``), so NaN can itself win.  A
    consensus exists when only one distinct value is present, or when the most
    frequent value is strictly more frequent than the runner-up.  Ties (and
    empty input) yield NaN.

    Parameters
    ----------
    diplotypes : pandas.Series
        Diplotype calls to vote over.
    return_agreement : bool, default False
        If True, return the winning value's share of the votes (0-1) instead
        of the winning value.

    Returns
    -------
    The consensus value, or its vote fraction when ``return_agreement`` is
    True; NaN when there is no consensus.
    """
    values = diplotypes.value_counts(dropna=False, normalize=True)
    consensus = np.nan
    agreement = np.nan

    # value_counts() sorts by frequency, so position 0 is the front-runner.
    # Use .iloc explicitly: the index holds diplotype labels, and integer keys
    # on a label index rely on a deprecated positional fallback.
    if len(values) == 1 or (len(values) > 1 and values.iloc[0] > values.iloc[1]):
        consensus = values.index[0]
        agreement = values.iloc[0]

    return agreement if return_agreement else consensus


def haplotype_consensus(haplotypes, return_agreement=False):
    """Vote on haplotypes position by position across the rows of a group.

    Each element of ``haplotypes`` is a list of haplotypes.  For every list
    position the values found at that position are tallied (rows whose list
    is too short contribute NaN, which is counted as a vote).  A position has
    a consensus when only one distinct value is present or when the most
    frequent value is strictly more frequent than the runner-up; otherwise
    that position is NaN.

    Parameters
    ----------
    haplotypes : pandas.Series
        Series whose elements are lists of haplotypes.
    return_agreement : bool, default False
        If True, report each position's winning vote fraction instead of the
        winning value.

    Returns
    -------
    pandas.Series
        Same index as ``haplotypes``; every row holds the same per-position
        list (consensus values, or agreements when ``return_agreement``).
    """
    consensus_list = []
    agreements = []

    # Longest haplotype list in the group; 0 for an empty group so that the
    # loop is simply skipped instead of failing on range(nan).
    n_positions = max((len(haps) for haps in haplotypes), default=0)

    for i in range(n_positions):
        values = haplotypes.str[i].value_counts(dropna=False, normalize=True)
        consensus = np.nan
        agreement = np.nan
        # Positional access via .iloc: the index holds haplotype labels, and
        # integer keys on it rely on a deprecated positional fallback.
        if len(values) == 1 or values.iloc[0] > values.iloc[1]:
            consensus = values.index[0]
            agreement = values.iloc[0]

        consensus_list.append(consensus)
        agreements.append(agreement)

    # Broadcast the group-level result to every row of the group.
    per_row = agreements if return_agreement else consensus_list
    return pd.Series([per_row] * len(haplotypes), index=haplotypes.index)


def star_allele_intersection(star_alleles):
    """Return, for every row, the star alleles common to all rows of the group.

    Each element of ``star_alleles`` is a collection of star-allele strings.
    The intersection across all rows is sorted with ``StarAllele.from_string``
    as the sort key and repeated for every row, so the result keeps the index
    of ``star_alleles``.
    """
    shared = set.intersection(*(set(row) for row in star_alleles.values))
    ordered = sorted(shared, key=StarAllele.from_string)
    # Same list for each row of the group.
    return pd.Series([ordered] * len(star_alleles), index=star_alleles.index)



In [78]:
# Diplotype-level vote within each sample, broadcast back to every row.
diplotype_groups = caller_outputs_df.groupby("sample")["diplotype"]
diplotype_votes = diplotype_groups.transform(diplotype_consensus)
diplotype_vote_shares = diplotype_groups.transform(diplotype_consensus, return_agreement=True)
caller_outputs_df["diplotype_consensus"] = diplotype_votes
caller_outputs_df["diplotype_consensus_agreement"] = diplotype_vote_shares

# Haplotype-level vote and star-allele intersection are computed only for the
# rows without a diplotype consensus; all other rows stay NaN on assignment.
unresolved_groups = caller_outputs_df[caller_outputs_df["diplotype_consensus"].isna()].groupby("sample")
haplotype_votes = unresolved_groups["haplotypes"].transform(haplotype_consensus)
haplotype_vote_shares = unresolved_groups["haplotypes"].transform(haplotype_consensus, return_agreement=True)
shared_star_alleles = unresolved_groups["star_alleles"].transform(star_allele_intersection)
caller_outputs_df["haplotype_consensus"] = haplotype_votes
caller_outputs_df["haplotype_consensus_agreement"] = haplotype_vote_shares
caller_outputs_df["star_allele_intersection"] = shared_star_alleles

## Aggregate by sample

In [79]:
consensus_df_columns = [
    "sample",
    "diplotype_consensus",
    "diplotype_consensus_agreement",
    "haplotype_consensus",
    "haplotype_consensus_agreement",
    "star_allele_intersection",
]

# Keep the first row of each distinct combination of the consensus columns.
# Values are compared as strings (presumably because the list-valued cells
# cannot be hashed by drop_duplicates - confirm).
stringified_consensus = caller_outputs_df[consensus_df_columns].astype(str)
consensus_df_index = stringified_consensus.drop_duplicates().index
consensus_df = caller_outputs_df.loc[consensus_df_index, consensus_df_columns].set_index("sample")

caller_outputs_df_grouped = caller_outputs_df.groupby("sample")

# Flag column -> query selecting the rows of the caller the flag refers to.
novel_flag_queries = {
    "aldy_novel": "caller == \"aldy\"",
    "stellarpgx_novel": "caller == \"stellarpgx\"",
}
for flag_column, caller_query in novel_flag_queries.items():
    # True when any call made by that caller for the sample is marked novel.
    consensus_df[flag_column] = consensus_df.apply(
        lambda row: any(caller_outputs_df_grouped.get_group(row.name).query(caller_query)["is_novel"]),
        axis=1,
    )

In [80]:
# For each sample, collect every caller's raw diplotype strings under a
# "<caller>_diplotype" key (an array, since a caller may report several).
raw_calls_by_sample = {
    sample: {
        f"{caller}_diplotype": caller_df["diplotype_raw_string"].values
        for caller, caller_df in sample_df.groupby("caller")
    }
    for sample, sample_df in caller_outputs_df.groupby("sample")
}
idx = list(raw_calls_by_sample.keys())
data = list(raw_calls_by_sample.values())

raw_calls_df = pd.DataFrame(data, index=idx)
consensus_df = consensus_df.merge(raw_calls_df, how="left", left_index=True, right_index=True)

## Add metadata

In [81]:
# Sample metadata table, indexed by the research ID.
metadata_df = (
    pd.read_csv("s3://npm-grids-nalagenetics-collaboration/SG10K_DRAGEN_CYP2D6_sample_metadata.csv")
    .set_index("npm_research_id")
)

# Attach sex and ancestry to each sample; samples absent from the metadata
# keep NaN in these columns (left join on the index).
sample_annotations = metadata_df[["genetic_sex", "genetic_ancestry"]]
consensus_df = consensus_df.merge(sample_annotations, how="left", left_index=True, right_index=True)

In [83]:
# Number of rows in consensus_df that have a diplotype-level consensus.
consensus_df["diplotype_consensus"].notna().sum()

1514

In [84]:
# Rows for which no diplotype-level consensus was reached.
no_consensus_df = consensus_df[consensus_df["diplotype_consensus"].isna()]


def _has_missing_position(haplotype_votes):
    """True when at least one entry of the haplotype consensus is missing."""
    return pd.Series(haplotype_votes).isna().sum() > 0


# How many of those rows still lack a consensus for at least one haplotype.
no_consensus_df["haplotype_consensus"].apply(_has_missing_position).sum()

312

## Diplotype Frequencies

In [85]:
# Restrict to samples that neither Aldy nor StellarPGx flagged as novel.
not_novel_df = consensus_df.query("(aldy_novel == False) & (stellarpgx_novel == False)")


def _consensus_counts(frame, normalize=False):
    """Tally diplotype_consensus values (NaN included) as counts or fractions."""
    return frame["diplotype_consensus"].value_counts(dropna=False, normalize=normalize)


# Overall count (AC) and fraction (AF) per consensus diplotype ...
diplotypes_df = _consensus_counts(not_novel_df).to_frame(name="AC")
diplotypes_df["AF"] = _consensus_counts(not_novel_df, normalize=True)

# ... followed by the same pair of columns for each genetic ancestry group.
for eth, eth_df in not_novel_df.groupby("genetic_ancestry"):
    diplotypes_df[f"AC_{eth}"] = _consensus_counts(eth_df)
    diplotypes_df[f"AF_{eth}"] = _consensus_counts(eth_df, normalize=True)

# Diplotypes never seen in a group have no entry there; report them as 0.
diplotypes_df = diplotypes_df.fillna(0)
diplotypes_df

Unnamed: 0,AC,AF,AC_C,AF_C,AC_I,AF_I,AC_M,AF_M,AC_O,AF_O
,290,0.166284,200.0,0.154560,27.0,0.143617,63.0,0.242308,0.0,0.0
*1/*10+*36,235,0.134748,210.0,0.162287,2.0,0.010638,23.0,0.088462,0.0,0.0
*10+*36/*10+*36,162,0.092890,150.0,0.115920,1.0,0.005319,11.0,0.042308,0.0,0.0
*10/*10+*36,150,0.086009,130.0,0.100464,1.0,0.005319,19.0,0.073077,0.0,0.0
*1/*1,119,0.068234,76.0,0.058733,23.0,0.122340,20.0,0.076923,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
*35/*86,1,0.000573,0.0,0.000000,1.0,0.005319,0.0,0.000000,0.0,0.0
*2/*14,1,0.000573,1.0,0.000773,0.0,0.000000,0.0,0.000000,0.0,0.0
*1/*17,1,0.000573,0.0,0.000000,1.0,0.005319,0.0,0.000000,0.0,0.0
*10/*52,1,0.000573,1.0,0.000773,0.0,0.000000,0.0,0.000000,0.0,0.0


## Novel alleles

In [86]:
# Samples flagged as novel by Aldy, by StellarPGx, or by both.
novel_df = consensus_df.query("aldy_novel == True | stellarpgx_novel == True")
novel_df

Unnamed: 0_level_0,diplotype_consensus,diplotype_consensus_agreement,haplotype_consensus,haplotype_consensus_agreement,star_allele_intersection,aldy_novel,stellarpgx_novel,aldy_diplotype,cyrius_diplotype,stellarpgx_diplotype,genetic_sex,genetic_ancestry
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
WHB3386,*4+*68/*41,0.666667,,,,True,True,[*41/*68+*4.021+rs3021084],[*41/*68+*4],[*136/*4],male,I
WHB3406,,,"[*1, nan]","[0.75, nan]",[],True,False,"[*1+rs769258/*61, *1+rs769258/*83.ALDY]",[no_call],[*143/*1],male,C
WHB3440,*1/*10+*36,0.666667,,,,False,True,[*1/*36.ALDY+*10],[*1/*36+*10],[*10/*39],female,C
WHB3459,,,"[*10, nan]","[0.6666666666666666, nan]",[*10],True,False,[*10/*61+rs1065852],[*5/*36+*10],[*10/*36],male,C
WHB3461,,,"[nan, nan]","[nan, nan]",[],False,True,[*1/*2],[no_call],[*34/*88],female,C
...,...,...,...,...,...,...,...,...,...,...,...,...
WHB5384,*1/*10+*36,0.666667,,,,False,True,[*1/*36.ALDY+*10],[*1/*36+*10],[*10/*39],male,C
WHB5419,,,"[nan, nan]","[nan, nan]",[],True,True,[*36.ALDY+*36.ALDY+rs140513104/*39],[no_call],[*1/*10],male,C
WHB5444,,,"[*1, nan]","[0.6666666666666666, nan]","[*1, *10]",False,True,[*1+*10/*36.ALDY+*10+*68],[*1/*36+*10],[*1/*10],female,C
WHB5449,*1/*10+*36,0.666667,,,,False,True,[*1/*36.ALDY+*10],[*1/*36+*10],[*10/*39],female,C


In [87]:
# Samples flagged as novel by both Aldy and StellarPGx.
novel_df.query("aldy_novel == True & stellarpgx_novel == True")

Unnamed: 0_level_0,diplotype_consensus,diplotype_consensus_agreement,haplotype_consensus,haplotype_consensus_agreement,star_allele_intersection,aldy_novel,stellarpgx_novel,aldy_diplotype,cyrius_diplotype,stellarpgx_diplotype,genetic_sex,genetic_ancestry
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
WHB3386,*4+*68/*41,0.666667,,,,True,True,[*41/*68+*4.021+rs3021084],[*41/*68+*4],[*136/*4],male,I
WHB3484,,,"[*1, nan]","[0.6666666666666666, nan]",[],True,True,[*1+rs140513104/*36.ALDY+*10],[no_call],[*1/*10],male,C
WHB3493,*5/*41,0.666667,,,,True,True,[*5/*41+rs730882251],[no_call],[*41/*5],male,C
WHB3656,,,"[*1, nan]","[0.6666666666666666, nan]",[*1],True,True,[*1+*1+rs140513104+rs1065852/*36.ALDY+*36.ALDY],[*1/*36+*36+*10],[*1/*4],female,C
WHB3742,,,"[nan, nan]","[nan, nan]",[],True,True,[*43+rs1135840/*65],[no_call],[*10/*2],male,C
WHB3752,,,"[nan, nan]","[nan, nan]",[],True,True,[*36.ALDY+*65/*53+rs1135840],[no_call],[*2/*49],male,C
WHB3909,*1/*14,0.666667,,,,True,True,[*1+rs140513104/*14],[no_call],[*1/*14],male,C
WHB4120,,,"[*1, nan]","[1.0, nan]",[*1],True,True,[*1/*1+rs1065852],[*1/*36],[*1/*10],male,M
WHB4126,,,"[*1, nan]","[0.6666666666666666, nan]",[],True,True,[*1/*1+rs28371704],[no_call],[*1/*4],female,M
WHB4156,,,"[nan, nan]","[nan, nan]",[],True,True,[*1/*1+rs140513104],[no_call],[*10/*4],female,M


In [98]:
# (rows, columns) of the samples flagged as novel by Aldy but not by StellarPGx.
novel_df.query("aldy_novel == True and stellarpgx_novel == False").shape

(15, 12)

In [100]:
# (rows, columns) of the samples flagged as novel by both Aldy and StellarPGx.
novel_df.query("aldy_novel == True and stellarpgx_novel == True").shape

(24, 12)

In [97]:
# (rows, columns) of all samples flagged as novel by Aldy.
novel_df.query("aldy_novel == True").shape

(39, 12)