# Caller Consensus

## Data prep

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from pathlib import Path
from callers import Cyp2d6CallerOutput
from diplotype import Diplotype

In [2]:
# Root folder holding one sub-folder per caller; the StellarPGx folder is used
# as the source of truth for which samples exist.
caller_outputs_path = Path("../../_data/caller_outputs")
stellarpgx_outputs_path = caller_outputs_path / "stellarpgx"
# Keep only visible directories: stray files or hidden entries (e.g. .DS_Store,
# .ipynb_checkpoints) are not samples and would make the per-sample file lookups
# fail later. Sorting makes the processing order reproducible across machines.
stellarpgx_sample_dirs = sorted(
    entry
    for entry in stellarpgx_outputs_path.glob("*")
    if entry.is_dir() and not entry.name.startswith(".")
)
sample_list = [sample.name for sample in stellarpgx_sample_dirs]

In [3]:
def _first_caller_output(pattern):
    """Return the first path (in sorted order) under ``caller_outputs_path`` matching ``pattern``.

    Raises:
        FileNotFoundError: if nothing matches, so the failing sample/caller is
            named in the error instead of surfacing as a bare IndexError.
    """
    # Sorted so the choice is deterministic if more than one file matches.
    matches = sorted(caller_outputs_path.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No caller output matching {pattern!r} under {caller_outputs_path}")
    return matches[0]


# sample name -> {"raw_<caller>": raw diplotype, "<caller>": parsed diplotype}
diplotypes = {}

for sample in sample_list:
    diplotypes[sample] = {}
    files = {
        "cyrius": _first_caller_output(f"sg10k-dragen/{sample}/cyrius/*.json"),
        "aldy": _first_caller_output(f"sg10k-dragen/{sample}/aldy/*.aldy"),
        "stellarpgx": _first_caller_output(f"stellarpgx/{sample}/**/*.alleles"),
    }

    for caller, file in files.items():
        # Look the constructor up by name rather than eval()-ing generated source:
        # eval breaks on paths containing quotes or backslashes and would execute
        # whatever text ends up in the path. The path is passed as a string,
        # exactly as the eval'd string literal was.
        caller_output = getattr(Cyp2d6CallerOutput, caller)(str(file))
        diplotypes[sample][f"raw_{caller}"] = caller_output.diplotype.raw
        diplotypes[sample][caller] = caller_output.diplotype.parsed



## Calculate consensus

In [4]:
def simple_consensus(cyrius, aldy, stellarpgx):
    """Return the call that at least two of the three callers agree on.

    Args:
        cyrius: Call produced by Cyrius.
        aldy: Call produced by Aldy.
        stellarpgx: Call produced by StellarPGx.

    Returns:
        The value shared by two or more callers, or ``None`` when all three
        calls differ.
    """
    # Any agreement that involves Cyrius (including a three-way match)
    # resolves to the Cyrius value.
    if cyrius == aldy or cyrius == stellarpgx:
        return cyrius
    # Otherwise the only possible majority is Aldy + StellarPGx.
    if aldy == stellarpgx:
        return aldy
    return None

In [5]:
# One row per sample, one column per raw/parsed caller result.
diplotypes_df = pd.DataFrame.from_dict(diplotypes, orient="index")

# Flag samples where all three parsed calls are identical.
cyrius_equals_aldy = diplotypes_df["cyrius"] == diplotypes_df["aldy"]
cyrius_equals_stellarpgx = diplotypes_df["cyrius"] == diplotypes_df["stellarpgx"]
diplotypes_df["all_match"] = cyrius_equals_aldy & cyrius_equals_stellarpgx


def _consensus_for_sample(sample):
    """Apply ``simple_consensus`` to the three parsed calls of one sample row."""
    return simple_consensus(sample["cyrius"], sample["aldy"], sample["stellarpgx"])


# Majority-vote diplotype per sample (None when no two callers agree).
diplotypes_df["diplotype"] = diplotypes_df.apply(_consensus_for_sample, axis=1)

In [6]:
#diplotypes_df.sample(10)

## Add metadata

In [7]:
# Sample metadata, indexed by research ID so it can be joined to the
# per-sample diplotype table on the index.
metadata = pd.read_csv(
    "../../_data/metadata/SG10K-DRAGEN.sample_list_full.20210719.csv"
).set_index("npm_research_id")

In [8]:
# Left join on the index: every sample with caller output is kept, with
# metadata columns attached where the index values match.
diplotypes_with_metadata = diplotypes_df.merge(
    metadata,
    how="left",
    left_index=True,
    right_index=True,
)

In [9]:
#diplotypes_with_metadata.sample(5)

## Consensus aggregate data

In [10]:
# Keep samples that reached a consensus, excluding those whose consensus
# value is itself "no_call".
has_consensus = diplotypes_with_metadata["diplotype"].notna()
is_called = diplotypes_with_metadata["diplotype"] != "no_call"
consensus_df = diplotypes_with_metadata.loc[has_consensus & is_called]
#consensus_df.sample(5)

In [11]:
# Number of consensus diplotype calls per genetic ancestry group; used below
# as the denominator for diplotype frequencies.
diplotypes_by_ancestry = consensus_df.groupby("genetic_ancestry")["diplotype"]
ethnicity_counts = diplotypes_by_ancestry.count()
ethnicity_counts

genetic_ancestry
C    1154
I     166
M     203
O       2
Name: diplotype, dtype: int64

In [12]:
# Total number of consensus calls across all ancestry groups.
# NOTE(review): samples with a missing genetic_ancestry are presumably left out
# here, since groupby drops NaN keys by default — confirm against the metadata.
ethnicity_counts.sum()

1525

In [13]:
# One row per (diplotype, ancestry) pair with the number of samples carrying it.
aggregate_df = (
    consensus_df.groupby(["diplotype", "genetic_ancestry"])["diplotype"]
    .count()
    .rename("diplotype_count")
    .reset_index()
)

# Frequency of each diplotype within its own ancestry group. The per-ancestry
# totals are looked up with a vectorized map instead of a row-wise apply.
aggregate_df["diplotype_frequency"] = (
    aggregate_df["diplotype_count"] / aggregate_df["genetic_ancestry"].map(ethnicity_counts)
)

# The two haplotypes of the diplotype, as written (e.g. "*1/*10+*36" -> ["*1", "*10+*36"]).
aggregate_df["haplotypes"] = aggregate_df["diplotype"].str.split("/")


def _flatten_star_alleles(diplotype_string):
    """Return every star allele of every haplotype in the diplotype as one flat list.

    NOTE(review): relies on ``haplotype.star_alleles.elements()`` yielding one
    entry per allele copy (Counter-like) — confirm against the Diplotype class.
    """
    diplotype = Diplotype.from_string(diplotype_string)
    # A plain nested comprehension avoids building a pandas Series per row and
    # does not inject a NaN placeholder when a haplotype has no star alleles.
    return [
        star_allele
        for haplotype in diplotype.haplotypes
        for star_allele in haplotype.star_alleles.elements()
    ]


aggregate_df["star_alleles"] = aggregate_df["diplotype"].apply(_flatten_star_alleles)

In [14]:
def _weighted_item_counts(ancestry_rows, list_column):
    """Count the items of ``list_column``, weighting each row by its ``diplotype_count``.

    Each row's list is repeated ``diplotype_count`` times, the result is
    flattened, and the occurrences are tallied in a Counter.
    """
    repeated_lists = ancestry_rows[list_column] * ancestry_rows["diplotype_count"]
    return Counter(repeated_lists.explode())


# ancestry -> Counter of haplotypes / star alleles observed in that ancestry group
haplotype_counts_by_ethnicity = {}
star_allele_counts_by_ethnicity = {}
for ancestry, ancestry_rows in aggregate_df.groupby("genetic_ancestry"):
    haplotype_counts_by_ethnicity[ancestry] = _weighted_item_counts(ancestry_rows, "haplotypes")
    star_allele_counts_by_ethnicity[ancestry] = _weighted_item_counts(ancestry_rows, "star_alleles")

In [15]:
def _item_counts(row, counts_by_ethnicity, items_column):
    """For each item listed in ``row[items_column]``, look up its count within the row's ancestry group."""
    ancestry_counter = counts_by_ethnicity[row["genetic_ancestry"]]
    return [ancestry_counter.get(item) for item in row[items_column]]


def _item_frequencies(row, counts_by_ethnicity, counts_column):
    """Divide each count in ``row[counts_column]`` by the total count of the row's ancestry group."""
    ancestry_total = sum(counts_by_ethnicity[row["genetic_ancestry"]].values())
    return [item_count / ancestry_total for item_count in row[counts_column]]


# Per-row lists, aligned element-by-element with "haplotypes" / "star_alleles".
aggregate_df["haplotype_count"] = aggregate_df.apply(
    _item_counts, axis=1, args=(haplotype_counts_by_ethnicity, "haplotypes")
)
aggregate_df["haplotype_frequency"] = aggregate_df.apply(
    _item_frequencies, axis=1, args=(haplotype_counts_by_ethnicity, "haplotype_count")
)
aggregate_df["star_allele_count"] = aggregate_df.apply(
    _item_counts, axis=1, args=(star_allele_counts_by_ethnicity, "star_alleles")
)
aggregate_df["star_allele_frequency"] = aggregate_df.apply(
    _item_frequencies, axis=1, args=(star_allele_counts_by_ethnicity, "star_allele_count")
)
#aggregate_df[(aggregate_df["genetic_ancestry"] == "M") & (aggregate_df["diplotype"].str.contains("\*10"))]

In [16]:
# Export the aggregate table, most frequent diplotypes first, as TSV and JSON.
columns = [
    "genetic_ancestry",
    "diplotype",
    "diplotype_count",
    "diplotype_frequency",
    "haplotypes",
    "haplotype_count",
    "haplotype_frequency",
    "star_alleles",
    "star_allele_count",
    "star_allele_frequency",
]
# Restrict to (and order by) the exported columns.
aggregate_df = aggregate_df[columns]
to_export = aggregate_df.sort_values("diplotype_count", ascending=False).reset_index(drop=True)

export_file_name = "sg10k_cyp2d6_diplotype_consensus_v2"
to_export.to_csv(f"../../_data/{export_file_name}.tsv", sep="\t", index=False)
to_export.to_json(f"../../_data/{export_file_name}.json", indent=4)

In [17]:
# Spot check: total occurrences of the "*10+*36x2" haplotype across all ancestry
# groups, obtained by repeating each row's haplotype list diplotype_count times.
(aggregate_df["haplotypes"] * aggregate_df["diplotype_count"]).explode().value_counts().loc["*10+*36x2"]

12

## Samples without consensus

In [18]:
# Samples for which no two callers agreed (consensus diplotype is missing).
lacks_consensus = diplotypes_with_metadata["diplotype"].isna()
no_consensus_df = diplotypes_with_metadata.loc[lacks_consensus]
#no_consensus_df.sample(5)

In [19]:
# Number of no-consensus samples per genetic ancestry group.
no_consensus_df["genetic_ancestry"].value_counts()

C    200
M     64
I     27
Name: genetic_ancestry, dtype: int64

In [20]:
# How many no-consensus samples have a Cyrius "no_call" (True) versus an actual Cyrius call (False).
(no_consensus_df["cyrius"] == "no_call").value_counts()

False    147
True     144
Name: cyrius, dtype: int64

In [21]:
# Cyrius "no_call": list the disagreeing Aldy / StellarPGx calls for these samples.
# The CSV text is split on whitespace, giving one "aldy,stellarpgx" string per sample
# (header first).
no_consensus_df[no_consensus_df["cyrius"] == "no_call"][["aldy", "stellarpgx"]].to_csv(index=False).split()

['aldy,stellarpgx',
 '*10/*49,*10/*49x2',
 '*1/*10+*36,*1/*10+*36x2',
 '*1/*49,*1/*49x2',
 '*4/*10+*36,*4x3/*10',
 '*1/*41,*34/*119x2',
 '*10/*36,*10/*10+*36',
 '*36x2/*65x2,*10/*65x2',
 '*4/*86,*4+*68/*86',
 '*1/*5,*1/*1',
 '*1/*4,*1/*4x2',
 '*4+*68/*36,*4x2/*10+*36',
 '*63/*80,*2/*41',
 '*10x2/*68,*10/*10+*36',
 '*10/*10+*36,*10/*10',
 '*10+*36/*36,*10+*36/*10+*36',
 '*1/*41,*1/*41x2',
 '*10x3/*10+*36+*83,*10x2+*36x3/*39',
 '*1x2/*61,*1/*1x2',
 '*10/*36,*10/*10+*36',
 '*4/*10+*36,*4/*10x2',
 '*10/*10+*36,*10+*36/*10+*36',
 '*2/*2,*2/*2x2',
 '*2/*10+*36,*2/*10',
 '*2/*10+*36,*2/*10',
 '*1/*10+*36,*1/*10x2+*36',
 '*2/*10,*2x2/*10',
 '*1/*1,*1/*1x2',
 '*10x2+*36/*10+*36,*10+*36/*10+*36',
 '*1/*1,*1/*1x2',
 '*1/*10,*1/*10x2',
 '*10+*36/*10+*36,*10/*36x2',
 '*2/*10+*36,*2/*10x2+*36',
 '*1/*2,*1/*2x2',
 '*2/*10+*36,*2/*10x2+*36',
 '*1x2+*10+*36/*10x2+*36,*1/*10+*36x3',
 '*10/*10+*36,*10+*36/*10+*36',
 '*2/*10,*2x2/*10',
 '*10+*36/*10+*36,*10+*36/*10x2+*36',
 '*1+*36/*10+*36,*1/*10+*36',
 '

In [22]:
# Cyrius call: no-consensus samples where Cyrius did produce a call, listed as
# one "cyrius,aldy,stellarpgx" string per sample (header first) for side-by-side
# comparison of the three callers.
no_consensus_df[no_consensus_df["cyrius"] != "no_call"][["cyrius", "aldy", "stellarpgx"]].to_csv(index=False).split()

['cyrius,aldy,stellarpgx',
 '*2/*10+*36x2,*2+*36/*10+*36,*10+*36/*39',
 '*1/*10+*36x2,*1+*36/*10+*36,*1/*10+*36',
 '*10/*10+*36,*10+*36/*10+*36,*10+*36/*10+*36x2',
 '*1/*10,*1/*10+*36,*1/*10x2+*36',
 '*1/*2,*13/*63,*1/*34',
 '*1/*10,*1/*10+*36,*1/*10+*36x2',
 '*5/*41,*1/*41,*1/*41x2',
 '*2/*10+*36x2,*10+*36/*83,*36x2/*39',
 '*10+*36x2/*49,*10+*36/*36+*49,*36x2/*49x2',
 '*10/*10,*10/*36,no_call',
 '*1/*10+*36,*1+*36/*10+*36,*1/*10x2+*36',
 '*2/*10,*2/*10+*36,*2/*10+*36x2',
 '*2/*10+*36,*2/*10+*83,*2/*10',
 '*1/*10+*36,*1x2/*10+*36+*68,*1/*10x2',
 '*10/*10+*36,*10+*36/*10+*36,*10+*36/*10+*36x2',
 '*2/*10+*36x2,*2+*36/*10+*36,*36x2/*39',
 '*10/*10+*36,*10x2/*10+*36,*10/*10x2',
 '*10+*36/*10+*36,*10x2+*36/*10+*36,*10/*10x3',
 '*1/*10,*36/*39,*1/*36',
 '*10/*10,*10/*10+*36,*10/*10x3',
 '*5/*10+*36,*36/*36,no_call',
 '*1/*13,*1/*2,*1/*2x2',
 '*5/*10+*36,*10/*61,*10/*36',
 '*1x2/*10+*36,*1+*79/*36,no_call',
 '*2/*5,*1/*2,*1/*2x2',
 '*10/*10+*36,*10+*52/*10+*36+*68,*10x2/*52',
 '*10+*36/*10+*3