# Caller Consensus

## Data prep

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from pathlib import Path
from callers import Cyp2d6CallerOutput
from diplotype import Diplotype

In [2]:
caller_outputs_path = Path("/home/torojr/SG10K-CYP2D6/_data/caller_outputs")
stellarpgx_outputs_path = caller_outputs_path / "stellarpgx"
stellarpgx_sample_dirs = stellarpgx_outputs_path.glob("*")
sample_list = [sample.name for sample in stellarpgx_sample_dirs]

In [3]:
diplotypes = {}

for sample in sample_list:
    diplotypes[sample] = {}
    files = {
        "cyrius": [file for file in caller_outputs_path.glob(f"sg10k-dragen/{sample}/cyrius/*.json")][0],
        "aldy": [file for file in caller_outputs_path.glob(f"sg10k-dragen/{sample}/aldy/*.aldy")][0],
        "stellarpgx": [file for file in caller_outputs_path.glob(f"stellarpgx/{sample}/**/*.alleles")][0]
    }
    
    for caller, file in files.items():
        caller_output = eval(f"Cyp2d6CallerOutput.{caller}(\"{file}\")")
        diplotypes[sample][f"raw_{caller}"] = caller_output.diplotype.raw
        diplotypes[sample][caller] = caller_output.diplotype.parsed



## Calculate consensus

In [4]:
def simple_consensus(cyrius, aldy, stellarpgx):
    consensus = None
    if cyrius == aldy == stellarpgx:
        consensus = cyrius
    elif cyrius == aldy:
        consensus = cyrius
    elif cyrius == stellarpgx:
        consensus =  cyrius
    elif aldy == stellarpgx:
        consensus = aldy
    return consensus

In [5]:
diplotypes_df = pd.DataFrame.from_dict(diplotypes, orient="index")
diplotypes_df["all_match"] = (diplotypes_df["cyrius"] == diplotypes_df["aldy"]) & (diplotypes_df["cyrius"] == diplotypes_df["stellarpgx"])
diplotypes_df["diplotype"] = diplotypes_df.apply(lambda sample: simple_consensus(sample["cyrius"], sample["aldy"], sample["stellarpgx"]), axis=1)

In [6]:
#diplotypes_df

## Add metadata

In [7]:
metadata = pd.read_csv("/home/torojr/SG10K-CYP2D6/_data/metadata/SG10K-DRAGEN.sample_list_full.20210719.csv")
metadata.set_index("npm_research_id", inplace=True)

In [8]:
diplotypes_with_metadata = pd.merge(diplotypes_df, metadata, how="left", left_index=True, right_index=True,)

In [9]:
#diplotypes_with_metadata.sample(5)

## Consensus aggregate data

In [10]:
consensus_df = diplotypes_with_metadata.loc[(diplotypes_with_metadata["diplotype"].notna()) & (diplotypes_with_metadata["diplotype"] != "no_call")]
#consensus_df.sample(5)

In [24]:
ethnicity_counts = consensus_df.groupby("genetic_ancestry")["diplotype"].count()
ethnicity_counts

genetic_ancestry
C    1154
I     166
M     203
O       2
Name: diplotype, dtype: int64

In [12]:
ethnicity_counts.sum()

1525

In [13]:
aggregate_df = consensus_df.groupby(["diplotype", "genetic_ancestry"])["diplotype"].count().to_frame().rename(columns={"diplotype": "diplotype_count"}).reset_index()
aggregate_df["diplotype_frequency"] = aggregate_df.apply(lambda x: x["diplotype_count"] / ethnicity_counts.loc[x["genetic_ancestry"]], axis=1)
aggregate_df["haplotypes"] = aggregate_df["diplotype"].str.split("/")
aggregate_df["star_alleles"] = aggregate_df["diplotype"].apply(Diplotype.from_string).apply(lambda x: list(pd.Series([list(h.star_alleles.elements()) for h in x.haplotypes]).explode()))
aggregate_df.sample(5)

Unnamed: 0,diplotype,genetic_ancestry,diplotype_count,diplotype_frequency,haplotypes,star_alleles
144,*41/*41,M,1,0.004926,"[*41, *41]","[*41, *41]"
50,*10+*36/*14,C,9,0.007799,"[*10+*36, *14]","[*10, *36, *14]"
161,*5/*71,C,3,0.0026,"[*5, *71]","[*5, *71]"
79,*10/*41,I,1,0.006024,"[*10, *41]","[*10, *41]"
138,*4/*41,I,3,0.018072,"[*4, *41]","[*4, *41]"


In [14]:
haplotype_counts_by_ethnicity = {}
star_allele_counts_by_ethnicity = {}
for ethnicity, data in aggregate_df.groupby("genetic_ancestry"):
    haplotype_counts_by_ethnicity[ethnicity] = Counter(data["haplotypes"].explode())
    star_allele_counts_by_ethnicity[ethnicity] = Counter(data["star_alleles"].explode())

In [15]:
aggregate_df["haplotype_count"] = aggregate_df.apply(lambda x: [haplotype_counts_by_ethnicity[x["genetic_ancestry"]].get(haplotype) for haplotype in x["haplotypes"]], axis=1)
aggregate_df["haplotype_frequency"] = aggregate_df.apply(lambda x: [hc / haplotype_counts_by_ethnicity[x["genetic_ancestry"]].total() for hc in x["haplotype_count"]], axis=1)
aggregate_df["star_allele_count"] = aggregate_df.apply(lambda x: [star_allele_counts_by_ethnicity[x["genetic_ancestry"]].get(star_allele) for star_allele in x["star_alleles"]], axis=1)
aggregate_df["star_allele_frequency"] = aggregate_df.apply(lambda x: [sac / star_allele_counts_by_ethnicity[x["genetic_ancestry"]].total() for sac in x["star_allele_count"]], axis=1)
aggregate_df[aggregate_df["diplotype"].str.contains("x")].sample(5)

Unnamed: 0,diplotype,genetic_ancestry,diplotype_count,diplotype_frequency,haplotypes,star_alleles,haplotype_count,haplotype_frequency,star_allele_count,star_allele_frequency
123,*2x2/*41,M,1,0.004926,"[*2x2, *41]","[*2, *2, *41]","[2, 8]","[0.02702702702702703, 0.10810810810810811]","[11, 11, 8]","[0.11956521739130435, 0.11956521739130435, 0.0..."
121,*2x2/*10,C,1,0.000867,"[*2x2, *10]","[*2, *2, *10]","[4, 15]","[0.02631578947368421, 0.09868421052631579]","[22, 22, 45]","[0.11224489795918367, 0.11224489795918367, 0.2..."
94,*1x2/*41,C,1,0.000867,"[*1x2, *41]","[*1, *1, *41]","[4, 13]","[0.02631578947368421, 0.08552631578947369]","[26, 26, 13]","[0.1326530612244898, 0.1326530612244898, 0.066..."
10,*1/*10+*36x2,M,1,0.004926,"[*1, *10+*36x2]","[*1, *10, *36, *36]","[12, 2]","[0.16216216216216217, 0.02702702702702703]","[14, 25, 15, 15]","[0.15217391304347827, 0.2717391304347826, 0.16..."
88,*14/*36x2,C,1,0.000867,"[*14, *36x2]","[*14, *36, *36]","[7, 1]","[0.046052631578947366, 0.006578947368421052]","[7, 36, 36]","[0.03571428571428571, 0.1836734693877551, 0.18..."


In [16]:
# Export
columns = ["genetic_ancestry", "diplotype", "diplotype_count", "diplotype_frequency", "haplotypes", "haplotype_count", "haplotype_frequency", "star_alleles", "star_allele_count", "star_allele_frequency"]
aggregate_df = aggregate_df.loc[:, columns]
aggregate_df.sort_values(["diplotype_count"], ascending=False).reset_index(drop=True).to_csv("/home/torojr/SG10K-CYP2D6/_data/sg10k_cyp2d6_diplotype_consensus_v1.tsv", sep="\t", index=False)

In [17]:
aggregate_df["diplotype"].unique()

array(['*1/*1', '*1/*10', '*1/*10+*36', '*1/*10+*36x2', '*1/*113',
       '*1/*14', '*1/*17', '*1/*1x2', '*1/*2', '*1/*21', '*1/*2x2',
       '*1/*33', '*1/*35', '*1/*36', '*1/*4', '*1/*4+*68', '*1/*41',
       '*1/*43', '*1/*49', '*1/*4x2', '*1/*5', '*1/*71', '*1/*94',
       '*10+*36/*10+*36', '*10+*36/*10+*36x2', '*10+*36/*133',
       '*10+*36/*14', '*10+*36/*21', '*10+*36/*35', '*10+*36/*36',
       '*10+*36/*41', '*10+*36/*43', '*10+*36/*49', '*10+*36/*49x2',
       '*10+*36/*65', '*10+*36/*71', '*10+*36x2/*35', '*10+*36x2/*41',
       '*10/*10', '*10/*10+*36', '*10/*111', '*10/*14', '*10/*35',
       '*10/*36', '*10/*41', '*10/*49', '*10/*52', '*10/*71', '*10/*75',
       '*10x2/*10+*36', '*10x2/*94', '*14/*36x2', '*14/*41',
       '*1x2/*10+*36', '*1x2/*2', '*1x2/*4', '*1x2/*41', '*2/*10',
       '*2/*10+*36', '*2/*10+*36x2', '*2/*14', '*2/*2', '*2/*2x2',
       '*2/*3', '*2/*35', '*2/*4', '*2/*4+*68', '*2/*41', '*2/*43',
       '*2/*49', '*2/*5', '*2/*71', '*2x2/*10', '*2x2/*1

## Samples without consensus

In [18]:
no_consensus_df = diplotypes_with_metadata[diplotypes_with_metadata["diplotype"].isna()]
no_consensus_df.sample(5)

Unnamed: 0,raw_cyrius,cyrius,raw_aldy,aldy,raw_stellarpgx,stellarpgx,all_match,diplotype,multiplex_pool_id,supplier_id,...,supplied_gender,genetic_sex,self_reported_ethnicity,genetic_ancestry,extraction_kit,library_prep_kit,sequencing_depth,estimate_of_sequence_coverage,qc_pass_in_r5_3,industry_consent
WHB4651,no_call,no_call,*10+*10+*10+*36.ALDY+*10/*36.ALDY+*10,*10x4+*36/*10+*36,*10/*10x7,*10/*10x7,False,,MUX9238,010-20832,...,M,male,Malay,M,QIAsymphony DSP DNA Midi Kit,NEBNext UltraII DNA library Prep Kit for Illumina,30x,27.61418,True,True
WHB3671,*36+*10/*36+*10,*10+*36/*10+*36,*10/*36.ALDY+*10,*10/*10+*36,*10/*10x3,*10/*10x3,False,,MUX8198,0217-0012,...,F,female,Chinese,C,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",NEBNext UltraII DNA library Prep Kit for Illumina,30x,28.54265,True,False
WHB4552,no_call,no_call,*5/*41,*5/*41,*41/*41,*41/*41,False,,MUX9208,020-04180,...,F,female,Indian,I,QIAsymphony DSP DNA Midi Kit,NEBNext UltraII DNA library Prep Kit for Illumina,30x,29.041328,True,True
WHB4740,*10/*2,*2/*10,*2/*36.ALDY+*10,*2/*10+*36,*2/*36x2+*10,*2/*10+*36x2,False,,MUX9276,010-21410,...,M,male,Malay,M,QIAsymphony DSP DNA Midi Kit,NEBNext UltraII DNA library Prep Kit for Illumina,30x,29.284166,True,True
WHB5126,*14/*36+*10,*10+*36/*14,*2+*36.ALDY+*10/*14+*68,*2+*10+*36/*14+*68,*2/*36+*10,*2/*10+*36,False,,MUX8358,1217-0005,...,M,male,Chinese,C,"Chemagic DNA Blood Kit (Perkin Elmer, MA)",NEBNext UltraII DNA library Prep Kit for Illumina,30x,31.346788,True,False


In [19]:
no_consensus_df["genetic_ancestry"].value_counts()

C    200
M     64
I     27
Name: genetic_ancestry, dtype: int64

In [20]:
(no_consensus_df["cyrius"] == "no_call").value_counts()

False    147
True     144
Name: cyrius, dtype: int64

In [21]:
# Cyrius "no_call"
no_consensus_df[no_consensus_df["cyrius"] == "no_call"][["aldy", "stellarpgx"]].to_csv(index=False).split()

['aldy,stellarpgx',
 '*2/*27,*2/*2',
 '*10/*10+*36,*10+*36/*10+*36',
 '*1/*2,*1/*2x2',
 '*1/*2,*1/*2x2',
 '*10x7+*36/*10+*36,*10/*10x6',
 '*10+*36/*10+*36,*10/*10+*36',
 '*10x2+*36/*10+*36,*10/*10x3',
 '*1/*1,*1/*1x2',
 '*1/*41,*1/*41x2',
 '*2/*10+*36,*2/*10x2+*36',
 '*1/*36x2,*1/*10+*36',
 '*1/*2x2,*1/*2x3',
 '*2+*68/*39,*2/*10+*36',
 '*1/*5,*1/*1',
 '*4/*41,*4x2/*41',
 '*10x4+*36/*10+*36,*10/*10x5',
 '*4/*10+*36,*4x2/*10',
 '*1/*10+*36,*1/*10+*36x2',
 '*10+*36/*10+*36,*10+*36/*10+*36x2',
 '*4/*10+*36,*4x3/*10',
 '*1/*10,*1/*10x2',
 '*1+*36/*10+*36,*1/*10x2+*36x2',
 '*1/*10+*36,*1/*10+*36x2',
 '*1x2/*61,*1/*1x2',
 '*1/*10,*1/*10x2',
 '*10/*10+*36,*10/*10x2',
 '*10+*36/*36,*10+*36/*10+*36',
 '*1/*10,*1/*10x2',
 '*1/*10,*1/*10x2',
 '*10+*36/*10+*36,*10+*36/*10x2+*36',
 '*1/*41,*1/*41x2',
 '*1/*61,*1/*143',
 '*1x2+*10+*36/*10+*36,*1/*10+*36x2',
 '*10/*10,*10/*10x2',
 '*1/*4,*1/*4x2',
 '*5/*119,*34x2/*119',
 '*4/*10,*4x2/*10',
 '*1/*2,*1/*2x2',
 '*10x2/*68,*10/*10+*36',
 '*2/*10+*36,*2/*1

In [22]:
# Cyrius call
no_consensus_df[no_consensus_df["cyrius"] != "no_call"][["cyrius", "aldy", "stellarpgx"]].to_csv(index=False).split()

['cyrius,aldy,stellarpgx',
 '*10+*36/*10+*36,*10x2+*36/*10+*36,*10/*10+*36',
 '*10+*36/*36x2,*10+*36/*10+*36,*10/*10x3',
 '*10/*10+*36,*10+*36/*36,*10/*36x2',
 '*10+*36/*36x2,*10+*36/*10+*36,*10/*36x2',
 '*10+*36/*36x2,*10+*36/*10+*36,*10/*10x3',
 '*2/*10,*2/*10+*36,*2/*10x2+*36',
 '*1/*10+*36x2,*1+*36/*10+*36,*1/*10+*36',
 '*1/*13,*1/*2,*1/*2x2',
 '*10+*36/*10+*36x2,*10x3+*36/*10+*36,*10+*36/*10+*36',
 '*2/*10,*10+*36/*14,*2x2/*10',
 '*2/*10+*36,*10+*36/*63,*2/*36x2',
 '*10+*36x2/*10+*36x2,*10x4+*36/*10+*36,*10+*36/*10+*36x2',
 '*10+*36/*10+*36x2,*10x2+*36/*10+*36,*10+*36/*10+*36',
 '*10/*10,*10/*10+*36,*10+*36/*10+*36',
 '*1/*10+*36x2,*1x2/*36x2,no_call',
 '*10/*10,*10/*10+*36,*10/*10x2+*36',
 '*2/*10+*36,*2/*10+*83,*2/*10',
 '*10+*36/*10+*36x2,*10x2+*36/*10+*36,*10+*36/*10+*36',
 '*5/*10+*36,*10/*36,*10/*10',
 '*2/*10,*2/*10+*36,*2/*10+*36x2',
 '*10/*10+*36,*10+*52/*10+*36+*68,*10x2/*52',
 '*10+*36x2/*41,*10+*36/*36+*41,*39/*69x2',
 '*10+*36x2/*132,*10+*36/*36+*132,*10/*10x3',
 '*10