In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from diplotype import Diplotype


# Load the caller outputs parquet file

In [2]:
# Read the per-caller genotype table from the parquet file and print a
# summary of its columns, dtypes and non-null counts.
caller_outputs_df = pd.read_parquet(path="output/1_caller_outputs.parquet")
caller_outputs_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5576 entries, 0 to 5575
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sample_id     5576 non-null   object 
 1   genotype      5379 non-null   object 
 2   filter        1665 non-null   object 
 3   copy_number   5523 non-null   float64
 4   caller        5576 non-null   object 
 5   novel_allele  5576 non-null   bool   
dtypes: bool(1), float64(1), object(4)
memory usage: 223.4+ KB


# Convert genotype calls to diplotypes

In [3]:
def get_diplotype(row: pd.Series) -> Diplotype:
    """Build a ``Diplotype`` from one row of the caller-output table.

    Reads the ``genotype``, ``filter`` and ``novel_allele`` fields of ``row``
    and dispatches to the matching ``Diplotype`` constructor:

    * missing genotype (``None``, ``NaN`` or ``pd.NA``) -> ``Diplotype.no_call()``
    * genotype containing ``"*other"`` or ``"or"``, or lacking ``"/"``
      -> ``Diplotype.invalid(gt)``
    * ``novel_allele`` truthy -> ``Diplotype.novel_allele(...)``
    * anything else -> ``Diplotype.from_string(...)``

    Args:
        row: A row of the caller-output frame (as passed by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        The ``Diplotype`` produced by the selected constructor.
    """
    gt = row["genotype"]
    filt = row["filter"]
    novel = row["novel_allele"]

    # pd.isna also catches NaN / pd.NA, which an `is None` test lets through
    # to the substring checks below where they would raise TypeError.
    if pd.isna(gt):
        return Diplotype.no_call()

    # NOTE(review): "or" is a plain substring test; presumably it targets
    # ambiguous calls written as "A or B" -- confirm no valid genotype string
    # can contain these two letters.
    if "*other" in gt or "/" not in gt or "or" in gt:
        return Diplotype.invalid(gt)

    if novel:
        return Diplotype.novel_allele(raw_diplotype=gt, filt=filt)
    return Diplotype.from_string(raw_diplotype=gt, filt=filt)
    

In [4]:
# Parse every row's raw call into a Diplotype object; this is row-wise
# because get_diplotype reads several columns of the row.
caller_outputs_df["diplotype"] = caller_outputs_df.apply(get_diplotype, axis=1)

# Expose the parsed representations of each Diplotype as plain columns.
caller_outputs_df["parsed_diplotype"] = caller_outputs_df["diplotype"].map(
    lambda dip: dip.parsed
)
caller_outputs_df["parsed_haplotypes"] = caller_outputs_df["diplotype"].map(
    lambda dip: [hap.parsed for hap in dip.haplotypes]
)
caller_outputs_df["parsed_star_alleles"] = caller_outputs_df["diplotype"].map(
    lambda dip: [allele.parsed for allele in dip.star_alleles]
)
caller_outputs_df


Unnamed: 0,sample_id,genotype,filter,copy_number,caller,novel_allele,diplotype,parsed_diplotype,parsed_haplotypes,parsed_star_alleles
0,WHB3374,*1/*10,,2.0,aldy,False,"Diplotype(parsed='*1/*10', filt=None, is_novel...",*1/*10,"[*1, *10]","[*1, *10]"
1,WHB3374,*1/*10,PASS,4.0,cyrius,False,"Diplotype(parsed='*1/*10', filt='PASS', is_nov...",*1/*10,"[*1, *10]","[*1, *10]"
2,WHB3374,*1/*10,,2.0,stellarpgx,False,"Diplotype(parsed='*1/*10', filt=None, is_novel...",*1/*10,"[*1, *10]","[*1, *10]"
3,WHB3375,*10/*36.ALDY,,2.0,aldy,False,"Diplotype(parsed='*10/*36', filt=None, is_nove...",*10/*36,"[*10, *36]","[*10, *36]"
4,WHB3375,*5/*36+*10,PASS,4.0,cyrius,False,"Diplotype(parsed='*5/*10+*36', filt='PASS', is...",*5/*10+*36,"[*5, *10+*36]","[*5, *36, *10]"
...,...,...,...,...,...,...,...,...,...,...
5571,WHB5468,*1/*1,PASS,4.0,cyrius,False,"Diplotype(parsed='*1/*1', filt='PASS', is_nove...",*1/*1,"[*1, *1]","[*1, *1]"
5572,WHB5468,*1/*1,,2.0,stellarpgx,False,"Diplotype(parsed='*1/*1', filt=None, is_novel=...",*1/*1,"[*1, *1]","[*1, *1]"
5573,WHB5469,*1/*36.ALDY+*10,,3.0,aldy,False,"Diplotype(parsed='*1/*10+*36', filt=None, is_n...",*1/*10+*36,"[*1, *10+*36]","[*1, *36, *10]"
5574,WHB5469,*1/*36+*10,PASS,5.0,cyrius,False,"Diplotype(parsed='*1/*10+*36', filt='PASS', is...",*1/*10+*36,"[*1, *10+*36]","[*1, *36, *10]"


# Consensus

In [5]:
def get_diplotype_consensus(sample_df: pd.DataFrame) -> dict:
    """Find the majority diplotype among the caller rows of one sample.

    The relative frequency of every ``parsed_diplotype`` value is computed
    with missing values counted as their own category. A value is the
    consensus only if it accounts for strictly more than half of the rows.

    Args:
        sample_df: Rows for a single sample; must contain the columns
            ``parsed_diplotype``, ``caller`` and ``novel_allele``.

    Returns:
        A dict with the keys

        * ``diplotype_consensus``: the majority value, or ``None``.
        * ``diplotype_caller_agreement``: the fraction of rows carrying the
          majority value, or ``None`` when there is no majority.
        * ``callers_in_agreement``: callers whose ``parsed_diplotype`` equals
          the consensus; empty when there is no majority.
        * ``novel_allele``: callers whose ``novel_allele`` flag is ``True``.
    """
    counts = sample_df["parsed_diplotype"].value_counts(normalize=True, dropna=False)
    top_fraction = counts.max()
    has_majority = top_fraction > 0.5

    if has_majority:
        # value_counts sorts by frequency (descending), so the first label is
        # the one that reached the majority.
        consensus = counts.index[0]
        matches = sample_df["parsed_diplotype"] == consensus
        callers_in_agreement = sample_df.loc[matches, "caller"].dropna().tolist()
    else:
        # Without a majority nobody can be "in agreement". Comparing against
        # a None consensus would wrongly match rows whose parsed diplotype is
        # itself None.
        consensus = None
        callers_in_agreement = []

    novel_callers = sample_df.loc[sample_df["novel_allele"].eq(True), "caller"]

    return {
        "diplotype_consensus": consensus,
        "diplotype_caller_agreement": top_fraction if has_majority else None,
        "callers_in_agreement": callers_in_agreement,
        "novel_allele": novel_callers.tolist(),
    }


def get_haplotype_consensus(sample_df: pd.DataFrame) -> dict:
    """Compute a position-wise consensus over each row's parsed haplotypes.

    For every list position ``i`` up to the longest ``parsed_haplotypes``
    list, the i-th entry of each row is collected (``None`` for rows whose
    list is shorter) and the most frequent entry is taken as the consensus
    for that position. If several entries tie for most frequent, the
    position has no consensus and is reported as ``None``.

    Args:
        sample_df: Rows for a single sample; must contain a
            ``parsed_haplotypes`` column holding list-like values.

    Returns:
        ``{"haplotype_consensus": [...]}`` with one entry per position.
    """
    haplotypes = sample_df["parsed_haplotypes"]
    max_length = haplotypes.apply(len).max()

    consensus_by_pos = {}
    for i in range(max_length):
        at_position = haplotypes.apply(lambda x: x[i] if len(x) > i else None)
        modes = at_position.mode(dropna=False)
        # A consensus exists only when the mode is unique. The number of tied
        # modes is unrelated to the number of haplotype positions, so it must
        # not be compared against max_length.
        consensus_by_pos[i] = modes.iloc[0] if len(modes) == 1 else None
    return {"haplotype_consensus": pd.Series(consensus_by_pos).to_list()}


def get_consensus(sample_df: pd.DataFrame) -> pd.Series:
    """Summarise all caller rows of one sample into a single record.

    The record contains, in this order:

    * ``stellarpgx_copy_number``: ``copy_number`` of the first row whose
      caller is ``"stellarpgx"`` (``NaN`` if the sample has no such row);
    * one entry per caller present in ``sample_df`` holding the list of that
      caller's raw ``genotype`` values;
    * the keys returned by ``get_diplotype_consensus``;
    * the keys returned by ``get_haplotype_consensus``.

    Args:
        sample_df: Rows for a single sample (one group of the groupby).

    Returns:
        A ``pd.Series`` keyed as described above.
    """
    callers = sample_df["caller"]

    # Raw genotype strings per caller, in order of first appearance.
    original_calls = {
        caller: sample_df.loc[callers == caller, "genotype"].tolist()
        for caller in callers.unique()
    }

    # A sample without a stellarpgx row gets NaN rather than raising
    # IndexError and aborting the whole groupby.
    stellarpgx_copy_numbers = sample_df.loc[callers == "stellarpgx", "copy_number"]
    if len(stellarpgx_copy_numbers) > 0:
        stellarpgx_copy_number = stellarpgx_copy_numbers.iloc[0]
    else:
        stellarpgx_copy_number = np.nan

    diplotype_consensus = get_diplotype_consensus(sample_df)
    haplotype_consensus = get_haplotype_consensus(sample_df)

    return pd.Series(
        {
            "stellarpgx_copy_number": stellarpgx_copy_number,
            **original_calls,
            **diplotype_consensus,
            **haplotype_consensus,
        }
    )


In [6]:
# Collapse the per-caller rows into one consensus record per sample.
consensus_df = caller_outputs_df.groupby("sample_id").apply(get_consensus)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
consensus_df.sample(10, random_state=0)


  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None).mode(
  mode = haplotypes.apply(lambda x: x[i] if len(x) > i else None

Unnamed: 0_level_0,stellarpgx_copy_number,aldy,cyrius,stellarpgx,diplotype_consensus,diplotype_caller_agreement,callers_in_agreement,novel_allele,haplotype_consensus
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WHB4140,2.0,[*1/*41],[*1/*41],[*1/*41],*1/*41,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *41]"
WHB3996,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.0,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]"
WHB4142,3.0,[*1/*36.ALDY+*10],[*1/*36+*10],[*1/*36+*10],*1/*10+*36,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *10+*36]"
WHB3781,3.0,[*36.ALDY+*10/*36.ALDY+*10],[None],[*10/*36x2],,,[],[],"[None, None]"
WHB4094,2.0,[*1/*1],[*1/*1],[*1/*1],*1/*1,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *1]"
WHB4131,2.0,[*2/*2],[*2/*2],[*2/*2],*2/*2,1.0,"[aldy, cyrius, stellarpgx]",[],"[*2, *2]"
WHB4397,2.0,[*1/*41],[*1/*41],[*1/*41],*1/*41,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *41]"
WHB3489,2.0,[*1/*10],[*1/*10],[*10/*39],*1/*10,0.666667,"[aldy, cyrius]",[stellarpgx],"[*1, *10]"
WHB4975,1.0,[*1/*5],[*1/*5],[*5/*1],*1/*5,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *5]"
WHB4923,3.0,[*1/*36.ALDY+*36.ALDY],[None],[*1/*36+*10],,,[],[],"[*1, None]"


# Add sample metadata

In [7]:
# Columns taken from the sample metadata and attached to the consensus table.
metadata_columns = ["genetic_sex", "genetic_ancestry"]

metadata_df = pd.read_csv(
    "s3://npm-grids-nalagenetics-collaboration/SG10K_DRAGEN_CYP2D6_sample_metadata.csv"
)
metadata_df = metadata_df.set_index("npm_research_id")

# Drop any metadata columns left over from a previous run of this cell.
# Otherwise re-running it would merge them a second time and produce
# suffixed duplicates (genetic_sex_x / genetic_sex_y, ...).
consensus_df = consensus_df.drop(columns=metadata_columns, errors="ignore").merge(
    metadata_df.loc[:, metadata_columns],
    how="left",
    left_index=True,
    right_index=True,
)


In [8]:
# Display the consensus table (rich DataFrame repr of the cell's last expression).
consensus_df


Unnamed: 0_level_0,stellarpgx_copy_number,aldy,cyrius,stellarpgx,diplotype_consensus,diplotype_caller_agreement,callers_in_agreement,novel_allele,haplotype_consensus,genetic_sex,genetic_ancestry
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
WHB3374,2.0,[*1/*10],[*1/*10],[*1/*10],*1/*10,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *10]",female,C
WHB3375,2.0,[*10/*36.ALDY],[*5/*36+*10],[*10/*36],*10/*36,0.666667,"[aldy, stellarpgx]",[],"[*10, *36]",male,C
WHB3376,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]",female,C
WHB3377,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]",female,C
WHB3378,3.0,[*36.ALDY+*10/*41],[*41/*36+*10],[*41/*36+*10],*10+*36/*41,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*41, *10+*36]",male,C
...,...,...,...,...,...,...,...,...,...,...,...
WHB5465,2.0,[*1/*1],[*1/*1],[*1/*1],*1/*1,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *1]",female,C
WHB5466,3.0,[*1+*36.ALDY/*36.ALDY+*10],[*1/*36+*36+*10],[*1/*36+*10],,,[],[],"[*1, *10+*36]",female,C
WHB5467,2.0,[*1/*21],[*1/*21],[*1/*21],*1/*21,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *21]",female,C
WHB5468,2.0,[*1/*1],[*1/*1],[*1/*1],*1/*1,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *1]",male,I


# Export to TSV and parquet

In [9]:
output_dir = Path("output")
file_name = "2_consensus"

# Create the output directory if needed; no separate exists() check required.
output_dir.mkdir(parents=True, exist_ok=True)

# Move sample_id from the index into a regular column so it is kept when
# writing with index=False. The guard keeps the cell idempotent: re-running it
# must not reset the index a second time and add a spurious "index" column.
if "sample_id" not in consensus_df.columns:
    consensus_df = consensus_df.reset_index()

consensus_df.to_csv(output_dir / f"{file_name}.tsv", index=False, sep="\t")
consensus_df.to_parquet(output_dir / f"{file_name}.parquet", index=False)
