In [1]:
import pandas as pd
from pathlib import Path


# Load consensus_df

In [2]:
consensus_df = pd.read_parquet("output/2_consensus.parquet")
consensus_df


Unnamed: 0,sample_id,stellarpgx_copy_number,aldy,cyrius,stellarpgx,diplotype_consensus,diplotype_caller_agreement,callers_in_agreement,novel_allele,haplotype_consensus,genetic_sex,genetic_ancestry
0,WHB3374,2.0,[*1/*10],[*1/*10],[*1/*10],*1/*10,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *10]",female,C
1,WHB3375,2.0,[*10/*36.ALDY],[*5/*36+*10],[*10/*36],*10/*36,0.666667,"[aldy, stellarpgx]",[],"[*10, *36]",male,C
2,WHB3376,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]",female,C
3,WHB3377,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]",female,C
4,WHB3378,3.0,[*36.ALDY+*10/*41],[*41/*36+*10],[*41/*36+*10],*10+*36/*41,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*41, *10+*36]",male,C
...,...,...,...,...,...,...,...,...,...,...,...,...
1847,WHB5465,2.0,[*1/*1],[*1/*1],[*1/*1],*1/*1,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *1]",female,C
1848,WHB5466,3.0,[*1+*36.ALDY/*36.ALDY+*10],[*1/*36+*36+*10],[*1/*36+*10],,,[],[],"[*1, *10+*36]",female,C
1849,WHB5467,2.0,[*1/*21],[*1/*21],[*1/*21],*1/*21,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *21]",female,C
1850,WHB5468,2.0,[*1/*1],[*1/*1],[*1/*1],*1/*1,1.000000,"[aldy, cyrius, stellarpgx]",[],"[*1, *1]",male,I


# Filter out potentially novel alleles, drop samples with no diplotype consensus and drop no_call

In [3]:
filt_df = (
    consensus_df[consensus_df["novel_allele"].apply(lambda x: len(x) == 0)]
    .dropna(subset=["diplotype_consensus"])
    .copy()
)
filt_df = filt_df.query("diplotype_consensus != 'no_call'")


# Aggregate by diplotype

In [4]:
diplotype_df = pd.DataFrame(
    {
        "AC": filt_df["diplotype_consensus"].value_counts(),
        "AF": filt_df["diplotype_consensus"].value_counts(normalize=True),
    }
)

for ancestry, ancestry_df in filt_df.groupby("genetic_ancestry"):
    ancestry_af_ac = {
        f"AC_{ancestry}": ancestry_df["diplotype_consensus"].value_counts(),
        f"AF_{ancestry}": ancestry_df["diplotype_consensus"].value_counts(
            normalize=True
        ),
    }
    # display(pd.DataFrame(ancestry_af_ac))
    diplotype_df = pd.concat([diplotype_df, pd.DataFrame(ancestry_af_ac)], axis=1)

diplotype_df = diplotype_df.fillna(0).reset_index(names="diplotype")
diplotype_df


Unnamed: 0,diplotype,AC,AF,AC_C,AF_C,AC_I,AF_I,AC_M,AF_M,AC_O,AF_O
0,*1/*10+*36,235,0.162630,210.0,0.193015,2.0,0.012579,23.0,0.117347,0.0,0.0
1,*10+*36/*10+*36,162,0.112111,150.0,0.137868,1.0,0.006289,11.0,0.056122,0.0,0.0
2,*10/*10+*36,150,0.103806,130.0,0.119485,1.0,0.006289,19.0,0.096939,0.0,0.0
3,*1/*1,118,0.081661,75.0,0.068934,23.0,0.144654,20.0,0.102041,0.0,0.0
4,*1/*10,103,0.071280,71.0,0.065257,2.0,0.012579,30.0,0.153061,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
83,*1x2/*4,1,0.000692,0.0,0.000000,1.0,0.006289,0.0,0.000000,0.0,0.0
84,*35/*86,1,0.000692,0.0,0.000000,1.0,0.006289,0.0,0.000000,0.0,0.0
85,*2/*14,1,0.000692,1.0,0.000919,0.0,0.000000,0.0,0.000000,0.0,0.0
86,*1/*17,1,0.000692,0.0,0.000000,1.0,0.006289,0.0,0.000000,0.0,0.0


# Aggregate by haplotype

In [5]:
filt_hap_df = filt_df.copy()
filt_hap_df["haplotype"] = filt_hap_df["diplotype_consensus"].str.split("/")
filt_hap_df = filt_hap_df.explode("haplotype")

haplotype_df = pd.DataFrame(
    {
        "AC": filt_hap_df["haplotype"].value_counts(),
        "AF": filt_hap_df["haplotype"].value_counts(normalize=True),
    }
)

for ancestry, ancestry_df in filt_hap_df.groupby("genetic_ancestry"):
    ancestry_af_ac = {
        f"AC_{ancestry}": ancestry_df["haplotype"].value_counts(),
        f"AF_{ancestry}": ancestry_df["haplotype"].value_counts(normalize=True),
    }
    haplotype_df = pd.concat([haplotype_df, pd.DataFrame(ancestry_af_ac)], axis=1)

haplotype_df = haplotype_df.fillna(0).reset_index()
haplotype_df


Unnamed: 0,haplotype,AC,AF,AC_C,AF_C,AC_I,AF_I,AC_M,AF_M,AC_O,AF_O
0,*10+*36,843,0.291696,753.0,0.346048,10.0,0.031447,80.0,0.204082,0.0,0.0
1,*1,798,0.276125,555.0,0.255055,123.0,0.386792,119.0,0.303571,1.0,0.25
2,*10,509,0.176125,397.0,0.182445,11.0,0.034591,101.0,0.257653,0.0,0.0
3,*2,295,0.102076,192.0,0.088235,63.0,0.198113,38.0,0.096939,2.0,0.5
4,*41,137,0.047405,82.0,0.037684,38.0,0.119497,17.0,0.043367,0.0,0.0
5,*5,101,0.034948,70.0,0.032169,20.0,0.062893,11.0,0.028061,0.0,0.0
6,*36,45,0.015571,40.0,0.018382,0.0,0.0,5.0,0.012755,0.0,0.0
7,*14,28,0.009689,27.0,0.012408,0.0,0.0,1.0,0.002551,0.0,0.0
8,*4,24,0.008304,0.0,0.0,17.0,0.053459,7.0,0.017857,0.0,0.0
9,*49,20,0.00692,19.0,0.008732,0.0,0.0,1.0,0.002551,0.0,0.0


# Export TSV and parquet for both diplotype and haplotype tables 

In [6]:
output_dir = Path("output")
dip_file_name = "3_diplotypes"
hap_file_name = "3_haplotypes"

if not output_dir.exists():
    output_dir.mkdir()

# Export diplotypes
diplotype_df.to_csv(output_dir / f"{dip_file_name}.tsv", sep="\t", index=False)
diplotype_df.to_parquet(output_dir / f"{dip_file_name}.parquet", index=False)

# Export haplotypes
haplotype_df.to_csv(output_dir / f"{hap_file_name}.tsv", sep="\t", index=False)
haplotype_df.to_parquet(output_dir / f"{hap_file_name}.parquet", index=False)
