In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


# Load consensus_df and PharmGKB reference

In [2]:
# Per-sample consensus calls, read from a path relative to the working directory
# (presumably produced by the previous step of the workflow — TODO confirm).
consensus_df = pd.read_parquet("output/2_consensus.parquet")
# PharmGKB haplotype function reference (tab-separated); activity_value is forced to float64.
# NOTE(review): this is an absolute, user-specific path, so the cell will not run on another
# machine as-is; consider a configurable reference directory.
pgkb = pd.read_csv("/home/jupyter-yusuf/Consensus/data_viz/References/pgkb_haplotype_function.tsv", sep = "\t", dtype = {'activity_value': 'float64'})


In [3]:
# Preview the first rows of the consensus table to check its columns and value formats.
consensus_df.head()

Unnamed: 0,sample_id,stellarpgx_copy_number,aldy,cyrius,stellarpgx,diplotype_consensus,diplotype_caller_agreement,callers_in_agreement,novel_allele,haplotype_consensus,genetic_sex,genetic_ancestry
0,WHB3374,2.0,[*1/*10],[*1/*10],[*1/*10],*1/*10,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *10]",female,C
1,WHB3375,2.0,[*10/*36.ALDY],[*5/*36+*10],[*10/*36],*10/*36,0.666667,"[aldy, stellarpgx]",[],"[*10, *36]",male,C
2,WHB3376,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.0,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]",female,C
3,WHB3377,3.0,[*10/*36.ALDY+*10],[*10/*36+*10],[*10/*36+*10],*10/*10+*36,1.0,"[aldy, cyrius, stellarpgx]",[],"[*10, *10+*36]",female,C
4,WHB3378,3.0,[*36.ALDY+*10/*41],[*41/*36+*10],[*41/*36+*10],*10+*36/*41,1.0,"[aldy, cyrius, stellarpgx]",[],"[*41, *10+*36]",male,C


In [20]:
# Keep only the samples that have a consensus diplotype at all.
filt_df = consensus_df.dropna(subset=["diplotype_consensus"]).copy()

# Show the samples whose consensus diplotype is the literal string 'no_call'.
filt_df.loc[filt_df["diplotype_consensus"] == "no_call"]

Unnamed: 0,sample_id,stellarpgx_copy_number,aldy,cyrius,stellarpgx,diplotype_consensus,diplotype_caller_agreement,callers_in_agreement,novel_allele,haplotype_consensus,genetic_sex,genetic_ancestry
935,WHB4383,5.0,[*36.ALDY+*10/*36.ALDY+*10],[None],[None],no_call,0.666667,"[cyrius, stellarpgx]",[],"[None, None]",female,C


# Drop samples with no diplotype consensus and drop no_call (potentially novel alleles are kept if they passed the threshold)

In [4]:
# Novel alleles are deliberately kept as long as they passed the agreement threshold;
# the filter that would remove them is left here for reference:
# [consensus_df["novel_allele"].apply(lambda x: len(x) == 0)]
has_consensus = consensus_df["diplotype_consensus"].notna()
is_called = consensus_df["diplotype_consensus"] != "no_call"

# Retain samples with a real consensus diplotype (not missing, not 'no_call').
filt_df = consensus_df.loc[has_consensus & is_called].copy()
filt_df.shape


(1503, 12)

In [5]:
# Show samples whose consensus diplotype contains "*4" not immediately followed by a digit.
# The negative lookahead keeps "*4" and "*4+*68" but skips longer allele numbers such as "*41".
filt_df[filt_df["diplotype_consensus"].str.contains(r'\*4(?!\d+)')]

Unnamed: 0,sample_id,stellarpgx_copy_number,aldy,cyrius,stellarpgx,diplotype_consensus,diplotype_caller_agreement,callers_in_agreement,novel_allele,haplotype_consensus,genetic_sex,genetic_ancestry
12,WHB3386,2.0,[*41/*68+*4.021+rs3021084],[*41/*68+*4],[*136/*4],*4+*68/*41,0.666667,"[aldy, cyrius]","[aldy, stellarpgx]","[*41, *4+*68]",male,I
39,WHB3416,2.0,[*4C/*10],[*10/*4],[*10/*4],*4/*10,1.0,"[aldy, cyrius, stellarpgx]",[],"[*10, *4]",female,M
164,WHB3552,2.0,[*4/*10],[*10/*4],[*10/*4],*4/*10,1.0,"[aldy, cyrius, stellarpgx]",[],"[*10, *4]",male,I
433,WHB3858,2.0,[*4C/*10],[*10/*4],[*10/*4],*4/*10,1.0,"[aldy, cyrius, stellarpgx]",[],"[*10, *4]",female,M
443,WHB3868,2.0,[*4.021/*112],[*112/*4],[*112/*4],*4/*112,1.0,"[aldy, cyrius, stellarpgx]",[],"[*112, *4]",female,I
449,WHB3874,3.0,[*41/*68+*4.021],[*41/*68+*4],[*41/*68+*4],*4+*68/*41,1.0,"[aldy, cyrius, stellarpgx]",[],"[*41, *4+*68]",female,I
457,WHB3883,2.0,[*1/*4],[*1/*4],[*1/*4],*1/*4,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *4]",female,M
480,WHB3907,2.0,[*2/*4],[None],[*2/*4],*2/*4,0.666667,"[aldy, stellarpgx]",[],"[*2, *4]",female,I
487,WHB3914,2.0,[*1/*4],[*1/*4],[*10/*4],*1/*4,0.666667,"[aldy, cyrius]",[stellarpgx],"[*1, *4]",female,I
519,WHB3946,2.0,[*1/*4],[*1/*4],[*1/*4],*1/*4,1.0,"[aldy, cyrius, stellarpgx]",[],"[*1, *4]",male,I


In [6]:
# For each consensus diplotype, list the distinct stellarpgx_copy_number values observed
# (first 30 diplotypes only) to see where the copy number varies within one diplotype.
filt_df.groupby("diplotype_consensus")['stellarpgx_copy_number'].unique().head(30)

diplotype_consensus
*1/*1                   [2.0, 3.0]
*1/*10                  [2.0, 3.0]
*1/*10+*36         [3.0, 2.0, 4.0]
*1/*113                 [3.0, 2.0]
*1/*14                  [2.0, 3.0]
*1/*17                       [2.0]
*1/*1x2                      [3.0]
*1/*2                   [2.0, 3.0]
*1/*21                       [2.0]
*1/*2x2                      [3.0]
*1/*33                       [2.0]
*1/*35                       [2.0]
*1/*36                       [2.0]
*1/*4                   [2.0, 3.0]
*1/*4+*4                     [3.0]
*1/*4+*68               [3.0, 2.0]
*1/*41                       [2.0]
*1/*43                       [2.0]
*1/*49                       [2.0]
*1/*5                        [1.0]
*1/*71                       [2.0]
*1/*94                       [2.0]
*10+*36/*10+*36    [4.0, 3.0, 5.0]
*10+*36/*133                 [2.0]
*10+*36/*14                  [3.0]
*10+*36/*21                  [2.0]
*10+*36/*35                  [3.0]
*10+*36/*36                  [3.0]


# Aggregate by diplotype

In [7]:
# Cohort-wide diplotype counts (AC) and percentages (AF).
diplotype_df = pd.DataFrame(
    {
        "AC": filt_df["diplotype_consensus"].value_counts(),
        "AF": filt_df["diplotype_consensus"].value_counts(normalize=True) * 100,
    }
)

# Per-ancestry columns:
#   AC_<anc>    count within the ancestry group
#   AF_<anc>    that count as a percentage of ALL filtered samples
#   AF_GA_<anc> that count as a percentage of the ancestry group itself
n_samples = len(filt_df)
per_ancestry = []
for ancestry, group in filt_df.groupby("genetic_ancestry"):
    counts = group["diplotype_consensus"].value_counts()
    per_ancestry.append(
        pd.DataFrame(
            {
                f"AC_{ancestry}": counts,
                f"AF_{ancestry}": counts / n_samples * 100,
                f"AF_GA_{ancestry}": group["diplotype_consensus"].value_counts(normalize=True) * 100,
            }
        )
    )

# Align everything on the diplotype index in one pass; diplotypes absent from an
# ancestry group get 0 instead of NaN.
diplotype_df = (
    pd.concat([diplotype_df, *per_ancestry], axis=1)
    .fillna(0)
    .reset_index(names="diplotype")
)
diplotype_df.head(10)


Unnamed: 0,diplotype,AC,AF,AC_C,AF_C,AF_GA_C,AC_I,AF_I,AF_GA_I,AC_M,AF_M,AF_GA_M
0,*1/*10+*36,248,16.500333,223.0,14.836993,19.595782,2.0,0.133067,1.212121,23.0,1.530273,11.5
1,*10+*36/*10+*36,164,10.91151,152.0,10.113107,13.356766,1.0,0.066534,0.606061,11.0,0.73187,5.5
2,*10/*10+*36,152,10.113107,132.0,8.782435,11.599297,1.0,0.066534,0.606061,19.0,1.264138,9.5
3,*1/*1,119,7.917498,76.0,5.056554,6.678383,23.0,1.530273,13.939394,20.0,1.330672,10.0
4,*1/*10,111,7.38523,77.0,5.123087,6.766257,2.0,0.133067,1.212121,32.0,2.129075,16.0
5,*1/*2,95,6.320692,53.0,3.526281,4.657293,29.0,1.929474,17.575758,13.0,0.864937,6.5
6,*2/*10+*36,70,4.657352,57.0,3.792415,5.008787,3.0,0.199601,1.818182,10.0,0.665336,5.0
7,*10/*10,54,3.592814,40.0,2.661344,3.514938,1.0,0.066534,0.606061,13.0,0.864937,6.5
8,*2/*10,53,3.526281,41.0,2.727878,3.602812,4.0,0.266134,2.424242,8.0,0.532269,4.0
9,*10/*36,44,2.927478,39.0,2.59481,3.427065,0.0,0.0,0.0,5.0,0.332668,2.5


# Function to retrieve the activity value and phenotype from the reference

In [8]:
def get_av(df, ref):
    """Sum the reference activity values of the haplotypes in one diplotype.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row exposing a ``'diplotype'`` string whose haplotypes are
        separated by ``'/'`` (e.g. ``'*1/*10'``).
    ref : pandas.DataFrame
        Reference table with ``'haplotype'`` and ``'activity_value'`` columns.
        If a haplotype is listed more than once, its first row is used.

    Returns
    -------
    float
        The summed activity value. NaN when any haplotype is absent from
        ``ref`` or has a missing activity value there, so that an unknown
        allele is never scored as if it had zero activity.
    """
    # Index the reference once per call; keeping the first duplicate mirrors
    # a "first matching row" lookup.
    lookup = ref.drop_duplicates(subset="haplotype").set_index("haplotype")["activity_value"]

    av = 0
    for haplotype in df['diplotype'].split('/'):
        if haplotype not in lookup.index:
            # Unknown haplotype: the score is undefined, not zero.
            return float("nan")
        av += lookup.loc[haplotype]

    return av

@np.vectorize
def phenotype(row):
    """Map an activity value to a phenotype label (applied element-wise).

    Labels by activity value ``row``:
      * ``'PM'``  when ``row == 0``
      * ``'IM'``  when ``0 < row < 1.25``
      * ``'NM'``  when ``1.25 <= row <= 2.25``
      * ``'UM'``  when ``2.25 < row < 15``
      * ``'Indeterminate'`` otherwise (negative, ``>= 15``, or NaN)

    The labels are presumably poor / intermediate / normal / ultrarapid
    metabolizer — confirm against the reference guideline in use.
    """
    # Anything outside [0, 15) has no label; NaN fails the comparison and
    # therefore lands here as well.
    if not (0 <= row < 15):
        return 'Indeterminate'
    if row == 0:
        return 'PM'
    if row < 1.25:
        return 'IM'
    if row <= 2.25:
        return 'NM'
    return 'UM'

In [9]:
# Score every diplotype row against the PharmGKB reference table.
diplotype_df['activity_value'] = diplotype_df.apply(get_av, axis=1, args=(pgkb,))
# Translate each activity value into its phenotype label.
diplotype_df['phenotype'] = phenotype(diplotype_df['activity_value'])

  outputs = ufunc(*inputs)


In [10]:
# Preview the diplotype table, now including the activity_value and phenotype columns.
diplotype_df.head()

Unnamed: 0,diplotype,AC,AF,AC_C,AF_C,AF_GA_C,AC_I,AF_I,AF_GA_I,AC_M,AF_M,AF_GA_M,activity_value,phenotype
0,*1/*10+*36,248,16.500333,223.0,14.836993,19.595782,2.0,0.133067,1.212121,23.0,1.530273,11.5,1.25,NM
1,*10+*36/*10+*36,164,10.91151,152.0,10.113107,13.356766,1.0,0.066534,0.606061,11.0,0.73187,5.5,0.5,IM
2,*10/*10+*36,152,10.113107,132.0,8.782435,11.599297,1.0,0.066534,0.606061,19.0,1.264138,9.5,0.5,IM
3,*1/*1,119,7.917498,76.0,5.056554,6.678383,23.0,1.530273,13.939394,20.0,1.330672,10.0,2.0,NM
4,*1/*10,111,7.38523,77.0,5.123087,6.766257,2.0,0.133067,1.212121,32.0,2.129075,16.0,1.25,NM


# Aggregate by haplotype

In [11]:
# One row per haplotype: split each consensus diplotype on "/" and explode the parts.
filt_hap_df = (
    filt_df
    .assign(haplotype=filt_df["diplotype_consensus"].str.split("/"))
    .explode("haplotype")
)

# Cohort-wide haplotype counts (AC) and percentages (AF).
haplotype_df = pd.DataFrame(
    {
        "AC": filt_hap_df["haplotype"].value_counts(),
        "AF": filt_hap_df["haplotype"].value_counts(normalize=True) * 100,
    }
)

# Per-ancestry columns:
#   AC_<anc>    count within the ancestry group
#   AF_<anc>    that count as a percentage of ALL haplotype rows
#   AF_GA_<anc> that count as a percentage of the ancestry group itself
n_haplotypes = len(filt_hap_df)
per_ancestry = []
for ancestry, group in filt_hap_df.groupby("genetic_ancestry"):
    counts = group["haplotype"].value_counts()
    per_ancestry.append(
        pd.DataFrame(
            {
                f"AC_{ancestry}": counts,
                f"AF_{ancestry}": counts / n_haplotypes * 100,
                f"AF_GA_{ancestry}": group["haplotype"].value_counts(normalize=True) * 100,
            }
        )
    )

# Align on the haplotype index in one pass; haplotypes absent from a group get 0.
haplotype_df = (
    pd.concat([haplotype_df, *per_ancestry], axis=1)
    .fillna(0)
    .reset_index(names="haplotype")
)


Add activity values and function annotations from PharmGKB

In [12]:
# Left join keeps every observed haplotype, including those without a PharmGKB entry.
# NOTE(review): re-running this cell merges again onto the already-merged frame.
haplotype_df = haplotype_df.merge(pgkb, on="haplotype", how="left")
haplotype_df.head(5)


Unnamed: 0,haplotype,AC,AF,AC_C,AF_C,AF_GA_C,AC_I,AF_I,AF_GA_I,AC_M,AF_M,AF_GA_M,activity_value,function
0,*10+*36,866,28.809049,776.0,25.815037,34.094903,10.0,0.332668,3.030303,80.0,2.661344,20.0,0.25,Decreased function
1,*1,833,27.711244,587.0,19.527611,25.790861,124.0,4.125083,37.575758,122.0,4.05855,30.5,1.0,Normal function
2,*10,529,17.598137,413.0,13.739188,18.14587,11.0,0.365935,3.333333,105.0,3.493014,26.25,0.25,Decreased function
3,*2,301,10.013307,197.0,6.55356,8.655536,66.0,2.195609,20.0,38.0,1.264138,9.5,1.0,Normal function
4,*41,142,4.723886,85.0,2.827678,3.734622,40.0,1.330672,12.121212,17.0,0.565536,4.25,0.5,Decreased function


In [13]:
# Check dtypes and non-null counts after the merge; a non-null count below the row count
# in activity_value marks haplotypes without a numeric activity value.
haplotype_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   haplotype       35 non-null     object 
 1   AC              35 non-null     int64  
 2   AF              35 non-null     float64
 3   AC_C            35 non-null     float64
 4   AF_C            35 non-null     float64
 5   AF_GA_C         35 non-null     float64
 6   AC_I            35 non-null     float64
 7   AF_I            35 non-null     float64
 8   AF_GA_I         35 non-null     float64
 9   AC_M            35 non-null     float64
 10  AF_M            35 non-null     float64
 11  AF_GA_M         35 non-null     float64
 12  activity_value  23 non-null     float64
 13  function        35 non-null     object 
dtypes: float64(11), int64(1), object(2)
memory usage: 4.1+ KB


# Export TSV and parquet for both diplotype and haplotype tables 

In [14]:
output_dir = Path("output")
dip_file_name = "3_diplotypes"
hap_file_name = "3_haplotypes"

# Create the output directory if needed. A single mkdir call with exist_ok avoids the
# check-then-create race, and parents=True also covers a nested output path.
output_dir.mkdir(parents=True, exist_ok=True)

# Write each table twice: TSV for inspection and Parquet to preserve dtypes.
for table, file_name in (
    (diplotype_df, dip_file_name),
    (haplotype_df, hap_file_name),
):
    table.to_csv(output_dir / f"{file_name}.tsv", sep="\t", index=False)
    table.to_parquet(output_dir / f"{file_name}.parquet", index=False)


In [16]:
# Sanity check: haplotype rows come from splitting each diplotype on "/", so half the total
# haplotype count should equal the number of filtered samples (the row count of filt_df).
haplotype_df['AC'].sum() /2

1503.0