In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


# Load consensus_df and PharmGKB reference

In [2]:
# Consensus table (presumably written by the preceding step of this workflow — confirm).
consensus_df = pd.read_parquet("output/2_consensus.parquet")

# PharmGKB haplotype reference: tab-separated, with `activity_value` parsed as float64.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on
# a machine that has this exact directory layout.
pgkb = pd.read_csv(
    "/home/jupyter-yusuf/Consensus/data_viz/References/pgkb_haplotype_function.tsv",
    sep="\t",
    dtype={'activity_value': 'float64'},
)


# Drop samples with no diplotype consensus or a `no_call` consensus, and exclude samples flagged as potential novel alleles

In [3]:
# Sample IDs to exclude. Despite the variable name, the entries are sample
# identifiers: they are matched against the `sample_id` column below.
potential_novel_alleles = [
    'WHB3711', 'WHB4034', 'WHB4120', 'WHB4360', 'WHB4694', 'WHB5352',
    'WHB3484', 'WHB4156', 'WHB5002', 'WHB5419', 'WHB3646', 'WHB3750',
    'WHB5450', 'WHB3827', 'WHB4038', 'WHB3823', 'WHB4039', 'WHB4775',
    'WHB5383', 'WHB4768', 'WHB4836', 'WHB3871', 'WHB3653', 'WHB3493',
    'WHB4615', 'WHB4943', 'WHB4891', 'WHB4967'
]

# Novel-allele calls that passed the threshold are still kept: there is no
# filter on the `novel_allele` column here. Rows are removed only when they
#   1. have no consensus diplotype (NaN),
#   2. have a 'no_call' consensus, or
#   3. belong to one of the sample IDs listed above.
filt_df = (
    consensus_df
    .dropna(subset=["diplotype_consensus"])
    .query("diplotype_consensus != 'no_call'")
    .loc[lambda frame: ~frame['sample_id'].isin(potential_novel_alleles)]
    .copy()
)
len(filt_df)

1487

# Analyzing samples excluded from the consensus set (no consensus, `no_call`, or potential novel allele)

In [6]:
# Every row of the consensus table whose sample did not make it into `filt_df`
# (missing consensus, 'no_call', or an explicitly excluded sample ID).
excluded_sample_mask = ~consensus_df['sample_id'].isin(filt_df['sample_id'])
no_consensus_df = consensus_df.loc[excluded_sample_mask]

In [9]:
def contains_SV(lst):
    """Tell whether any string in `lst` contains a '+' or an 'x'.

    Parameters
    ----------
    lst : iterable or None
        Calls to inspect. Items that are not strings are ignored.

    Returns
    -------
    bool
        True as soon as one string item contains '+' or 'x'; False for None,
        for an empty iterable, or when no item matches.

    NOTE(review): '+' and 'x' presumably mark tandem arrangements and copy-number
    gains in star-allele notation — confirm against the callers' output format.
    """
    if lst is None:
        return False
    return any(
        isinstance(entry, str) and ('+' in entry or 'x' in entry)
        for entry in lst
    )

# Run `contains_SV` over each caller's column; one boolean mask per caller.
mask_aldy, mask_cyrius, mask_stellarpgx = (
    no_consensus_df[caller].apply(contains_SV)
    for caller in ('aldy', 'cyrius', 'stellarpgx')
)

# A row is counted as SV when at least one of the three callers flags it.
combined_mask = mask_aldy | mask_cyrius | mask_stellarpgx
sv_df = no_consensus_df[combined_mask]

# Rows whose `novel_allele` entry contains 'stellarpgx'.
novel_mask = no_consensus_df['novel_allele'].apply(lambda entry: 'stellarpgx' in entry)
novel_df = no_consensus_df[novel_mask]

# Shares are relative to all rows of `no_consensus_df`; the two groups may overlap.
print(f"percentage of sv in non consensus: {len(sv_df) / len(no_consensus_df) * 100}")
print(f"percentage of novel in non consensus: {len(novel_df) / len(no_consensus_df) * 100}")


percentage of sv in non consensus: 90.35812672176309
percentage of novel in non consensus: 15.151515151515152


In [84]:
# Row count of the filtered table (repeats the check made right after filtering).
filt_df.shape[0]

1487

# Aggregate by diplotype

In [10]:
# Cohort-wide count (AC) and percentage (AF) of every consensus diplotype.
diplotype_tables = [
    pd.DataFrame(
        {
            "AC": filt_df["diplotype_consensus"].value_counts(),
            "AF": filt_df["diplotype_consensus"].value_counts(normalize=True) * 100,
        }
    )
]

# Three columns per genetic-ancestry group:
#   AC_<group>    - count within the group
#   AF_<group>    - percentage relative to ALL rows of `filt_df`
#   AF_GA_<group> - percentage relative to the group's own rows
cohort_size = len(filt_df)
for group_label, group_rows in filt_df.groupby("genetic_ancestry"):
    group_calls = group_rows["diplotype_consensus"]
    group_counts = group_calls.value_counts()
    diplotype_tables.append(
        pd.DataFrame(
            {
                f"AC_{group_label}": group_counts,
                f"AF_{group_label}": group_counts / cohort_size * 100,
                f"AF_GA_{group_label}": group_calls.value_counts(normalize=True) * 100,
            }
        )
    )

# Align all tables on the diplotype index in a single concat. A diplotype that a
# group never carries comes out as NaN and is reported as 0 (which is why the
# per-group count columns end up as floats).
diplotype_df = (
    pd.concat(diplotype_tables, axis=1)
    .fillna(0)
    .reset_index(names="diplotype")
)
diplotype_df.head(10)


Unnamed: 0,diplotype,AC,AF,AC_C,AF_C,AF_GA_C,AC_I,AF_I,AF_GA_I,AC_M,AF_M,AF_GA_M
0,*1/*10+*36,248,16.677875,223.0,14.996638,19.839858,2.0,0.134499,1.219512,23.0,1.546738,11.557789
1,*10+*36/*10+*36,163,10.961668,151.0,10.154674,13.434164,1.0,0.067249,0.609756,11.0,0.739744,5.527638
2,*10/*10+*36,150,10.087424,130.0,8.742434,11.565836,1.0,0.067249,0.609756,19.0,1.27774,9.547739
3,*1/*1,118,7.93544,75.0,5.043712,6.672598,23.0,1.546738,14.02439,20.0,1.34499,10.050251
4,*1/*10,111,7.464694,77.0,5.178211,6.850534,2.0,0.134499,1.219512,32.0,2.151984,16.080402
5,*1/*2,91,6.119704,49.0,3.295225,4.359431,29.0,1.950235,17.682927,13.0,0.874243,6.532663
6,*2/*10+*36,70,4.707465,57.0,3.833221,5.071174,3.0,0.201748,1.829268,10.0,0.672495,5.025126
7,*2/*10,53,3.564223,41.0,2.757229,3.647687,4.0,0.268998,2.439024,8.0,0.537996,4.020101
8,*10/*10,51,3.429724,38.0,2.555481,3.380783,1.0,0.067249,0.609756,12.0,0.806994,6.030151
9,*10/*36,44,2.958978,39.0,2.62273,3.469751,0.0,0.0,0.0,5.0,0.336247,2.512563


# Functions to look up activity values in the reference and derive the phenotype from the summed activity value

In [11]:
def get_av(df, ref):
    """Sum the reference activity values of the haplotypes in one diplotype.

    Parameters
    ----------
    df : pandas.Series or mapping
        A single table row. Only ``df['diplotype']`` is read; it is split on
        '/' into its haplotypes.
    ref : pandas.DataFrame
        Reference table with ``haplotype`` and ``activity_value`` columns. When
        a haplotype is listed more than once, the first matching row is used.

    Returns
    -------
    float
        The summed activity value. NaN is returned when any haplotype is absent
        from `ref`, and also results when a matched reference value is NaN.
    """
    av = 0
    for haplotype in df['diplotype'].split('/'):
        matches = ref.loc[ref['haplotype'] == haplotype, 'activity_value']
        if matches.empty:
            # An unknown haplotype must not count as activity 0: that would
            # score it like a no-function allele and skew the diplotype's
            # phenotype. NaN keeps the result undetermined instead.
            return np.nan
        av += matches.iloc[0]

    return av

def _phenotype_from_activity(row):
    """Map a single activity value to its phenotype label (see `phenotype`)."""
    if row == 0:
        return 'PM'
    if 0 < row < 1.25:
        return 'IM'
    if 1.25 <= row <= 2.25:
        return 'NM'
    if 2.25 < row < 15:
        return 'UM'
    # Reached for negative values, values >= 15 and NaN (every comparison
    # against NaN is False).
    return 'Indeterminate'


# `otypes` is passed explicitly so the output dtype does not have to be inferred
# by calling the function on the first element; without it np.vectorize raises
# ValueError on empty input.
phenotype = np.vectorize(
    _phenotype_from_activity,
    otypes=[str],
    doc="""Element-wise phenotype label for an array of activity values.

    Labels: 'PM' for exactly 0, 'IM' for 0 < v < 1.25, 'NM' for
    1.25 <= v <= 2.25, 'UM' for 2.25 < v < 15, and 'Indeterminate' for
    anything else (negative, >= 15, NaN).

    NOTE(review): the cut-offs look like the CPIC CYP2D6 activity-score bins;
    the upper bound of 15 is not explained in this notebook — confirm both.

    Returns a NumPy array of strings with the same shape as the input.
    """,
)

In [12]:
# Score each diplotype row against the PharmGKB table, then turn the score into
# a phenotype label. The second column is computed from the first.
diplotype_df = diplotype_df.assign(
    activity_value=lambda table: table.apply(get_av, axis=1, args=(pgkb,)),
    phenotype=lambda table: phenotype(table['activity_value']),
)

  outputs = ufunc(*inputs)


# Aggregate by haplotype

In [13]:
# One row per haplotype: split every consensus diplotype on '/' and explode the
# resulting lists, so the sample's other columns are repeated for each haplotype.
filt_hap_df = (
    filt_df
    .assign(haplotype=lambda frame: frame["diplotype_consensus"].str.split("/"))
    .explode("haplotype")
)

# Cohort-wide count (AC) and percentage (AF) of every haplotype.
haplotype_tables = [
    pd.DataFrame(
        {
            "AC": filt_hap_df["haplotype"].value_counts(),
            "AF": filt_hap_df["haplotype"].value_counts(normalize=True) * 100,
        }
    )
]

# Three columns per genetic-ancestry group:
#   AC_<group>    - count within the group
#   AF_<group>    - percentage relative to ALL exploded haplotype rows
#   AF_GA_<group> - percentage relative to the group's own haplotype rows
haplotype_total = len(filt_hap_df)
for group_label, group_rows in filt_hap_df.groupby("genetic_ancestry"):
    group_alleles = group_rows["haplotype"]
    group_counts = group_alleles.value_counts()
    haplotype_tables.append(
        pd.DataFrame(
            {
                f"AC_{group_label}": group_counts,
                f"AF_{group_label}": group_counts / haplotype_total * 100,
                f"AF_GA_{group_label}": group_alleles.value_counts(normalize=True) * 100,
            }
        )
    )

# Align all tables on the haplotype index in a single concat; haplotypes that a
# group never carries come out as NaN and are reported as 0.
haplotype_df = (
    pd.concat(haplotype_tables, axis=1)
    .fillna(0)
    .reset_index(names="haplotype")
)


# Add activity values and function annotations from PharmGKB

In [14]:
# Left join: every observed haplotype is kept, and the reference columns are NaN
# for haplotypes that the PharmGKB table does not list.
# NOTE(review): the result is written back to `haplotype_df`, so running this
# cell a second time merges the reference onto the already-annotated table.
haplotype_df = haplotype_df.merge(pgkb, how="left", on="haplotype")
haplotype_df.head(5)


Unnamed: 0,haplotype,AC,AF,AC_C,AF_C,AF_GA_C,AC_I,AF_I,AF_GA_I,AC_M,AF_M,AF_GA_M,activity_value,function
0,*10+*36,861,28.950908,771.0,25.924681,34.297153,10.0,0.336247,3.04878,80.0,2.68998,20.100503,0.25,Decreased function
1,*1,827,27.807666,581.0,19.535978,25.845196,124.0,4.169469,37.804878,122.0,4.102219,30.653266,1.0,Normal function
2,*10,519,17.451244,405.0,13.618023,18.016014,11.0,0.369872,3.353659,103.0,3.463349,25.879397,0.25,Decreased function
3,*2,295,9.919301,193.0,6.489576,8.585409,64.0,2.151984,19.512195,38.0,1.27774,9.547739,1.0,Normal function
4,*41,141,4.741089,84.0,2.824479,3.736655,40.0,1.34499,12.195122,17.0,0.571621,4.271357,0.5,Decreased function


# Export TSV and parquet for both diplotype and haplotype tables 

In [16]:
output_dir = Path("output")
dip_file_name = "3_diplotypes"
hap_file_name = "3_haplotypes"

# Create the directory (and any missing parents) in one call; `exist_ok=True`
# replaces the separate exists()-then-mkdir() check.
output_dir.mkdir(parents=True, exist_ok=True)

# Write each table twice: as TSV and as Parquet, both without the index.
for table, file_name in (
    (diplotype_df, dip_file_name),
    (haplotype_df, hap_file_name),
):
    table.to_csv(output_dir / f"{file_name}.tsv", sep="\t", index=False)
    table.to_parquet(output_dir / f"{file_name}.parquet", index=False)


In [20]:
# Haplotype totals: overall and per ancestry column (AC_C, AC_M, AC_I).
print(
    f"Total haplotype: {haplotype_df['AC'].sum()}, "
    f"Chinese: {haplotype_df['AC_C'].sum()}, "
    f"Malay: {haplotype_df['AC_M'].sum()}, "
    f"Indian: {haplotype_df['AC_I'].sum()}"
)

Total haplotype: 2974, Chinese: 2248.0, Malay: 398.0, Indian: 328.0


In [21]:
# Diplotype totals: overall and per ancestry column (AC_C, AC_M, AC_I).
# These sums come from `diplotype_df`, so the label reads "diplotype".
print(
    f"Total diplotype: {diplotype_df['AC'].sum()}, "
    f"Chinese: {diplotype_df['AC_C'].sum()}, "
    f"Malay: {diplotype_df['AC_M'].sum()}, "
    f"Indian: {diplotype_df['AC_I'].sum()}"
)

Total haplotype: 1487, Chinese: 1124.0, Malay: 199.0, Indian: 164.0
