In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from parsers import Cyp2d6Output


# Import the caller output files and parse them into dataframes

In [2]:
sample_paths = Path(
    "/Users/torojr/Projects/nalagenetics/SG10K-CYP2D6/_data/caller-outputs/"
).iterdir()


In [3]:
sample_outputs = {}

for path in sample_paths:
    sample_id = path.stem
    outputs = [
        Cyp2d6Output(
            file_path=list(path.glob("cyrius/*.json"))[0],
            caller="cyrius",
            sample_id=sample_id,
        ),
        Cyp2d6Output(
            file_path=list(path.glob("stellarpgx/**/*.alleles"))[0],
            caller="stellarpgx",
            sample_id=sample_id,
        ),
        Cyp2d6Output(
            file_path=list(path.glob("aldy/*.aldy"))[0],
            caller="aldy",
            sample_id=sample_id,
        ),
    ]

    sample_outputs[sample_id] = outputs


In [4]:
caller_dfs = {}

for sample_id in sample_outputs:
    for caller_output in sample_outputs[sample_id]:
        caller = caller_output.caller
        caller_df = caller_dfs.get(caller, None)
        caller_data = {**caller_output.data, "sample_id": sample_id}
        if caller_df is not None:
            caller_dfs[caller] = pd.concat(
                [caller_df, pd.DataFrame([caller_data])], ignore_index=True
            )
        else:
            caller_dfs[caller] = pd.DataFrame([caller_data])

caller_dfs.keys()


dict_keys(['cyrius', 'stellarpgx', 'aldy'])

# Wrangle each caller's data

## Cyrius

In [5]:
cyrius_df = caller_dfs["cyrius"]


In [6]:
cyrius_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852 entries, 0 to 1851
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Coverage_MAD       1850 non-null   float64
 1   Median_depth       1850 non-null   float64
 2   Total_CN           1800 non-null   object 
 3   Spacer_CN          1744 non-null   object 
 4   Total_CN_raw       1850 non-null   float64
 5   Spacer_CN_raw      1850 non-null   float64
 6   Variants_called    1719 non-null   object 
 7   CNV_group          1740 non-null   object 
 8   Genotype           1660 non-null   object 
 9   Filter             1660 non-null   object 
 10  Raw_star_allele    1719 non-null   object 
 11  Call_info          1719 non-null   object 
 12  Exon9_CN           1180 non-null   object 
 13  CNV_consensus      1800 non-null   object 
 14  d67_snp_call       1800 non-null   object 
 15  d67_snp_raw        1800 non-null   object 
 16  Variant_raw_count  1852 

In [7]:
cyrius_df.head()


Unnamed: 0,Coverage_MAD,Median_depth,Total_CN,Spacer_CN,Total_CN_raw,Spacer_CN_raw,Variants_called,CNV_group,Genotype,Filter,Raw_star_allele,Call_info,Exon9_CN,CNV_consensus,d67_snp_call,d67_snp_raw,Variant_raw_count,sample_id
0,0.108,31.1815,3,2,3.197,1.782,,,,,,,,"2,1,2,2,None","None,None,2,1,2,2,None,1,None,None,None,2,2,No...","1.438,1.468,1.5,1.382,1.667,1.781,1.56,1.523,1...","{'g.42126611C>G': '30,0', 'g.42126877G>A': '0(...",WHB4244
1,0.075,35.8625,5,3,5.305,2.887,"[g.42126611C>G, g.42129754G>A, g.42129754G>A, ...",exon9hyb,*1/*36+*10,PASS,[*1_*10_*36],unique_match,2.0,22333,"2,2,2,2,2,2,2,2,2,3,None,3,3,3,3,3,3,3,None,3,...","1.866,1.818,1.901,1.941,2.154,2.292,2.113,1.75...","{'g.42126611C>G': '14,14', 'g.42126877G>A': '0...",WHB4076
2,0.078,38.505,5,3,4.931,2.932,"[g.42126611C>G, g.42126611C>G, g.42127803C>T, ...",exon9hyb,*41/*36+*10,PASS,[*10_*36_*41],unique_match,2.0,22333,"2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,...","1.929,1.776,1.731,1.742,2.029,1.867,2.262,2.04...","{'g.42126611C>G': '29,0', 'g.42126877G>A': '0(...",WHB3637
3,0.075,40.3925,4,2,4.132,2.027,"[g.42126611C>G, g.42126611C>G, g.42127803C>T, ...",cn2,*10/*41,PASS,[*10_*41],unique_match,2.0,22222,"2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...","1.797,1.758,1.75,2.215,1.97,2.085,2.356,2.085,...","{'g.42126611C>G': '34,1', 'g.42126877G>A': '0(...",WHB4412
4,0.076,44.9225,5,2,4.907,1.846,[],cn3,*1/*1x2,PASS,[*1_*1_*1],unique_match,,23333,"2,None,2,3,3,3,3,3,None,3,3,3,3,3,3,3,3,3,3,No...","1.689,1.419,1.6,3.029,3.109,3.173,3.298,2.837,...","{'g.42126611C>G': '0,58', 'g.42126877G>A': '0(...",WHB4620


In [8]:
cyrius_df["split_gt"] = cyrius_df.Genotype.str.split(";")


In [9]:
cyrius_ready_df = (
    cyrius_df.explode(["split_gt"])
    .loc[:, ["sample_id", "split_gt", "Filter", "Total_CN"]]
    .rename(
        columns={"split_gt": "genotype", "Filter": "filter", "Total_CN": "copy_number"}
    )
)
cyrius_ready_df["caller"] = "cyrius"
cyrius_ready_df["novel_allele"] = False
cyrius_ready_df = cyrius_ready_df.fillna(np.nan)
cyrius_ready_df


Unnamed: 0,sample_id,genotype,filter,copy_number,caller,novel_allele
0,WHB4244,,,3.0,cyrius,False
1,WHB4076,*1/*36+*10,PASS,5.0,cyrius,False
2,WHB3637,*41/*36+*10,PASS,5.0,cyrius,False
3,WHB4412,*10/*41,PASS,4.0,cyrius,False
4,WHB4620,*1/*1x2,PASS,5.0,cyrius,False
...,...,...,...,...,...,...
1847,WHB4200,*1/*36+*10,PASS,5.0,cyrius,False
1848,WHB3819,*1/*36+*10,PASS,5.0,cyrius,False
1849,WHB4032,*10/*36+*10,PASS,5.0,cyrius,False
1850,WHB4831,*36+*10/*36+*10,PASS,6.0,cyrius,False


## StellarPGx

In [10]:
stellarpgx_df = caller_dfs["stellarpgx"]


In [11]:
stellarpgx_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852 entries, 0 to 1851
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   initially_computed_cn      1852 non-null   int64 
 1   sample_core_variants       1852 non-null   object
 2   candidate_alleles          1849 non-null   object
 3   result                     1852 non-null   object
 4   activity_score             1852 non-null   object
 5   metaboliser_status         1852 non-null   object
 6   sample_id                  1852 non-null   object
 7   likely_background_alleles  93 non-null     object
dtypes: int64(1), object(7)
memory usage: 115.9+ KB


In [12]:
stellarpgx_df.head()


Unnamed: 0,initially_computed_cn,sample_core_variants,candidate_alleles,result,activity_score,metaboliser_status,sample_id,likely_background_alleles
0,3,"[42126611~C>G~1/1, 42129180~A>T~0/1, 42130692~...",[10.v1_49.v1],*10/*49x2,1.25,Normal metaboliser (NM),WHB4244,
1,3,"[42126611~C>G~0/1, 42130692~G>A~0/1]",[1.v1_10.v1],*1/*36+*10,1.25,Normal metaboliser (NM),WHB4076,
2,3,"[42126611~C>G~1/1, 42127803~C>T~0/1, 42127941~...","[10.v1_41.v1, 39.v1_69.v1]",*39/*69x2,1.0,Intermediate metaboliser (IM),WHB3637,
3,2,"[42126611~C>G~1/1, 42127803~C>T~0/1, 42127941~...","[10.v1_41.v1, 39.v1_69.v1]",*10/*41,0.75,Intermediate metaboliser (IM),WHB4412,
4,3,[],[1.v1_1.v1],*1/*1x2,3.0,Ultrarapid metaboliser (UM),WHB4620,


In [13]:
stellarpgx_df.result.value_counts().sort_index()


result
*1/*1                                                                                                                                            118
*1/*10                                                                                                                                           104
*1/*10x2                                                                                                                                          15
*1/*10x3                                                                                                                                           2
*1/*113                                                                                                                                            1
                                                                                                                                                ... 
*71/*36+*10                                                                                        

In [14]:
stellarpgx_ready_df = stellarpgx_df.loc[
    :, ["sample_id", "result", "initially_computed_cn"]
].rename(columns={"result": "genotype", "initially_computed_cn": "copy_number"})
stellarpgx_ready_df["filter"] = None
stellarpgx_ready_df["caller"] = "stellarpgx"
stellarpgx_ready_df["genotype"] = stellarpgx_ready_df["genotype"].apply(
    lambda x: np.nan if x == "No_call" else x
)
stellarpgx_ready_df["novel_allele"] = (
    stellarpgx_ready_df["genotype"]
    .str.startswith("Possible novel allele")
    .fillna(False)
)
stellarpgx_ready_df["genotype"].update(
    stellarpgx_df.query("likely_background_alleles.notna()")[
        "likely_background_alleles"
    ].apply(lambda x: x.replace("[", "").replace("]", ""))
)
stellarpgx_ready_df = stellarpgx_ready_df.fillna(np.nan)
stellarpgx_ready_df


Unnamed: 0,sample_id,genotype,copy_number,filter,caller,novel_allele
0,WHB4244,*10/*49x2,3,,stellarpgx,False
1,WHB4076,*1/*36+*10,3,,stellarpgx,False
2,WHB3637,*39/*69x2,3,,stellarpgx,False
3,WHB4412,*10/*41,2,,stellarpgx,False
4,WHB4620,*1/*1x2,3,,stellarpgx,False
...,...,...,...,...,...,...
1847,WHB4200,*1/*36+*10,3,,stellarpgx,False
1848,WHB3819,*10/*39,3,,stellarpgx,True
1849,WHB4032,*10/*36+*10,3,,stellarpgx,False
1850,WHB4831,*10/*36+*10,3,,stellarpgx,False


## Aldy

In [15]:
aldy_df = caller_dfs["aldy"]


In [16]:
aldy_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852 entries, 0 to 1851
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   genotype     1851 non-null   object 
 1   copy_number  1851 non-null   float64
 2   sample_id    1852 non-null   object 
dtypes: float64(1), object(2)
memory usage: 43.5+ KB


In [17]:
aldy_df


Unnamed: 0,genotype,copy_number,sample_id
0,[*10/*49],2.0,WHB4244
1,[*1/*36.ALDY+*36.ALDY],3.0,WHB4076
2,[*36.ALDY+*10/*41],3.0,WHB3637
3,[*10/*41],2.0,WHB4412
4,[*1/*1+*1],3.0,WHB4620
...,...,...,...
1847,[*1/*36.ALDY+*10],3.0,WHB4200
1848,[*1/*36.ALDY+*10],3.0,WHB3819
1849,[*10/*36.ALDY+*10],3.0,WHB4032
1850,[*10/*36.ALDY+*10],3.0,WHB4831


In [18]:
aldy_ready_df = aldy_df.explode("genotype")
aldy_ready_df["filter"] = None
aldy_ready_df["caller"] = "aldy"
aldy_ready_df["novel_allele"] = (
    aldy_ready_df["genotype"].str.contains("\+rs").fillna(False)
)
aldy_ready_df = aldy_ready_df.fillna(np.nan)
aldy_ready_df


Unnamed: 0,genotype,copy_number,sample_id,filter,caller,novel_allele
0,*10/*49,2.0,WHB4244,,aldy,False
1,*1/*36.ALDY+*36.ALDY,3.0,WHB4076,,aldy,False
2,*36.ALDY+*10/*41,3.0,WHB3637,,aldy,False
3,*10/*41,2.0,WHB4412,,aldy,False
4,*1/*1+*1,3.0,WHB4620,,aldy,False
...,...,...,...,...,...,...
1847,*1/*36.ALDY+*10,3.0,WHB4200,,aldy,False
1848,*1/*36.ALDY+*10,3.0,WHB3819,,aldy,False
1849,*10/*36.ALDY+*10,3.0,WHB4032,,aldy,False
1850,*10/*36.ALDY+*10,3.0,WHB4831,,aldy,False


# Merge the clean caller outputs

In [19]:
merged_df = (
    pd.concat([cyrius_ready_df, stellarpgx_ready_df, aldy_ready_df])
    .sort_values(["sample_id", "caller"])
    .infer_objects()
)
merged_df


Unnamed: 0,sample_id,genotype,filter,copy_number,caller,novel_allele
281,WHB3374,*1/*10,,2.0,aldy,False
281,WHB3374,*1/*10,PASS,4.0,cyrius,False
281,WHB3374,*1/*10,,2.0,stellarpgx,False
363,WHB3375,*10/*36.ALDY,,2.0,aldy,False
363,WHB3375,*5/*36+*10,PASS,4.0,cyrius,False
...,...,...,...,...,...,...
1580,WHB5468,*1/*1,PASS,4.0,cyrius,False
1580,WHB5468,*1/*1,,2.0,stellarpgx,False
1511,WHB5469,*1/*36.ALDY+*10,,3.0,aldy,False
1511,WHB5469,*1/*36+*10,PASS,5.0,cyrius,False


In [20]:
merged_df.value_counts("sample_id").value_counts()


count
3    1833
4      18
5       1
Name: count, dtype: int64

In [21]:
merged_df.caller.value_counts()


caller
aldy          1867
cyrius        1857
stellarpgx    1852
Name: count, dtype: int64

In [22]:
merged_df.sample_id.nunique()


1852

# Export as TSV and parquet file

In [23]:
output_dir = Path("output")
file_name = "1_caller_outputs"

if not output_dir.exists():
    output_dir.mkdir()

merged_df.to_csv(output_dir / f"{file_name}.tsv", sep="\t", index=False)
merged_df.to_parquet(output_dir / f"{file_name}.parquet", index=False)
