In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from parsers import Cyp2d6Output


# Import the caller output files and parse them into dataframes

In [2]:
sample_paths = Path(
    "/home/jupyter-yusuf/aws-s3-cyp2d6/cyp2d6-caller-outputs/"
).iterdir()


In [3]:
sample_outputs = {}

for path in sample_paths:
    sample_id = path.stem
    outputs = [
        Cyp2d6Output(
            file_path=list(path.glob("cyrius/*.json"))[0],
            caller="cyrius",
            sample_id=sample_id,
        ),
        Cyp2d6Output(
            file_path=list(path.glob("stellarpgx/**/*.alleles"))[0],
            caller="stellarpgx",
            sample_id=sample_id,
        ),
        Cyp2d6Output(
            file_path=list(path.glob("aldy/*.aldy"))[0],
            caller="aldy",
            sample_id=sample_id,
        ),
    ]

    sample_outputs[sample_id] = outputs


In [4]:
caller_dfs = {}

for sample_id in sample_outputs:
    for caller_output in sample_outputs[sample_id]:
        caller = caller_output.caller
        caller_df = caller_dfs.get(caller, None)
        caller_data = {**caller_output.data, "sample_id": sample_id}
        if caller_df is not None:
            caller_dfs[caller] = pd.concat(
                [caller_df, pd.DataFrame([caller_data])], ignore_index=True
            )
        else:
            caller_dfs[caller] = pd.DataFrame([caller_data])

caller_dfs.keys()


dict_keys(['cyrius', 'stellarpgx', 'aldy'])

# Wrangle each caller's data

## Cyrius

In [5]:
cyrius_df = caller_dfs["cyrius"]


In [6]:
cyrius_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852 entries, 0 to 1851
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Coverage_MAD       1850 non-null   float64
 1   Median_depth       1850 non-null   float64
 2   Total_CN           1800 non-null   object 
 3   Spacer_CN          1744 non-null   object 
 4   Total_CN_raw       1850 non-null   float64
 5   Spacer_CN_raw      1850 non-null   float64
 6   Variants_called    1719 non-null   object 
 7   CNV_group          1740 non-null   object 
 8   Genotype           1660 non-null   object 
 9   Filter             1660 non-null   object 
 10  Raw_star_allele    1719 non-null   object 
 11  Call_info          1719 non-null   object 
 12  Exon9_CN           1180 non-null   object 
 13  CNV_consensus      1800 non-null   object 
 14  d67_snp_call       1800 non-null   object 
 15  d67_snp_raw        1800 non-null   object 
 16  Variant_raw_count  1852 

In [7]:
cyrius_df.head()


Unnamed: 0,Coverage_MAD,Median_depth,Total_CN,Spacer_CN,Total_CN_raw,Spacer_CN_raw,Variants_called,CNV_group,Genotype,Filter,Raw_star_allele,Call_info,Exon9_CN,CNV_consensus,d67_snp_call,d67_snp_raw,Variant_raw_count,sample_id
0,0.083,33.673,4,2,4.043,1.905,"[g.42126611C>G, g.42129754G>A, g.42130692G>A]",cn2,*1/*10,PASS,[*1_*10],unique_match,2,22222,"2,2,2,2,None,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2...","2.348,2.367,2.449,2.071,2.533,2.621,2.069,1.96...","{'g.42126611C>G': '14,9', 'g.42126877G>A': '0(...",WHB3374
1,0.071,42.5065,4,3,4.094,3.1,"[g.42126611C>G, g.42126611C>G, g.42129754G>A, ...",exon9hyb_star5,*5/*36+*10,PASS,[*10_*10],unique_match,1,21222,"2,2,2,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...","1.879,1.731,1.758,1.056,1.224,1.209,1.179,0.65...","{'g.42126611C>G': '12,0', 'g.42126877G>A': '0(...",WHB3375
2,0.072,40.3925,5,3,5.111,3.327,"[g.42126611C>G, g.42126611C>G, g.42129754G>A, ...",exon9hyb,*10/*36+*10,PASS,[*10_*10_*36],unique_match,2,22333,"1,None,None,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,...","1.364,1.418,1.449,2.368,2.339,2.243,2.045,2.17...","{'g.42126611C>G': '30,0', 'g.42126877G>A': '0(...",WHB3376
3,0.069,40.166,5,3,5.125,3.051,"[g.42126611C>G, g.42126611C>G, g.42129754G>A, ...",exon9hyb,*10/*36+*10,PASS,[*10_*10_*36],unique_match,2,22333,"2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,...","1.667,1.558,1.603,2.041,2.234,2.172,2.394,1.82...","{'g.42126611C>G': '29,0', 'g.42126877G>A': '0(...",WHB3377
4,0.08,38.052,5,3,4.917,2.747,"[g.42126611C>G, g.42126611C>G, g.42127803C>T, ...",exon9hyb,*41/*36+*10,PASS,[*10_*36_*41],unique_match,2,22333,"2,2,2,None,2,2,None,2,2,3,3,3,3,3,3,3,3,3,3,3,...","2.063,2.083,1.983,2.432,2.171,2.042,2.4,1.557,...","{'g.42126611C>G': '18,0', 'g.42126877G>A': '0(...",WHB3378


In [8]:
cyrius_df["split_gt"] = cyrius_df.Genotype.str.split(";")


In [9]:
cyrius_ready_df = (
    cyrius_df.explode(["split_gt"])
    .loc[:, ["sample_id", "split_gt", "Filter", "Total_CN"]]
    .rename(
        columns={"split_gt": "genotype", "Filter": "filter", "Total_CN": "copy_number"}
    )
)
cyrius_ready_df["caller"] = "cyrius"
cyrius_ready_df["novel_allele"] = False
cyrius_ready_df = cyrius_ready_df.fillna(np.nan)
cyrius_ready_df


Unnamed: 0,sample_id,genotype,filter,copy_number,caller,novel_allele
0,WHB3374,*1/*10,PASS,4.0,cyrius,False
1,WHB3375,*5/*36+*10,PASS,4.0,cyrius,False
2,WHB3376,*10/*36+*10,PASS,5.0,cyrius,False
3,WHB3377,*10/*36+*10,PASS,5.0,cyrius,False
4,WHB3378,*41/*36+*10,PASS,5.0,cyrius,False
...,...,...,...,...,...,...
1847,WHB5465,*1/*1,PASS,4.0,cyrius,False
1848,WHB5466,*1/*36+*36+*10,PASS,6.0,cyrius,False
1849,WHB5467,*1/*21,PASS,4.0,cyrius,False
1850,WHB5468,*1/*1,PASS,4.0,cyrius,False


## StellarPGx

In [10]:
stellarpgx_df = caller_dfs["stellarpgx"]


In [11]:
stellarpgx_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852 entries, 0 to 1851
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   initially_computed_cn      1852 non-null   int64 
 1   sample_core_variants       1852 non-null   object
 2   candidate_alleles          1849 non-null   object
 3   result                     1852 non-null   object
 4   activity_score             1852 non-null   object
 5   metaboliser_status         1852 non-null   object
 6   sample_id                  1852 non-null   object
 7   likely_background_alleles  93 non-null     object
dtypes: int64(1), object(7)
memory usage: 115.9+ KB


In [12]:
stellarpgx_df.head()


Unnamed: 0,initially_computed_cn,sample_core_variants,candidate_alleles,result,activity_score,metaboliser_status,sample_id,likely_background_alleles
0,2,"[42126611~C>G~0/1, 42130692~G>A~0/1]",[1.v1_10.v1],*1/*10,1.25,Normal metaboliser (NM),WHB3374,
1,2,"[42126611~C>G~1/1, 42130692~G>A~1/1]",[10.v1_10.v1],*10/*36,0.25,Intermediate metaboliser (IM),WHB3375,
2,3,"[42126611~C>G~1/1, 42130692~G>A~1/1]",[10.v1_10.v1],*10/*36+*10,0.5,Intermediate metaboliser (IM),WHB3376,
3,3,"[42126611~C>G~1/1, 42130692~G>A~1/1]",[10.v1_10.v1],*10/*36+*10,0.5,Intermediate metaboliser (IM),WHB3377,
4,3,"[42126611~C>G~1/1, 42127803~C>T~0/1, 42127941~...","[10.v1_41.v1, 39.v1_69.v1]",*41/*36+*10,0.75,Intermediate metaboliser (IM),WHB3378,


In [13]:
stellarpgx_df.result.value_counts().sort_index()


*1/*1                                                                                                                                            118
*1/*10                                                                                                                                           104
*1/*10x2                                                                                                                                          15
*1/*10x3                                                                                                                                           2
*1/*113                                                                                                                                            1
                                                                                                                                                ... 
*71/*36+*10                                                                                               

In [14]:
stellarpgx_ready_df = stellarpgx_df.loc[
    :, ["sample_id", "result", "initially_computed_cn"]
].rename(columns={"result": "genotype", "initially_computed_cn": "copy_number"})
stellarpgx_ready_df["filter"] = None
stellarpgx_ready_df["caller"] = "stellarpgx"
stellarpgx_ready_df["genotype"] = stellarpgx_ready_df["genotype"].apply(
    lambda x: np.nan if x == "No_call" else x
)
stellarpgx_ready_df["novel_allele"] = (
    stellarpgx_ready_df["genotype"]
    .str.startswith("Possible novel allele")
    .fillna(False)
)
stellarpgx_ready_df["genotype"].update(
    stellarpgx_df.query("likely_background_alleles.notna()")[
        "likely_background_alleles"
    ].apply(lambda x: x.replace("[", "").replace("]", ""))
)
stellarpgx_ready_df = stellarpgx_ready_df.fillna(np.nan)
stellarpgx_ready_df


Unnamed: 0,sample_id,genotype,copy_number,filter,caller,novel_allele
0,WHB3374,*1/*10,2,,stellarpgx,False
1,WHB3375,*10/*36,2,,stellarpgx,False
2,WHB3376,*10/*36+*10,3,,stellarpgx,False
3,WHB3377,*10/*36+*10,3,,stellarpgx,False
4,WHB3378,*41/*36+*10,3,,stellarpgx,False
...,...,...,...,...,...,...
1847,WHB5465,*1/*1,2,,stellarpgx,False
1848,WHB5466,*1/*36+*10,3,,stellarpgx,False
1849,WHB5467,*1/*21,2,,stellarpgx,False
1850,WHB5468,*1/*1,2,,stellarpgx,False


## Aldy

In [15]:
aldy_df = caller_dfs["aldy"]


In [16]:
aldy_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852 entries, 0 to 1851
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   genotype     1851 non-null   object 
 1   copy_number  1851 non-null   float64
 2   sample_id    1852 non-null   object 
dtypes: float64(1), object(2)
memory usage: 43.5+ KB


In [17]:
aldy_df


Unnamed: 0,genotype,copy_number,sample_id
0,[*1/*10],2.0,WHB3374
1,[*10/*36.ALDY],2.0,WHB3375
2,[*10/*36.ALDY+*10],3.0,WHB3376
3,[*10/*36.ALDY+*10],3.0,WHB3377
4,[*36.ALDY+*10/*41],3.0,WHB3378
...,...,...,...
1847,[*1/*1],2.0,WHB5465
1848,[*1+*36.ALDY/*36.ALDY+*10],4.0,WHB5466
1849,[*1/*21],2.0,WHB5467
1850,[*1/*1],2.0,WHB5468


In [18]:
aldy_ready_df = aldy_df.explode("genotype")
aldy_ready_df["filter"] = None
aldy_ready_df["caller"] = "aldy"
aldy_ready_df["novel_allele"] = (
    aldy_ready_df["genotype"].str.contains("\+rs").fillna(False)
)
aldy_ready_df = aldy_ready_df.fillna(np.nan)
aldy_ready_df


Unnamed: 0,genotype,copy_number,sample_id,filter,caller,novel_allele
0,*1/*10,2.0,WHB3374,,aldy,False
1,*10/*36.ALDY,2.0,WHB3375,,aldy,False
2,*10/*36.ALDY+*10,3.0,WHB3376,,aldy,False
3,*10/*36.ALDY+*10,3.0,WHB3377,,aldy,False
4,*36.ALDY+*10/*41,3.0,WHB3378,,aldy,False
...,...,...,...,...,...,...
1847,*1/*1,2.0,WHB5465,,aldy,False
1848,*1+*36.ALDY/*36.ALDY+*10,4.0,WHB5466,,aldy,False
1849,*1/*21,2.0,WHB5467,,aldy,False
1850,*1/*1,2.0,WHB5468,,aldy,False


# Merge the clean caller outputs

In [19]:
merged_df = (
    pd.concat([cyrius_ready_df, stellarpgx_ready_df, aldy_ready_df])
    .sort_values(["sample_id", "caller"])
    .infer_objects()
)
merged_df


Unnamed: 0,sample_id,genotype,filter,copy_number,caller,novel_allele
0,WHB3374,*1/*10,,2.0,aldy,False
0,WHB3374,*1/*10,PASS,4.0,cyrius,False
0,WHB3374,*1/*10,,2.0,stellarpgx,False
1,WHB3375,*10/*36.ALDY,,2.0,aldy,False
1,WHB3375,*5/*36+*10,PASS,4.0,cyrius,False
...,...,...,...,...,...,...
1850,WHB5468,*1/*1,PASS,4.0,cyrius,False
1850,WHB5468,*1/*1,,2.0,stellarpgx,False
1851,WHB5469,*1/*36.ALDY+*10,,3.0,aldy,False
1851,WHB5469,*1/*36+*10,PASS,5.0,cyrius,False


In [20]:
merged_df.value_counts("sample_id").value_counts()


3    1833
4      18
5       1
dtype: int64

In [21]:
merged_df.caller.value_counts()


aldy          1867
cyrius        1857
stellarpgx    1852
Name: caller, dtype: int64

In [22]:
merged_df.sample_id.nunique()


1852

# Export as TSV and parquet file

In [24]:
output_dir = Path("output")
file_name = "1_caller_outputs"

if not output_dir.exists():
    output_dir.mkdir()

merged_df.to_csv(output_dir / f"{file_name}.tsv", sep="\t", index=False)
merged_df.to_parquet(output_dir / f"{file_name}.parquet", index=False)
