In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import fuc
from fuc import pyvcf
import re

In [4]:
# Set path for the ouputs of the cyp2d6 caller
caller_outputs_path = Path("/home/jupyter-yusuf/aws-s3-cyp2d6/cyp2d6-caller-outputs") 

# Load list of samples with potential novel alleles

file_path = "samples_with_potential_novel_alleles.txt"

with open("samples_with_potential_novel_alleles.txt", "r") as f:
            samples = [line.strip() for line in f.readlines() if len(line) > 0]
sample_list = [name for name in samples]


# Function to parse stellar pgx output and to obtain all the merged vcfs

In [5]:
def parse_output_stellarpgx(stellarpgx_alleles_file):
    file_path = Path(stellarpgx_alleles_file).resolve()
    if not file_path.exists():
        raise FileNotFoundError(file_path)

    with file_path.open("r") as f:
        lines = [line.strip() for line in f.readlines() if len(line) > 0]

    result = []
    for i, line in enumerate(lines):
        if line.startswith("Likely"):
            result.append(lines[i + 1])
            
        elif line.startswith("Initially"):
            result.append(lines[i][-1])
            
        elif line.startswith("Candidate"):
            result.append(lines[i + 1].strip())
        
        elif line.startswith("Sample"):
            result.append(lines[i + 1:i + 2])
            
    return result

In [6]:
parsed_output = {}
vcfs = []

for sample in sample_list:
    parsed_output[sample] = {}
    stellarpgx_files = [file for file in caller_outputs_path.glob(f"{sample}/**/*.alleles")]
    stellarpgx_files_vcf = [file for file in caller_outputs_path.glob(f'{sample}/**/*.vcf.gz')]

    for file in stellarpgx_files:
        result = parse_output_stellarpgx(file)
        parsed_output[sample]["copy_number"] = result[0]
        parsed_output[sample]["core_allele"] = result[1]
        parsed_output[sample]["candidate_allele"] = result[2]
        parsed_output[sample]["likely_background_allele"] = result[-1]

    for file in stellarpgx_files_vcf:       
        vcf = pyvcf.VcfFrame.from_file(file.as_posix())        
        vcfs.append(vcf)

        


In [7]:
novel_df = pd.DataFrame(parsed_output).T.reset_index(names='samples')
novel_df.head()

Unnamed: 0,samples
0,WHB3386
1,WHB3440
2,WHB3461
3,WHB3484
4,WHB3488


In [6]:
# Creating new columns for haplotypes
novel_df['core_alleles_comparison'] = novel_df['core_allele'].apply(lambda x: re.findall("\d{8}~\w+>\w+", str(x))) # Core allele withour genotype
novel_df['likely_background_allele'] = novel_df['likely_background_allele'].apply(lambda x: re.findall("\*\d+", x))
novel_df[['h1','h2']] = novel_df['likely_background_allele'].values.tolist()

In [7]:
novel_df['core_alleles_comparison']

0     [42126611~C>G, 42126877~G>A, 42127803~C>T, 421...
1                                        [42130692~G>A]
2     [42126611~C>G, 42127941~G>A, 42129779~A>G, 421...
3            [42126611~C>G, 42127853~G>A, 42130692~G>A]
4                                        [42130692~G>A]
                            ...                        
88                                       [42130692~G>A]
89           [42126611~C>G, 42127853~G>A, 42130692~G>A]
90           [42126611~C>G, 42127841~C>T, 42130692~G>A]
91                                       [42130692~G>A]
92                                       [42128341~C>T]
Name: core_alleles_comparison, Length: 93, dtype: object

# 2. Obtain the core variants of each haplotype from reference

In [8]:
# Reference allele core variants
ref_df = pd.read_csv('CYP2D6.NC_000022.11.haplotypes.tsv', sep = '\t', skiprows= 1, usecols=['Haplotype Name','Variant Start','Reference Allele','Variant Allele'])

In [9]:
# Clean up the ref_dference allele core variants df
ref_df['var'] = ref_df['Variant Start'].map(str) + "~" + ref_df['Reference Allele'].map(str) + ">" + ref_df["Variant Allele"].map(str)
ref_df = ref_df.rename(columns = {'Haplotype Name':'haplotype'})

# To keep only major allele core variants
remove = ref_df.loc[ref_df['haplotype'].str.contains("\.")]
ref_df.drop(remove.index, inplace =True)
ref_df['haplotype'] = ref_df['haplotype'].str.replace("CYP2D6", "")

# Change to look up data frame to dictionary
ref_dict = ref_df.groupby('haplotype').agg({'var': lambda x: ','.join(x)}).to_dict()['var']


In [10]:
# Adding the core variants of haplotypes to the novel_df

novel_df['h1_core_var'] = novel_df['h1'].apply(lambda x: ref_dict.get(x)).fillna(np.nan)
novel_df['h2_core_var'] = novel_df['h2'].apply(lambda x: ref_dict.get(x)).fillna('[]')



In [11]:
novel_df.head()

Unnamed: 0,samples,copy_number,core_allele,candidate_allele,likely_background_allele,core_alleles_comparison,h1,h2,h1_core_var,h2_core_var
0,WHB3386,2,[42126611~C>G~1/1;42126877~G>A~0/1;42127803~C>...,[136.v1_4.v1],"[*136, *4]","[42126611~C>G, 42126877~G>A, 42127803~C>T, 421...",*136,*4,"42127941~G>A,42126877~G>A,42126611~C>G",42128945~C>T
1,WHB3440,3,[42130692~G>A~0/1],[10.v1_39.v1],"[*10, *39]",[42130692~G>A],*10,*39,"42130692~G>A,42126611~C>G",42126611~C>G
2,WHB3461,2,[42126611~C>G~0/1;42127941~G>A~0/1;42129779~A>...,[34.v1_88.v1],"[*34, *88]","[42126611~C>G, 42127941~G>A, 42129779~A>G, 421...",*34,*88,42127941~G>A,"42129779~A>G,42126611~C>G"
3,WHB3484,3,[42126611~C>G~0/1;42127853~G>A~0/1;42130692~G>...,[1.v1_10.v1],"[*1, *10]","[42126611~C>G, 42127853~G>A, 42130692~G>A]",*1,*10,.~nan>nan,"42130692~G>A,42126611~C>G"
4,WHB3488,3,[42130692~G>A~0/1],[10.v1_39.v1],"[*10, *39]",[42130692~G>A],*10,*39,"42130692~G>A,42126611~C>G",42126611~C>G


In [12]:
novel_df['h2_core_var'].isna().sum()

0

# 3. Add additional variant column

In [13]:

add_var = []
for i in range(novel_df.shape[0]):
  try:
    var = [x for x in novel_df['core_alleles_comparison'][i] if x not in novel_df['h2_core_var'][i] and x not in novel_df['h1_core_var'][i]]
  except TypeError:
    var = []
  add_var.append(var)

novel_df['add_var'] = add_var

# 4.1 Merge vcfs and obtain AD information

In [14]:

novel_vcf = pyvcf.rescue_filtered_variants(vcfs, format = "GT:AD").df

# Restructure the POS value to match the novel_df
novel_vcf['POS'] = novel_vcf['POS'].map(str) + "~" + novel_vcf['REF'].map(str) + ">" + novel_vcf["ALT"].map(str)

novel_vcf = novel_vcf.replace('./.:.', '').set_index('POS')

novel_vcf.head()

Unnamed: 0_level_0,CHROM,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,WHB3386,WHB3440,...,WHB5237,WHB5352,WHB5357,WHB5358,WHB5383,WHB5384,WHB5419,WHB5444,WHB5449,WHB5450
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42126310~C>T,chr22,.,C,T,.,.,.,GT:AD,,,...,,,,,"0/1:8,11",,,,,
42126347~T>C,chr22,.,T,C,.,.,.,GT:AD,,,...,,,,,,,,,,
42126390~G>A,chr22,.,G,A,.,.,.,GT:AD,"0/1:8,13","0/1:9,13",...,"0/1:6,11","1/1:1,25",,,,"0/1:9,9","0/1:7,17","0/1:6,8","0/1:6,10",
42126611~C>G,chr22,.,C,G,.,.,.,GT:AD,"1/1:0,19",,...,,"1/1:0,22",,,"0/1:18,13",,"0/1:4,9","0/1:18,14",,
42126741~A>G,chr22,.,A,G,.,.,.,GT:AD,,,...,,,,,,,,,,


# 4.2 Query AD information from novel_vcf

In [15]:
ad_list = []
for index, row in novel_df.iterrows():
    sample = row['samples']
    vars = row['add_var']
    try :
        value = novel_vcf[sample][vars].to_list()
        ad_list.append(value)
    except KeyError:
        ad_list.append('LowABHet or LowQUAL')
    



novel_df['GT:AD'] = ad_list

In [17]:
novel_df.to_csv('novel_alleles.csv', index = False)