<div style="text-align: justify; padding:5px; background-color:rgb(252, 253, 255); border: 1px solid lightgrey; padding-left: 1em; padding-right: 1em;">
    <font color='red'>Mini Jupyter tutorial<br><br>To run each cell, click the cell and press <kbd>Run</kbd> from the menu bar. This will run any Python code or display any text within the selected cell before highlighting the next cell down. There are two types of cell: a <i>text cell</i> of type <kbd>Markdown</kbd> or <kbd>Heading</kbd> and a <i>code cell</i> of type <kbd>Code</kbd>, identifiable by the <span style="font-family: courier; color:black; background-color:white;">In[ ]:</span> to the left of the cell. The type of cell is also identifiable from the dropdown menu in the above menu bar to the right of <kbd>Run</kbd>. Any visual results produced by the code (text/figures) are displayed directly below that cell. Press <kbd>Run</kbd> again until you reach the end of the notebook or alternatively click <kbd>Kernel</kbd><font color='black'>→</font><kbd>Restart and Run All</kbd>. Should the Jupyter notebook crash for any reason, restart the Jupyter Kernel by clicking <kbd>Kernel</kbd><font color='black'>→</font><kbd>Restart</kbd>, and start again from the top.</font>
        
</div>

# Comparison of variants between individuals to generate a couples report

<p style="text-align: justify">
<br>
    This workflow parses and compares two annotated variant CSV files and generates a couples report.
</p>

In [1]:
import pandas as pd
import numpy as np
from natsort import index_natsorted
from number_parser import parse_ordinal
import os
import glob

<div style="background-color:rgb(255, 250, 250); padding:5px;  border: 1px solid lightgrey; padding-left: 1em; padding-right: 1em;">

<h3 style="text-align: justify">Given a CSV file listing couples in folder <kbd>input_couples_csv</kbd>, the individuals in each couple are compared and a report CSV is generated for each couple</h3>
</div>

In [2]:
# Couples manifest. The report loop further down reads the first two columns
# of each row as the sample names of the two partners.
couples = pd.read_csv('input_couples_csv/couples.csv')

# ClassificationOverall values that make a variant eligible for the report.
pathogenic_panel = ['Pathogenic', 'Likely pathogenic', 'Suspected pathogenic', 'Assess further']
# Genes whose reportable variants are tagged with the 'Core Gene' reason.
core_gene_panel = ['CFTR', 'SMN1', 'HBA1', 'HBA2', 'HBA1/HBA2', 'HBB']
# Genes whose reportable variants are tagged 'Potential Additional Finding'.
additional_finding_gene_panel = ['ATM', 'LDLR', 'FH', 'TTN']

In [3]:
def custom_sort(col: pd.Series) -> pd.Series:
    """Sort-key callback intended for ``DataFrame.sort_values(..., key=custom_sort)``.

    Args:
        col: One of the columns being sorted on.

    Returns:
        For the column named "Patient ID", the result of applying
        ``parse_ordinal`` to every element; any other column is returned
        unchanged so that it sorts on its own values.
    """
    # NOTE(review): `index_natsorted` is imported at the top of the notebook but
    # never used; confirm `parse_ordinal` yields sortable keys for the actual
    # Patient ID format.
    is_patient_id = col.name == "Patient ID"
    return col.apply(parse_ordinal) if is_patient_id else col

In [4]:
path = os.getcwd()

# Columns that must all agree for a variant to count as carried by both partners.
variant_key_columns = ['Gene', 'Transcript', 'Start', 'Variant', 'ClassificationOverall']


def _find_annotated_csv(sample_name: str) -> str:
    """Return the path of the annotated CSV whose file name contains ``sample_name``.

    Args:
        sample_name: Text that must appear in the file name.

    Returns:
        The first matching path (in sorted order) under ``output_annotated_csv``.

    Raises:
        FileNotFoundError: If no file matches.
    """
    pattern = os.path.join(path, "output_annotated_csv/*" + sample_name + "*.csv")
    # Sorted so the choice is reproducible when several files match.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError("No annotated CSV found matching " + pattern)
    return matches[0]


def _shared_pathogenic_variants(individual_1: pd.DataFrame, individual_2: pd.DataFrame) -> pd.DataFrame:
    """Return the distinct reportable variants present in both partners.

    Two rows are the same variant when every column in ``variant_key_columns``
    matches. Only variants whose ClassificationOverall is in
    ``pathogenic_panel`` are kept.
    """
    shared = pd.merge(
        individual_1[variant_key_columns],
        individual_2[variant_key_columns],
        on=variant_key_columns,
    ).drop_duplicates()
    is_reportable = shared['ClassificationOverall'].isin(pathogenic_panel)
    return shared[is_reportable].reset_index(drop=True)


def _assess_individual(individual: pd.DataFrame, shared_variants: pd.DataFrame) -> pd.DataFrame:
    """Build the report rows for one partner.

    Every mask below is computed on the same frame the report rows are taken
    from, so a reason can never be attached to the wrong variant.

    Args:
        individual: Annotated variants of one partner.
        shared_variants: Output of ``_shared_pathogenic_variants`` for the couple.

    Returns:
        One row per (variant, reason) pair, grouped by reason in the order the
        reasons are declared below. A variant meeting several criteria appears
        once per criterion. Empty when nothing qualifies.
    """
    variants = individual.drop_duplicates().reset_index(drop=True)
    is_pathogenic = variants['ClassificationOverall'].isin(pathogenic_panel)

    # Distinct reportable (Gene, Variant) pairs per gene; genes with none get 0.
    variants_per_gene = (
        variants.loc[is_pathogenic, ['Gene', 'Variant']]
        .drop_duplicates()
        .groupby('Gene')
        .size()
    )
    gene_variant_count = variants['Gene'].map(variants_per_gene).fillna(0)

    # Match on the full variant key rather than on each column independently,
    # so a row is flagged only when that exact variant is carried by both
    # partners. A left merge keeps the row order and, because `shared_variants`
    # is unique on the key, the row count of `variants`.
    shared_lookup = variants[variant_key_columns].merge(
        shared_variants[variant_key_columns],
        on=variant_key_columns,
        how='left',
        indicator=True,
    )
    is_shared = (shared_lookup['_merge'] == 'both').to_numpy()

    reason_masks = {
        'Potential Affected (HOM)': is_pathogenic & (variants['Zygosity'] == 'HOMOZYGOTE'),
        'Potential Affected (COMPHET)': is_pathogenic & (gene_variant_count >= 2),
        'Potential X-linked': is_pathogenic & (variants['Chromosome'] == 'X'),
        'Core Gene': is_pathogenic & variants['Gene'].isin(core_gene_panel),
        'Potential Additional Finding': is_pathogenic & variants['Gene'].isin(additional_finding_gene_panel),
        'Potential Couple at Risk': is_shared,
    }

    # Value after the first '=' in the first ';'-separated field of 'Variant QC'
    # (presumably the allele frequency entry -- TODO confirm against the
    # annotation format). Missing or malformed entries become NaN.
    allele_frequency = variants['Variant QC'].str.split(';').str[0].str.split('=').str[1]

    report_rows = pd.DataFrame({
        'Patient ID': variants['Sample#'],
        'Genomic Coordinates': variants['Genomic Coordinates'],
        'Ref': variants['Ref'],
        'Alt': variants['Alt'],
        'Gene': variants['Gene'],
        'Variant Change': variants['Variant'],
        'Transcript': variants['Transcript'],
        'Variant Protein': variants['Protein Change'],
        'Zygosity': variants['Zygosity'],
        'Allele Frequency': allele_frequency,
        'ClassificationOverall': variants['ClassificationOverall'],
    })

    # Collect row positions per reason and slice once, instead of growing a
    # frame with repeated concatenation and filtering out placeholder rows.
    positions = []
    reasons = []
    for reason, mask in reason_masks.items():
        hits = np.flatnonzero(np.asarray(mask, dtype=bool))
        positions.extend(hits.tolist())
        reasons.extend([reason] * len(hits))

    flagged = report_rows.iloc[positions].reset_index(drop=True)
    flagged['Reason'] = reasons
    return flagged


os.makedirs("output_couples_report", exist_ok=True)

for index, couple_row in couples.iterrows():
    # The first two columns of the manifest hold the partners' sample names.
    individual_1_file = str(couple_row.iloc[0])
    individual_2_file = str(couple_row.iloc[1])

    individual_1 = pd.read_csv(_find_annotated_csv(individual_1_file))
    individual_2 = pd.read_csv(_find_annotated_csv(individual_2_file))

    # Identify common variants that are pathogenic between couples
    couple_variants = _shared_pathogenic_variants(individual_1, individual_2)

    # Filter and assess variants from each partner
    individual_reports = [
        _assess_individual(individual, couple_variants)
        for individual in (individual_1, individual_2)
    ]
    non_empty_reports = [rows for rows in individual_reports if not rows.empty]

    if non_empty_reports:
        report = pd.concat(non_empty_reports, ignore_index=True)
        report['Analyst comment'] = ''
        report = report.sort_values(by=['Patient ID', 'Gene'], key=custom_sort)
    else:
        # Nothing to report: emit a single placeholder row carrying the comment.
        report_columns = list(individual_reports[0].columns) + ['Analyst comment']
        report = pd.DataFrame(
            [{'Analyst comment': 'No Variants for Reporting'}],
            columns=report_columns,
        )

    output_file = "output_couples_report/"+ individual_1_file +"_"+ individual_2_file +"_couples_report.csv"
    report.to_csv(output_file, index=False)