This notebook provides a table with the altered driver genes in primary samples of ALL (Additional file 1 Figure S2). At the end of the notebook there are also some chi-square tests to check for differences between groups. The resulting statistics and p-values are then provided in the text.

In [None]:
import pandas as pd
import os
import numpy as np
import glob
from collections import defaultdict

from scipy.stats import chi2_contingency
from scipy.stats import chi2

import seaborn
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle

from aux_data_in_pyvar import config_rcparams,COLORS_SUBTYPES,COLORS_AGES_TALL

# Show every row and column, and never truncate cell contents, when a
# DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None is the supported "no limit" value for max_colwidth; the former -1 has
# been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [None]:
# Apply the project-wide plotting configuration imported from aux_data_in_pyvar
# (presumably matplotlib rcParams, judging by the name -- confirm in that module).
config_rcparams()

def stack_barplot(df, ax_grid, subtype_labels):
    """Draw one horizontal stacked bar of patient counts on ``ax_grid``.

    Parameters
    ----------
    df : DataFrame with the columns 'SUBTYPE_LABEL' and 'NUM PATIENTS'.
    ax_grid : axes-like object on which ``barh`` is called.
    subtype_labels : iterable of subtype labels; it fixes the stacking order.

    Returns
    -------
    The same ``ax_grid`` object that was passed in.

    For every label with at least one matching row, a segment as wide as the
    'NUM PATIENTS' value of the first matching row is drawn, starting where
    the previous segment ended and coloured with COLORS_SUBTYPES[label].
    Labels without matching rows are skipped.
    """
    bar_end = 0

    for label in subtype_labels:
        rows = df[df['SUBTYPE_LABEL'] == label]
        if rows.empty:
            continue

        # only the first matching row contributes to the bar
        n_patients = rows['NUM PATIENTS'].iloc[0]
        ax_grid.barh(0, n_patients, color=COLORS_SUBTYPES[label],
                     edgecolor='white', height=1, left=bar_end)
        bar_end = bar_end + n_patients

    return ax_grid

In [None]:
# Literature-curated list of genes (tab-separated file).
# Its SYMBOL column is used further down to tag genes as 'literature'
# in the driver dictionary.
df_list_lite = pd.read_csv("../ext_files/literature/mutations_lite.tsv", sep='\t')
df_list_lite.head()

In [None]:
# Load the candidate driver mutations.
# The input table is the result of running
# ../processing/driver_mutations_primary_ALL.ipynb
df_all = (
    pd.read_csv("candidate_driver_muts.tsv", sep='\t')
    .assign(
        # mutations without an annotated pathway are pooled under 'other'
        PATHWAY=lambda muts: muts['PATHWAY'].fillna('other'),
        # show the PHALL cohort as 'Ph positive' in both subtype columns
        SUBTYPE=lambda muts: muts['SUBTYPE'].str.replace('PHALL', 'Ph positive'),
        SUBTYPE_LABEL=lambda muts: muts['SUBTYPE_LABEL'].str.replace('PHALL', 'Ph positive'),
    )
    # only primary-stage records are used in this notebook
    .loc[lambda muts: muts['STAGE'] == 'primary']
)

# Candidate driver genes detected per subtype
# (available as Additional file 2 Table S3).
df_genes = pd.read_csv("cancer_genes_ALL.csv", sep='\t')

In [None]:
# FILTER BY STAGE. THIS PLOT IS A LANDSCAPE OF PRIMARY ALL SAMPLES
# Records without an age range cannot be placed in the age track and are dropped.
df_all_pry = df_all[df_all['STAGE'] == 'primary']
df_all_pry = df_all_pry[~df_all_pry['AGE_RANGE'].isnull()]


# SORT COLUMNS BY COHORT AND AGE RANGES
# Left-to-right order of the subtype blocks in the figure.
order_subtypes = ['TALL Adult','TALL Pediatric', 'BALL Pediatric', 'DUX4-ERG', 'Hypodiploid',
                 'Hyperdiploid', 'Ph-like', 'Ph positive','iAMP21', 'Infant MLL-R']

grps_lables = df_all_pry.groupby('SUBTYPE_LABEL')

# One sorted frame per subtype label, in the order given above.
# - sort_values returns a new frame, so the group handed out by the groupby
#   is never modified in place;
# - labels that are absent from the data are skipped instead of raising
#   KeyError;
# - the frames are combined with a single pd.concat because DataFrame.append
#   was removed in pandas 2.0.
sorted_blocks = [
    grps_lables.get_group(label).sort_values(by=["SUBTYPE", "AGE_RANGE"], ascending=False)
    for label in order_subtypes
    if label in grps_lables.groups
]

if sorted_blocks:
    df_all_pry = pd.concat(sorted_blocks, ignore_index=True)
else:
    # nothing matched: keep an empty frame that still has all the columns
    df_all_pry = df_all_pry.iloc[0:0]

# create list of sorted samples (COMPARISON identifiers, in plotting order)
samples = df_all_pry[['SUBTYPE_LABEL', 'AGE_RANGE', 'COMPARISON']].drop_duplicates()['COMPARISON'].tolist()

In [None]:
# CREATE DICTIONARY WITH GENE AS KEY AND SUBTYPES AS VALUES
# This helps to distinguish between mutations in driver genes detected in the
# cohort by intogen and mutations in driver genes that are only "known" from
# the literature.

# A gene listed under a pooled ALL pediatric label counts for both the TALL
# and the BALL pediatric label of the same stage.
pooled_cohorts = {
    'ALL_Pediatric_WXS_pry': ['TALL_Pediatric_WXS_pry', 'BALL_Pediatric_WXS_pry'],
    'ALL_Pediatric_WXS_rel': ['TALL_Pediatric_WXS_rel', 'BALL_Pediatric_WXS_rel'],
}

dicc_drivers = defaultdict(list)

for symbol, cohort in zip(df_genes['SYMBOL'], df_genes['SUBTYPE']):
    # any other label is stored unchanged
    dicc_drivers[symbol].extend(pooled_cohorts.get(cohort, [cohort]))

# complete with literature
for symbol in df_list_lite['SYMBOL']:
    dicc_drivers[symbol].append('literature')

In [None]:
# SORT ROWS (GENES) BY PATHWAYS

# Number of distinct samples (COMPARISON) per (PATHWAY, SYMBOL) pair.
left_annotator_count = df_all_pry[['PATHWAY', 'SYMBOL', 'COMPARISON']].drop_duplicates().groupby(['PATHWAY','SYMBOL']).count()
left_annotator_count.rename(columns={'COMPARISON':'COUNT'}, inplace=True)

# Pathways in order of first appearance once the genes are sorted by
# decreasing COUNT, i.e. ranked by their most frequently mutated gene.
order_pathways = left_annotator_count.sort_values('COUNT', ascending=False).reset_index()['PATHWAY'].unique().tolist()

# One row per (PATHWAY, SYMBOL) pair, annotated with its COUNT.
left_annotator = df_all_pry[['PATHWAY', 'SYMBOL']].drop_duplicates(keep='first')

left_annotator = left_annotator.merge(left_annotator_count, on=['PATHWAY', 'SYMBOL'])

grps_path = left_annotator.groupby('PATHWAY')

# Genes sorted by decreasing COUNT inside every pathway, pathways in the
# order computed above. Pathways without a group are skipped explicitly, and
# the pieces are combined with one pd.concat because DataFrame.append was
# removed in pandas 2.0.
sorted_pathways = [
    grps_path.get_group(pathway).sort_values('COUNT', ascending=False)
    for pathway in order_pathways
    if pathway in grps_path.groups
]

if sorted_pathways:
    left_annotator = pd.concat(sorted_pathways, ignore_index=True, sort=False)
else:
    # no genes at all: keep an empty frame that still has all the columns
    left_annotator = left_annotator.iloc[0:0]

In [None]:
# Create information for stacked barplot on the right:
# one row per (SYMBOL, SUBTYPE_LABEL) with the number of distinct patients
# in the 'NUM PATIENTS' column.
subset_for_count = (
    df_all_pry[['PATIENT','SUBTYPE_LABEL','SYMBOL']]
    .drop_duplicates(keep='first')
    .groupby(by=['SYMBOL', 'SUBTYPE_LABEL'])
    .count()
    .rename(columns={'PATIENT':'NUM PATIENTS'})
    .reset_index()
)

In [None]:
# Output plot name: path handed to plt.savefig when the figure is written.
output_plot = "table_driver_mutations_primary_ALL_pathways.svg"

In [None]:
## MAKE FIGURE

# make subset: only the columns needed for drawing, grouped by gene symbol so
# that the rows of one gene can be fetched with get_group
df_subset = df_all[['COMPARISON','AGE_RANGE','SYMBOL','SUBTYPE_LABEL', 'SUBTYPE', 'PATHWAY']]
grps_drivers = df_subset.groupby("SYMBOL")

## Define figure
fig = plt.figure(figsize=(22,250))

# Three-column grid: column 0 holds the pathway names, column 1 the
# sample-by-gene cells (its width ratio is the number of samples) and
# column 2 the stacked patient-count bars. Rows 0 and 1 are header tracks;
# gene rows start at row 2.
# NOTE(review): nrows is the number of rows of df_subset plus 2, not the
# number of genes that get drawn, so the grid looks larger than the rows
# actually used -- changing it would alter the row heights; confirm intent.
outer = gridspec.GridSpec(ncols=3, nrows=len(df_subset['SYMBOL'].tolist())+2, hspace=0.2,wspace=0.08, 
                           width_ratios=[5,len(samples),15], figure=fig)

## add pathway grid

# Header cell of the pathway column (row 1, column 0).
ax_grid_p0 = fig.add_subplot(outer[1,0])
ax_grid_p0.set_ylabel('PATHWAYS', rotation=0, labelpad=1, ha='right', va='center',
                     fontsize=12, fontweight='bold')
ax_grid_p0.set_xticks([])
ax_grid_p0.set_yticks([])
ax_grid_p0.tick_params(top=False, bottom=False, left=False, right=False)
for side in ('top', 'right', 'left'):
    ax_grid_p0.spines[side].set_visible(False)

# One borderless cell per gene row, showing the pathway of that gene.
# ax_grid_p stays bound to the last cell after the loop; later sections
# share their y axis with it.
for row_idx, pathway_name in enumerate(left_annotator['PATHWAY'].tolist(), start=2):
    ax_grid_p = fig.add_subplot(outer[row_idx,0], sharex=ax_grid_p0)
    ax_grid_p.set_xticks([])
    ax_grid_p.set_yticks([])
    for side in ('top', 'right', 'bottom', 'left'):
        ax_grid_p.spines[side].set_visible(False)
    ax_grid_p.text(x=0.5, y=0.2, s=pathway_name, fontsize=12, va='bottom', ha='right',
                   fontweight='bold')
    
## add cohort label grid

# Size of every coloured cell; also used by the grids drawn further down.
width = 1
height = 2

# Top track (row 0, column 1): one cell per sample coloured by subtype label,
# with the sample names written above it.
ax_grid_0 = fig.add_subplot(outer[0,1], sharey=ax_grid_p)
ax_grid_0.set_ylabel('ALL SUBTYPES', rotation=0, labelpad=1, ha='right', va='center',
                     fontsize=12, fontweight='bold')
ax_grid_0.set_yticks([])
ax_grid_0.set_xticks(range(len(samples)))
ax_grid_0.set_xticklabels(samples, fontsize=8, color='#252525', rotation=90,
                          ha='center', va='bottom')
ax_grid_0.tick_params(top=False, bottom=False, left=False, right=False)
ax_grid_0.tick_params(axis='x', top=False, bottom=False, left=False, pad=0, right=False,
                      labelbottom=False, labeltop=True)
ax_grid_0.tick_params(axis='y', top=False, bottom=False, left=False, right=False)
for side in ('top', 'right', 'left'):
    ax_grid_0.spines[side].set_visible(False)

for col_idx, sample_id in enumerate(samples):
    df_pat = df_subset[df_subset["COMPARISON"] == sample_id].reset_index(drop=True)
    try:
        # the first record of the sample decides the colour of its cell
        subtype_cell = Rectangle(xy=(col_idx-0.5, 0), width=width, height=height,
                                 linewidth=0.5,
                                 color=COLORS_SUBTYPES[df_pat.loc[0, 'SUBTYPE_LABEL']],
                                 fill=True)
        ax_grid_0.add_patch(subtype_cell)
    except KeyError:
        # no record or no colour for this sample: show its rows and move on
        print(df_pat)

## add age range grid

# Second track (row 1, column 1): one cell per sample coloured by age range.
ax_grid_1 = fig.add_subplot(outer[1,1], sharex=ax_grid_0)
ax_grid_1.set_ylabel('AGE RANGES', rotation=0, labelpad=1, ha='right', va='center',
                     fontsize=12, fontweight='bold')
ax_grid_1.set_yticks([])
ax_grid_1.set_xticks(range(len(samples)))
ax_grid_1.xaxis.set_visible(False)
ax_grid_1.tick_params(top=False, bottom=False, left=False, right=False)
for side in ('top', 'right', 'left'):
    ax_grid_1.spines[side].set_visible(False)

for col_idx, sample_id in enumerate(samples):
    df_pat = df_subset[df_subset["COMPARISON"] == sample_id].reset_index(drop=True)
    try:
        # the first record of the sample decides the colour of its cell
        age_cell = Rectangle(xy=(col_idx-0.5, 0), width=width, height=height,
                             linewidth=0.5,
                             color=COLORS_AGES_TALL[df_pat.loc[0, 'AGE_RANGE']],
                             fill=True)
        ax_grid_1.add_patch(age_cell)
    except KeyError:
        # no record or no colour for this sample: show its rows and move on
        print(df_pat)
        
## add gene symbol grids

# One row per gene (starting at grid row 2). A sample gets a dark cell when
# the SUBTYPE of its first record for the gene is among the gene's entries in
# dicc_drivers, and a light grey cell when the gene is mutated but that
# SUBTYPE is not listed. Samples without a record for the gene stay empty.
for j,gene in enumerate(left_annotator['SYMBOL'].tolist()):

    ax_grid = fig.add_subplot(outer[j+2,1], sharex=ax_grid_0)
    ax_grid.set_ylabel(gene, rotation=0, labelpad=10, ha='right', va='center',
                      fontsize=12,fontstyle='italic')
    ax_grid.set_yticks([])
    ax_grid.set_xticks(range(len(samples)))
    ax_grid.xaxis.set_visible(False)
    ax_grid.spines['top'].set_visible(False)
    ax_grid.spines['right'].set_visible(False)
    ax_grid.spines['bottom'].set_visible(False)
    ax_grid.spines['left'].set_visible(False)

    # Per-gene work is done once here instead of once per sample: fetch the
    # rows of the gene and remember the SUBTYPE of the first row of each sample.
    df_subset_gene = grps_drivers.get_group(gene)
    first_subtype = {}
    for sample_id, subtype in zip(df_subset_gene['COMPARISON'], df_subset_gene['SUBTYPE']):
        first_subtype.setdefault(sample_id, subtype)
    driver_cohorts = dicc_drivers[gene]

    for i,sam in enumerate(samples):
        if sam not in first_subtype:
            continue
        cell_color = '#252525' if first_subtype[sam] in driver_cohorts else '#bdbdbd'
        ax_grid.add_patch(Rectangle(xy=(i-0.5, 0),
                                    width=width, height=height, linewidth=0.5,
                                    color=cell_color, fill=True))
## add stack barplot grid with counts

# Header cell of the count column (row 1, column 2).
ax_grid_c0 = fig.add_subplot(outer[1,2], sharey=ax_grid_p)
ax_grid_c0.text(x=0, y=0, s='NUM.PATIENTS', fontsize=12, fontweight='bold',
                ha='left', va='center')
ax_grid_c0.set_yticks([])
ax_grid_c0.set_xticks([])
for side in ('top', 'right', 'bottom'):
    ax_grid_c0.spines[side].set_visible(False)

# One stacked bar per gene row: patients per subtype label, stacked in the
# order of order_subtypes.
for row_idx, gene in enumerate(left_annotator['SYMBOL'].tolist(), start=2):

    gene_counts = subset_for_count[subset_for_count['SYMBOL'] == gene]

    ax_grid_c = fig.add_subplot(outer[row_idx,2], sharex=ax_grid_c0)
    ax_grid_c = stack_barplot(gene_counts, ax_grid_c, order_subtypes)
    ax_grid_c.set_xticks([])
    ax_grid_c.set_yticks([])
    for side in ('top', 'right', 'bottom'):
        ax_grid_c.spines[side].set_visible(False)

# Write the figure to output_plot (tight bounding box, dpi=300), then display it.
plt.savefig(output_plot, bbox_inches='tight', dpi=300)
plt.show()

### Chi-square

In [None]:
# One row per distinct (PATIENT, SUBTYPE_LABEL, TYPE) among primary-stage records.
df_info = df_all.loc[df_all['STAGE'] == 'primary',
                     ['PATIENT', 'SUBTYPE_LABEL', 'TYPE']].drop_duplicates()
df_info.head()

In [None]:
# Define cancer gene of interest
cancer_gene = 'PHF6'

# Distinct (SYMBOL, PATIENT) pairs for the gene of interest.
mutated_patients = df_all.loc[df_all['SYMBOL'] == cancer_gene, ['SYMBOL', 'PATIENT']].drop_duplicates()

# The left merge keeps every patient of df_info; patients without a record
# for the gene end up with a missing SYMBOL.
test_gene = df_info.merge(mutated_patients, how='left', on='PATIENT')

# Mutated is True exactly where the merge found a record for the gene
# (vectorised replacement for the row-wise type check).
test_gene['Mutated'] = test_gene['SYMBOL'].notna()

In [None]:
# Preview the per-patient table with the Mutated flag.
test_gene.head()

In [None]:
# create contingency table: rows are the Mutated flag, columns are the TYPE
# values, and each cell counts the rows of test_gene in that combination
contingency_table = pd.crosstab(index=test_gene['Mutated'], columns=test_gene['TYPE'])
contingency_table

In [None]:
# Chi-square test of independence on the contingency table.
# correction=False turns off the continuity correction of chi2_contingency.
# The call returns the statistic, the p-value, the degrees of freedom and the
# expected frequencies; only the p-value is displayed here.
stat_chi2, p, dof, expected = chi2_contingency(contingency_table, correction=False)
p

In [None]:
# p-value in scientific notation with two decimals. The value is formatted
# directly: %E formatting converts its argument to float anyway, so wrapping
# it in Decimal changed nothing.
print('{:.2E}'.format(p))

# p-value in fixed notation with three decimals
output = "{:.3f}".format(p)
print(output)

In [None]:
# interpret test-statistic: compare it with the chi-square critical value
# at probability 0.90 for the degrees of freedom of the test
prob = 0.90
critical = chi2.ppf(prob, dof)
print('Dependent (reject H0)' if abs(stat_chi2) >= critical
      else 'Independent (fail to reject H0)')

In [None]:
# interpret test-statistic: compare it with the chi-square critical value
# at probability 0.95 for the degrees of freedom of the test
prob = 0.95
critical = chi2.ppf(prob, dof)
print('Dependent (reject H0)' if abs(stat_chi2) >= critical
      else 'Independent (fail to reject H0)')