This notebook produces a table figure showing the protein-affecting mutations in driver genes of the T-ALL cohorts. The figures generated here correspond to Figure 2B and Additional file 1 Figure S4 of the paper.

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
from collections import defaultdict
import functools
from multiprocessing import Pool
import matplotlib.gridspec as gridspec
from matplotlib.patches import Rectangle
import six
from bgreference import hg19
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from aux_data_in_pyvar import config_rcparams,COLORS_AGES_TALL,COLORS_SUBTYPES,COLORS_COHORTS,COLORS_IMMUNOPHENO

# Show complete DataFrames in the notebook output: never truncate columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Unlimited cell width. ``None`` is the supported value from pandas 1.0 on (the
# legacy ``-1`` sentinel is deprecated there and rejected by newer releases);
# pandas < 1.0 only accepts an integer for this option, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [None]:
# Project helper imported from aux_data_in_pyvar; presumably applies the shared
# matplotlib rcParams used by all figures (its definition is not visible here).
config_rcparams()

In [None]:
def plot_combination(combination_subset, coord_x, ax_grid):
    """Draw the marker of one (patient, alteration) cell of the table.

    The marker encodes which mutation categories were found:

    * first wedge blue (``#2c7fb8``)    -> ``'private_primary'`` present
    * second wedge orange (``#fd8d3c``) -> ``'private_relapse'`` present
    * thick green border (``#006837``)  -> ``'shared'`` present
    * a missing category leaves its wedge white; without ``'shared'`` the
      border is thin and white.

    Parameters
    ----------
    combination_subset : set
        Set of category names found for the cell. Any set other than the seven
        combinations listed below (including the empty set) draws nothing.
    coord_x : number
        X position of the marker (the patient column); y is fixed at 1.
    ax_grid : matplotlib axes
        Axes of the alteration row, forwarded to ``drawPieMarker``.
    """
    size_standard = 120

    blue, orange, white, green = '#2c7fb8', '#fd8d3c', '#FFFFFF', '#006837'

    # (categories, wedge ratios, wedge colors, border color, border width)
    marker_styles = [
        ({'shared', 'private_relapse', 'private_primary'}, [0.5, 0.5], [blue, orange], green, 2),
        ({'shared', 'private_relapse'}, [0.5, 0.5], [white, orange], green, 2),
        ({'shared', 'private_primary'}, [0.5, 0.5], [blue, white], green, 2),
        ({'private_primary', 'private_relapse'}, [0.5, 0.5], [blue, orange], white, 0.8),
        ({'shared'}, [1], [white], green, 2),
        ({'private_primary'}, [0.5, 0.5], [blue, white], white, 0.8),
        ({'private_relapse'}, [0.5, 0.5], [white, orange], white, 0.8),
    ]

    for categories, ratios, colors, border_color, border_width in marker_styles:
        if categories == combination_subset:
            # A single point is drawn, so exactly one size is passed:
            # drawPieMarker forwards ``sizes`` as the scatter ``s`` argument,
            # which must be a scalar or have one entry per point.
            drawPieMarker(xs=coord_x,
                          ys=1,
                          ratios=ratios,
                          sizes=[size_standard],
                          colors=colors,
                          border_colors=border_color,
                          border_width=border_width,
                          ax=ax_grid)
            return
    # Any other combination: the cell is intentionally left empty.

    
def drawPieMarker(xs, ys, ratios, sizes, colors, border_colors, border_width, ax):
    """Scatter pie-chart markers assembled from wedge-shaped custom markers.

    Adapted from https://stackoverflow.com/questions/56337732/how-to-plot-scatter-pie-chart-using-matplotlib

    Parameters
    ----------
    xs, ys : scalar or sequence
        Marker position(s), forwarded unchanged to ``ax.scatter``.
    ratios : sequence of float
        Fraction of the full circle covered by each wedge; must sum to <= 1.
    sizes : sequence of number
        Marker size(s) passed as the scatter ``s`` argument after scaling.
    colors : sequence
        Face color of each wedge, paired with ``ratios`` (extra entries of the
        longer sequence are ignored).
    border_colors, border_width
        Edge color and line width applied to every wedge.
    ax : matplotlib axes
        Target axes; its y-limits are set to (0, 2) after drawing.

    Raises
    ------
    AssertionError
        If the ratios sum to more than 1.
    """
    assert sum(ratios) <= 1, 'sum of ratios needs to be <= 1'

    base_sizes = np.array(sizes)
    wedges = []
    start_angle = 0
    # Build one polygon per wedge: centre -> 40 points on the arc -> centre.
    for color, ratio in zip(colors, ratios):
        end_angle = 2 * np.pi * ratio + start_angle
        arc = np.linspace(start_angle, end_angle, 40)
        x = [0] + np.cos(arc).tolist() + [0]
        y = [0] + np.sin(arc).tolist() + [0]
        xy = np.column_stack([x, y])
        start_angle = end_angle
        # Scale the size by the squared extent of the wedge polygon, as in the
        # recipe this function was adapted from.
        wedges.append({'marker': xy, 's': np.abs(xy).max()**2 * base_sizes, 'facecolor': color,
                       'edgecolors': border_colors, 'linewidth': border_width})

    if not wedges:
        # Nothing to draw (no ratios or no colors were given).
        return

    if len(ratios) == 1:
        # A one-slice pie is drawn with the built-in circle marker instead of
        # the wedge polygon; size and color come from that single wedge.
        only = wedges[0]
        ax.scatter(xs, ys, marker='o', s=only['s'], facecolor=only['facecolor'],
                   edgecolors=border_colors, linewidth=border_width)
    else:
        for wedge in wedges:
            ax.scatter(xs, ys, **wedge)
    ax.set_ylim(0, 2)
            
def stack_barplot(df, ax_grid, cohorts, col):
    """Draw one horizontal stacked bar with the patient counts of each cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        Count rows for a single gene/alteration; must provide the columns
        ``'COHORT'`` and ``'NUM PATIENTS'`` (and ``col`` when ``col == 'GENE'``).
        The frame is not modified.
    ax_grid : matplotlib axes
        Axes receiving the bar segments.
    cohorts : iterable of str
        Cohort names in stacking order (left to right). Cohorts without a row
        in ``df`` are skipped. Each name drawn must be a key of the
        module-level ``COLORS_COHORTS`` palette.
    col : str
        When equal to ``'GENE'`` the value of that column in the first row is
        written as the y-axis label.
    """
    # Re-index a copy instead of resetting in place: callers pass a slice of a
    # larger table, which must not be altered as a side effect.
    df = df.reset_index(drop=True)

    left_edge = 0
    for cohort in cohorts:
        cohort_rows = df[df['COHORT'] == cohort].reset_index()
        if cohort_rows.empty:
            continue
        # Only the first row of the cohort is used for the segment width.
        n_patients = cohort_rows.loc[0, 'NUM PATIENTS']
        ax_grid.barh(0, n_patients, color=COLORS_COHORTS[cohort],
                     edgecolor='white', height=1, left=left_edge)
        left_edge = left_edge + n_patients

    if col == 'GENE':
        ax_grid.set_ylabel(df.loc[0, col], rotation='horizontal', va='center')

In [None]:
def make_table_alterations(df, fig_size, output_path, option='B'):
    """Draw the patients x alterations table figure, save it and show it.

    Layout (a 3-column GridSpec):

    * column 0: pathway name of every alteration row;
    * column 1: three annotation strips (immunophenotype, cohort, age range)
      on grid rows 0-2, followed by one row per alteration in which every
      patient gets a marker drawn by ``plot_combination``;
    * column 2: a stacked bar per gene/alteration with the number of patients
      per cohort, drawn by ``stack_barplot``.

    Besides its arguments, the function reads the notebook-level variables
    ``left_annotator`` (row order and pathway labels), ``counter_gene``
    (patient counts) and ``order_cohorts`` (stacking order); they must be
    defined before the call.

    Parameters
    ----------
    df : pandas.DataFrame
        Driver alterations with the columns PATIENT, subset, AGE_RANGE, SYMBOL,
        COHORT, PATHWAY, IMMUNOPHENOTYPE and GENE. The order in which patients
        first appear defines the order of the columns in the figure.
    fig_size : tuple
        Figure size passed to ``plt.figure``.
    output_path : str
        File the figure is written to (300 dpi, tight bounding box).
    option : str
        ``'A'`` counts patients per GENE, any other value per SYMBOL.
    """
    ## MAKE FIGURE

    # Size of the coloured rectangles used in the annotation strips
    
    width = 1
    height = 2

    # make subset
    df_subset = df[['PATIENT', 'subset', 'AGE_RANGE','SYMBOL','COHORT', 'PATHWAY', 'IMMUNOPHENOTYPE', 'GENE']]
    # unique patients, in order of first appearance
    patients = df_subset[['PATIENT']].drop_duplicates()['PATIENT'].tolist()
    grps_drivers = df_subset.groupby("SYMBOL")

    ## Define figure
    fig = plt.figure(figsize=fig_size)

    # NOTE(review): nrows is the number of records in df_subset (+3), not the
    # number of distinct alterations, so the grid has more rows than are
    # filled below -- confirm this is intended before changing it, since it
    # determines the row height of the saved figure.
    outer = gridspec.GridSpec(ncols=3, nrows=len(df_subset['SYMBOL'].tolist())+3, hspace=0.05,wspace=0.08, 
                               width_ratios=[7,len(patients),10], figure=fig)

    ## add pathway grid

    # header cell of the pathway column
    ax_grid_p0 = fig.add_subplot(outer[1,0])
    ax_grid_p0.set_ylabel('PATHWAYS', rotation=0, labelpad=2, ha='right', va='center',
                         fontsize=12, fontweight='bold')
    ax_grid_p0.set_yticks([])
    ax_grid_p0.set_xticks([])
    ax_grid_p0.tick_params(top=False, bottom=False, left=False, right=False)
    ax_grid_p0.spines['top'].set_visible(False)
    ax_grid_p0.spines['right'].set_visible(False)
    ax_grid_p0.spines['left'].set_visible(False)

    # one frameless text cell per row of the global left_annotator
    for k,p in enumerate(left_annotator['PATHWAY'].tolist()):
        ax_grid_p = fig.add_subplot(outer[k+3,0], sharex=ax_grid_p0)
        ax_grid_p.set_yticks([])
        ax_grid_p.set_xticks([])
        ax_grid_p.spines['top'].set_visible(False)
        ax_grid_p.spines['right'].set_visible(False)
        ax_grid_p.spines['bottom'].set_visible(False)
        ax_grid_p.spines['left'].set_visible(False)
        ax_grid_p.text(x=0.5, y=0.2,s=p, fontsize=12, va='bottom',ha='right', fontweight='bold')
    
    ## add immunopheno label grid

    # top strip; it also carries the patient names as tick labels above the plot
    ax_grid_extra = fig.add_subplot(outer[0,1])
    ax_grid_extra.set_ylabel('IMMUNOPHENO', rotation=0, labelpad=1, ha='right', va='center',
                         fontsize=12, fontweight='bold')
    ax_grid_extra.set_yticks([])
    ax_grid_extra.set_xticks(range(len(patients)))
    ax_grid_extra.set_xticklabels(patients,color='#252525', rotation=45, ha='left',va='bottom')
    ax_grid_extra.tick_params(axis='x',top=False, bottom=False, left=False, right=False,labelbottom=False,labeltop=True)
    ax_grid_extra.tick_params(axis='y',top=False, bottom=False, left=False, right=False)
    ax_grid_extra.spines['top'].set_visible(False)
    ax_grid_extra.spines['right'].set_visible(False)
    ax_grid_extra.spines['left'].set_visible(False)

    # one rectangle per patient, coloured by the value in the patient's first row
    for i,pat in enumerate(patients):
            df_pat = df_subset[df_subset["PATIENT"] == pat].reset_index(drop=True)
            try:
                ax_grid_extra.add_patch(Rectangle(xy=(i-0.5, 0) ,
                                       width=width, height=height, linewidth=0.5, 
                                            color=COLORS_IMMUNOPHENO[df_pat.loc[0, 'IMMUNOPHENOTYPE']], 
                                          fill=True))
            except KeyError:
                # value missing from the palette: print the rows, leave the cell empty
                print(df_pat) 

    ## add cohort label grid

    ax_grid_0 = fig.add_subplot(outer[1,1], sharex=ax_grid_extra, sharey=ax_grid_p0)
    ax_grid_0.set_ylabel('ALL SUBTYPES', rotation=0, labelpad=1, ha='right', va='center',
                         fontsize=12, fontweight='bold')
    ax_grid_0.set_yticks([])
    ax_grid_0.set_xticks(range(len(patients)))
    ax_grid_0.tick_params(top=False, bottom=False, left=False, right=False)
    ax_grid_0.spines['top'].set_visible(False)
    ax_grid_0.spines['right'].set_visible(False)
    ax_grid_0.spines['left'].set_visible(False)

    for i,pat in enumerate(patients):
            df_pat = df_subset[df_subset["PATIENT"] == pat].reset_index(drop=True)
            try:
                ax_grid_0.add_patch(Rectangle(xy=(i-0.5, 0) ,
                                       width=width, height=height, linewidth=0.5, 
                                            color=COLORS_COHORTS[df_pat.loc[0, 'COHORT']], 
                                          fill=True))
            except KeyError:
                # cohort missing from the palette: print the rows, leave the cell empty
                print(df_pat)

    ## add age range grid

    ax_grid_1 = fig.add_subplot(outer[2,1], sharex=ax_grid_extra)
    ax_grid_1.set_ylabel('AGE RANGES', rotation=0, labelpad=1, ha='right', va='center',
                         fontsize=12, fontweight='bold')
    ax_grid_1.set_yticks([])
    ax_grid_1.set_xticks(range(len(patients)))
    ax_grid_1.xaxis.set_visible(False)
    ax_grid_1.tick_params(top=False, bottom=False, left=False, right=False)
    ax_grid_1.spines['top'].set_visible(False)
    ax_grid_1.spines['right'].set_visible(False)
    ax_grid_1.spines['left'].set_visible(False)
    for i,pat in enumerate(patients):
        df_pat = df_subset[df_subset["PATIENT"] == pat].reset_index(drop=True)
        try:
            ax_grid_1.add_patch(Rectangle(xy=(i-0.5, 0) ,
                                       width=width, height=height, linewidth=0.5, 
                                            color=COLORS_AGES_TALL[df_pat.loc[0, 'AGE_RANGE']], 
                                          fill=True))
        except KeyError:
            # age range missing from the palette: print the rows, leave the cell empty
            print(df_pat)

    ## add gene symbol grids

    # one row per alteration, in the order given by the global left_annotator
    for j,gene in enumerate(left_annotator['SYMBOL'].tolist()):

        ax_grid = fig.add_subplot(outer[j+3,1], sharex=ax_grid_extra)
        ax_grid.set_ylabel(gene, rotation=0, labelpad=10, ha='right', va='center',
                          fontsize=12,fontstyle='italic')
        ax_grid.set_yticks([])
        ax_grid.set_xticks(range(len(patients)))
        ax_grid.xaxis.set_visible(False)
        ax_grid.spines['top'].set_visible(False)
        ax_grid.spines['bottom'].set_visible(False)
        ax_grid.spines['left'].set_visible(False)
        # grey background on every other row for readability
        if j%2==0:
            ax_grid.set_facecolor("#e0e0e0")
        for i,pat in enumerate(patients):
            # get_group raises KeyError if the alteration is absent from df
            df_subset_gene = grps_drivers.get_group(gene)
            df_pat = df_subset_gene[df_subset_gene["PATIENT"] == pat]
            # set of 'subset' categories this patient has for this alteration
            combination_subset = set(df_pat[df_pat['SYMBOL'] == gene]['subset'].tolist())
            plot_combination(combination_subset, i, ax_grid)

    ## add stack barplot grid with counts 

    # header cell of the counts column
    ax_grid_c0 = fig.add_subplot(outer[2,2],sharey=ax_grid_1)
    ax_grid_c0.text(x=0, y=0,s='NUM.PATIENTS', fontsize=12, fontweight='bold', ha='left', va='bottom')
    ax_grid_c0.set_yticks([])
    ax_grid_c0.set_xticks([])
    ax_grid_c0.spines['top'].set_visible(False)
    ax_grid_c0.spines['right'].set_visible(False)
    ax_grid_c0.spines['bottom'].set_visible(False)
    ax_grid_c0.spines['left'].set_visible(False)

    # column of the global counter_gene table the bars are keyed on
    if option == 'A':
        col = 'GENE'
    else:
        col = 'SYMBOL'

    # bars are placed on consecutive grid rows, one per distinct value of `col`
    for k,gene in enumerate(left_annotator.drop_duplicates(subset=col)[col].tolist()):
        ax_grid_c = fig.add_subplot(outer[k+3,2],sharex=ax_grid_c0)

        ax_grid_c.set_yticks([])
        ax_grid_c.set_xticks([])
        ax_grid_c.spines['top'].set_visible(False)
        ax_grid_c.spines['right'].set_visible(False)
        ax_grid_c.spines['bottom'].set_visible(False)
        ax_grid_c.spines['left'].set_visible(False)

        df_counts = counter_gene[counter_gene[col] == gene]

        stack_barplot(df_counts, ax_grid_c, order_cohorts, col)


    plt.savefig(output_path, bbox_inches='tight', dpi=300)
    plt.show()
    

In [None]:
# Clinical data of the adult cohort: only the immunophenotype is needed here.
# Source: Additional file 2 Table S1 (the path is left empty in the notebook
# and has to be filled in before running).
adult_info = pd.read_csv("", sep='\t')[['Patient_id', 'Primary_immunoclassification']]
# Use the same column names as the driver tables so they can be merged on PATIENT.
adult_info = adult_info.rename(
    columns={'Patient_id': 'PATIENT', 'Primary_immunoclassification': 'IMMUNOPHENOTYPE'}
)

In [None]:
# Load the candidate driver alterations. They are produced by running all the
# processing notebooks and are also provided in Additional file 2 Table S5 and
# Tables S6.
read_tsv = functools.partial(pd.read_csv, sep='\t')

# mutations: output of driver_mutations_TALL.ipynb
candidate_muts_drivers = read_tsv("driver_muts_TALL_subsets.tsv")
# Table S6a
candidate_cnv_drivers = read_tsv("driver_cnv_TALL.tsv")
# Table S6b
candidate_sv_drivers = read_tsv("driver_sv_TALL.tsv")

# NOTE: the additional tables do not contain the mutations from Li et al., 2020
# Blood. To reproduce the same figure they must be added from the supplementary
# material of that paper (PMID: 31697823).

In [None]:
# Give the three tables the same GENE (gene name) and SYMBOL (row label) columns.

def _sv_row_label(row):
    # structural variants are labelled "<symbol> <variant>"
    return row['SYMBOL'] + ' ' + row['Variant']


def _gene_of_cnv_label(label):
    # labels containing " (" keep only the text before the first space;
    # any other label is used unchanged
    return label.split(' ')[0] if " (" in label else label


candidate_muts_drivers['GENE'] = candidate_muts_drivers['SYMBOL']
# GENE must be copied before SYMBOL is overwritten with the longer label
candidate_sv_drivers['GENE'] = candidate_sv_drivers['SYMBOL']
candidate_sv_drivers['SYMBOL'] = candidate_sv_drivers.apply(_sv_row_label, axis=1)
candidate_cnv_drivers['GENE'] = candidate_cnv_drivers['SYMBOL'].apply(_gene_of_cnv_label)

In [None]:
# Manual curation of the candidate drivers: variants of the same event are
# collapsed into a single row label and implausible candidates are removed.

def _merge_labels(labels, needle, merged_label):
    """Replace every label that contains *needle* by *merged_label*."""
    return labels.apply(lambda label: merged_label if needle in label else label)


# CNV labels to collapse, applied in this order.
# NOTE(review): matching is by substring, so e.g. "MYC" would also match a
# label containing "MYCN" -- verify against the labels present in the data.
cnv_symbol_merges = [
    ("9p21", "CDKN2A,CDKN2B 9p21.1-3 (del)"),        # most common alteration CDKN2A and B
    ("CDKN1B 12p13", "CDKN1B 12p13.1-31 (del)"),     # join CDKN1B
    ("7p15.2", "HOXA cluster genes 7p15.2 (amp)"),   # join HOX gene variants
    ("TAL1", "TAL1 1p32.1-3 (del)"),                 # join TAL1 alterations for simplicity
    ("MYC", "MYC 8q24.3-23 (amp)"),                  # join MYC amplifications for simplicity
    ("RB1", "RB1 13q14.2-3 (del)"),                  # join RB1 deletions
    ("PTEN", "PTEN 10q23.2-33 (del)"),               # join PTEN deletions
]
for needle, merged_label in cnv_symbol_merges:
    candidate_cnv_drivers['SYMBOL'] = _merge_labels(candidate_cnv_drivers['SYMBOL'], needle, merged_label)
candidate_cnv_drivers['GENE'] = _merge_labels(candidate_cnv_drivers['GENE'], "CDKN2", "CDKN2A,CDKN2B")

# CNV candidates discarded after literature review
cnv_discarded = [
    # PKN3: deletions have been observed previously
    "PKN3 9q34.11 (amp)",
    # PRDM1 is most likely a tumor suppressor https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3158840/
    "PRDM1 6q21. (amp)",
    # ME1 most likely has a loss-of-function role; truncating deletions are the driver alterations
    "ME1 6q14.2 (amp)",
    # MLLT1 is suggested to have a gain-of-function role https://doi.org/10.1006/bcmd.2002.0525
    "MLLT1 19p13.3 (del)",
    # NOTCH1 amplification hasn't been reported in the literature as a driver event
    "NOTCH1 9q34.3 (amp)",
    # ABL1 is an oncogene https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3935732/
    "ABL1 9q34.12 (del)",
]
candidate_cnv_drivers = candidate_cnv_drivers[~candidate_cnv_drivers['SYMBOL'].isin(cnv_discarded)]
# collapsing labels can leave identical rows behind
candidate_cnv_drivers = candidate_cnv_drivers.drop_duplicates()

# SV labels to collapse (TLX1 and LMO2 alterations joined for simplicity)
sv_symbol_merges = [
    ("TLX1", "TLX1 t(7;10)(q34;q24),t(10;14)(q24;q11),10q(24.31)(amp)"),
    ("LMO2", "LMO2 t(11;14)(p13;q11),11p13(del)"),
]
for needle, merged_label in sv_symbol_merges:
    candidate_sv_drivers['SYMBOL'] = _merge_labels(candidate_sv_drivers['SYMBOL'], needle, merged_label)

# black-listed genes: highly mutated in two cohorts coming from the same
# project, hence suspected false positives
potential_false_positives = ['MSH3', 'MAP3K4']
is_false_positive = candidate_muts_drivers['SYMBOL'].isin(potential_false_positives)
candidate_muts_drivers = candidate_muts_drivers[~is_false_positive]

In [None]:
# Stack mutations, CNVs and SVs into one table of driver alterations.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0, while concat works in every version.
shared_columns = ['SYMBOL', 'PATIENT', 'AGE_RANGE', 'GENE', 'PATHWAY', 'COHORT', 'subset']
driver_alterations = pd.concat(
    [
        table[shared_columns].drop_duplicates()
        for table in (candidate_muts_drivers, candidate_cnv_drivers, candidate_sv_drivers)
    ],
    ignore_index=True,
    sort=False,
)

In [None]:
# Attach the immunophenotype of each patient. Left join: patients that are
# absent from the clinical table keep their rows with a missing value.
driver_alterations = pd.merge(driver_alterations, adult_info, how='left', on='PATIENT')

#### large table

In [None]:
# SORT COLUMNS (PATIENTS) BY COHORT AND AGE RANGE

# left-to-right order of the cohorts in the figure
order_cohorts = ['ADULT TALL AECC PROJECT',
                'PEDIATRIC TALL WXS (Oshima et al., 2016; PNAS)',
                'PEDIATRIC ALL (Li et al., 2019, Blood)']

grps = driver_alterations.groupby("COHORT")

sorted_cohorts = []
for c in order_cohorts:
    if c not in grps.groups:
        # e.g. when the input tables do not include every cohort; the cohort
        # is skipped instead of aborting the cell with a KeyError
        print('Cohort not found in the data, skipped: {}'.format(c))
        continue
    # sort_values returns a new frame, so the group is never modified in place
    sorted_cohorts.append(
        grps.get_group(c).sort_values(by=['AGE_RANGE', 'PATIENT'], ascending=[False, False])
    )

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
driver_alterations = pd.concat(sorted_cohorts, ignore_index=True, sort=False)

# list of patients in their final (sorted) order
patients = driver_alterations[['PATIENT']].drop_duplicates(keep='first')['PATIENT'].tolist()

In [None]:
# Number of distinct patients with an alteration in each gene (all cohorts pooled)
counter_filt = (
    driver_alterations[['GENE', 'PATIENT']]
    .drop_duplicates()
    .groupby(['GENE'])
    .count()
    .reset_index()
    .rename(columns={'PATIENT': 'COUNT_GENE'})
)

# genes before / after requiring more than one altered patient
print(len(counter_filt))
counter_filt = counter_filt[counter_filt['COUNT_GENE'].gt(1)]
print(len(counter_filt))

# keep only the alterations of genes altered in more than one patient
is_recurrent = driver_alterations['GENE'].isin(counter_filt['GENE'])
driver_alterations = driver_alterations[is_recurrent]

In [None]:
## Patients per alteration (SYMBOL) and cohort; read by make_table_alterations
option = 'B'
counter_gene = (
    driver_alterations[['SYMBOL', 'PATIENT', 'COHORT']]
    .drop_duplicates()
    .groupby(['SYMBOL', 'COHORT'])
    .count()
    .reset_index()
    .rename(columns={'PATIENT': 'NUM PATIENTS'})
)
counter_gene.head()

In [None]:
# Number of alteration records per pathway and cohort, largest first (display only)
pathway_counts = (
    driver_alterations[['SYMBOL', 'PATIENT', 'PATHWAY', 'COHORT']]
    .groupby(['PATHWAY', 'COHORT'])
    .count()
)
pathway_counts.sort_values(by=['SYMBOL', 'PATIENT'], ascending=False)

In [None]:
# SORT ROWS (GENES) BY PATHWAYS

# custom order of pathways; a pathway that is not listed here gets no row
order_pathways = ['Notch signaling pathway','Cell cycle', 'Ras Pathway','Pyrimidine/purine metabolism', 
                 'Chromatin histone modifiers','Chromatin other',  'JAK/STAT signaling pathway', 
                  'Wnt signaling pathway','Epigenetics DNA modifiers','p53 pathway','Transcription factor',
                  'NF-KappaB signaling','PI3 kinase pathway','Ubiquitin proteasome pathway',
                  'Gonadotropin-releasing hormone receptor pathway',
                  'ATP-binding cassette (ABC) transporter superfamily',
                  'PKA Signaling', 'Pyruvate metabolism',
                  'Apoptosis signaling pathway','Transcriptional misregulation in cancer',
                  'Transcriptional regulation of white adipocyte differentiation','Integrin signalling pathway', 'Metabolism',
                  'Genome integrity', 'Splicing', 
                  'Signaling by ROBO receptors', 'Inhibitor of DNA binding','RNA abundance',
                  'Interleukin signaling pathway','Regulation of hematopoietic stem cell differentiation','other']
print(len(order_pathways))

# number of distinct patients per (pathway, gene)
left_annotator_count = driver_alterations[['PATHWAY', 'GENE', 'PATIENT']].drop_duplicates().groupby(['PATHWAY','GENE']).count()
left_annotator_count = left_annotator_count.rename(columns={'PATIENT':'COUNT'})

# one row per alteration (SYMBOL), annotated with the patient count of its gene
left_annotator = driver_alterations[['PATHWAY', 'GENE','SYMBOL']].drop_duplicates(keep='first')
left_annotator = left_annotator.merge(left_annotator_count, on=['PATHWAY', 'GENE'])

# Pathways in the custom order; within a pathway, genes by decreasing patient
# count. Pathways without any alteration are skipped explicitly instead of
# catching KeyError, and pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
grps_path = left_annotator.groupby('PATHWAY')
ordered_pathway_rows = [
    grps_path.get_group(g).sort_values(['COUNT', 'GENE'], ascending=[False, False])
    for g in order_pathways
    if g in grps_path.groups
]
if ordered_pathway_rows:
    left_annotator = pd.concat(ordered_pathway_rows, ignore_index=True, sort=False)
else:
    left_annotator = pd.DataFrame()

In [None]:
# Large table with every recurrently altered gene
figure_size = (22,180)
out_fig = "table_driver_alterations_TALL_big.svg"

make_table_alterations(df=driver_alterations, fig_size=figure_size,
                       output_path=out_fig, option=option)

#### short table for main figure

In [None]:
# Keep the genes that are altered in at least two patients of the adult cohort
adult_alterations = driver_alterations[driver_alterations['COHORT'] == 'ADULT TALL AECC PROJECT']
counter_filt = (
    adult_alterations[['GENE', 'PATIENT']]
    .drop_duplicates()
    .groupby(['GENE'])
    .count()
    .reset_index()
    .rename(columns={'PATIENT': 'COUNT_GENE'})
)

# genes before / after the filter
print(len(counter_filt))
counter_filt = counter_filt[counter_filt['COUNT_GENE'].gt(1)]
print(len(counter_filt))

in_adult_recurrent = driver_alterations['GENE'].isin(counter_filt['GENE'])
driver_alterations = driver_alterations[in_adult_recurrent]

In [None]:
## Patients per alteration (SYMBOL) and cohort, recomputed for the filtered table
option = 'B'
counter_gene = (
    driver_alterations[['SYMBOL', 'PATIENT', 'COHORT']]
    .drop_duplicates()
    .groupby(['SYMBOL', 'COHORT'])
    .count()
    .reset_index()
    .rename(columns={'PATIENT': 'NUM PATIENTS'})
)
counter_gene.head()

In [None]:
# SORT ROWS (GENES) BY PATHWAYS

# custom order of pathways for the short table; a pathway that is not listed
# here gets no row
order_pathways = ['Notch signaling pathway','Cell cycle', 'Ras Pathway','Pyrimidine/purine metabolism', 
                 'Chromatin histone modifiers','Chromatin other',  'JAK/STAT signaling pathway', 
                  'Wnt signaling pathway','Epigenetics DNA modifiers','p53 pathway','Transcription factor',
                  'NF-KappaB signaling','PI3 kinase pathway','Ubiquitin proteasome pathway',
                  'Gonadotropin-releasing hormone receptor pathway',
                  'ATP-binding cassette (ABC) transporter superfamily',
                  'PKA Signaling','Apoptosis signaling pathway','Transcriptional misregulation in cancer',
                  'Transcriptional regulation of white adipocyte differentiation','Inhibitor of DNA binding', 
                  'Metabolism','Integrin signalling pathway','Genome integrity', 'Splicing', 
                  'Signaling by ROBO receptors', 'Pyruvate metabolism','RNA abundance',
                  'Interleukin signaling pathway','Regulation of hematopoietic stem cell differentiation','other']
# printed (as in the large-table cell); a bare expression in the middle of a
# cell is never displayed
print(len(order_pathways))

# number of distinct patients per (pathway, gene)
left_annotator_count = driver_alterations[['PATHWAY', 'GENE', 'PATIENT']].drop_duplicates().groupby(['PATHWAY','GENE']).count()
left_annotator_count = left_annotator_count.rename(columns={'PATIENT':'COUNT'})

# one row per alteration (SYMBOL), annotated with the patient count of its gene
left_annotator = driver_alterations[['PATHWAY', 'GENE','SYMBOL']].drop_duplicates(keep='first')
left_annotator = left_annotator.merge(left_annotator_count, on=['PATHWAY', 'GENE'])

# Pathways in the custom order; within a pathway, genes by decreasing patient
# count. Pathways without any alteration are skipped explicitly instead of
# catching KeyError, and pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
grps_path = left_annotator.groupby('PATHWAY')
ordered_pathway_rows = [
    grps_path.get_group(g).sort_values(['COUNT', 'GENE'], ascending=[False, False])
    for g in order_pathways
    if g in grps_path.groups
]
if ordered_pathway_rows:
    left_annotator = pd.concat(ordered_pathway_rows, ignore_index=True, sort=False)
else:
    left_annotator = pd.DataFrame()

In [None]:
# Short table restricted to the genes recurrent in the adult cohort
figure_size = (22,80)
out_fig = "table_driver_alterations_TALL_small.svg"

make_table_alterations(df=driver_alterations, fig_size=figure_size,
                       output_path=out_fig, option=option)

### chi square 

In [None]:
# Driver mutations and one row per (patient, subtype) for the chi-square test
df_all = pd.read_csv("driver_muts_TALL_subsets.tsv", sep='\t')
df_info = df_all[['PATIENT', 'SUBTYPE']].drop_duplicates()


def _age_cohort(row):
    # subtypes whose name contains 'Adult' are adult, every other one pediatric
    if 'Adult' in row['SUBTYPE']:
        return 'adult'
    return 'pediatric'


df_info['AGE_COHORT'] = df_info.apply(_age_cohort, axis=1)

In [None]:
# Cancer gene of interest
cancer_gene = 'PHF6'

# patients carrying at least one mutation in the gene (one row per patient)
gene_carriers = df_all.loc[df_all['SYMBOL'] == cancer_gene, ['SYMBOL', 'PATIENT']].drop_duplicates()

# left join: SYMBOL stays missing for the patients without a mutation
test_gene = df_info.merge(gene_carriers, how='left', on='PATIENT')
test_gene['Mutated'] = test_gene.apply(lambda row: isinstance(row['SYMBOL'], str), axis=1)

In [None]:
# Preview the first rows of the per-patient mutation flags (display only).
test_gene.head()

In [None]:
# Contingency table: rows = mutated (False/True), columns = age cohort
mutated_flag = test_gene['Mutated']
age_cohort = test_gene['AGE_COHORT']
contingency_table = pd.crosstab(index=mutated_flag, columns=age_cohort)
contingency_table

In [None]:
# Chi-square test of independence, with correction=False as in the original analysis
chi2_result = chi2_contingency(contingency_table, correction=False)
stat_chi2, p, dof, expected = chi2_result
p

In [None]:
from decimal import Decimal

# p-value in scientific notation with two decimals ...
p_scientific = '%.2E' % Decimal(p)
print(p_scientific)

# ... and as a fixed-point number with three decimals
output = format(p, '.3f')
print(output)

In [None]:
# Interpret the test statistic against the critical value at 90% confidence
prob = 0.90
critical = chi2.ppf(prob, dof)
verdict = ('Dependent (reject H0)' if abs(stat_chi2) >= critical
           else 'Independent (fail to reject H0)')
print(verdict)

In [None]:
# Interpret the test statistic against the critical value at 95% confidence
prob = 0.95
critical = chi2.ppf(prob, dof)
verdict = ('Dependent (reject H0)' if abs(stat_chi2) >= critical
           else 'Independent (fail to reject H0)')
print(verdict)