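This notebook creates Figure 1c: a horizontal bar plot of the mean mutational-signature exposures (with their standard deviation) for each ALL subtype.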

In [None]:
import pandas as pd
import numpy as np
import pybedtools
import gzip
from io import StringIO
import os
import glob

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from aux_data_in_pyvar import config_rcparams, COLORS_SIGNATURES

# Display options: show every column and every row, and never truncate cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
try:
    # `None` is the supported way to disable truncation (pandas >= 1.0);
    # the former sentinel -1 is deprecated and rejected by recent releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here, where -1 meant "no limit".
    pd.set_option('display.max_colwidth', -1)

In [None]:
# Apply the project-wide plotting configuration (helper imported from aux_data_in_pyvar).
# NOTE(review): presumably sets matplotlib rcParams — confirm in aux_data_in_pyvar.
config_rcparams()

In [None]:
# Container for the signature weights of every cohort; it is populated by the fitting cell below.
df_signatures = pd.DataFrame()

In [None]:
# Get age ranges and ALL subtypes from clinical data
clinical_raw = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')

# PAT3, PAT4 primary samples are damaged
damaged_comparisons = ['AE6518_vs_AE6519', 'AE6521_vs_AE6522']

# those lack some clinical information
black_list = ['SJBALL021893_D1_vs_SJBALL021893_G1', 'SJBALL021894_D1_vs_SJBALL021894_G1',
              'SJBALL021895_D1_vs_SJBALL021895_G1', 'SJBALL021896_D1_vs_SJBALL021896_G1',
              'SJBALL021897_D1_vs_SJBALL021897_G1', 'SJTALL014_D_vs_SJTALL014_G',
              'SJPHALL020041_D1_vs_SJPHALL020041_G1']

# Keep only usable primary samples. All filters are combined in a single mask so
# the frame is sliced once.
usable_primary = (
    ~clinical_raw['COMPARISON'].isin(damaged_comparisons + black_list)
    & (clinical_raw['STAGE'] == 'primary')
)

# .copy() makes the filtered frame independent of `clinical_raw`, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
clinical = clinical_raw.loc[usable_primary].copy()

# Literal (non-regex) relabelling of the subtype used in the figure.
clinical['SUBTYPE_LABEL'] = clinical['SUBTYPE_LABEL'].str.replace("PHALL", 'Ph positive', regex=False)
clinical.head()

In [None]:
## PEDIATRIC FITTING

# Path to the results from the signature fitting of the deconstructSigs run. Here we expect the results
# from all primary samples per cohort from the pediatric projects that we have downloaded.
# ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb explains how to obtain
# the signature weights for the primary mutations of all the pediatric cohorts.
pediatric_paths = "run_all_primary_pediatric/*/signatures_weight.csv" # *asterisk to catch all the pediatric cohort folders

# Clinical annotation attached to every fitted sample (primary samples only).
clinical_cols = ['PATIENT', 'COMPARISON', 'SUBTYPE', 'SUBTYPE_LABEL']
clinical_primary = clinical.loc[clinical['STAGE'] == 'primary', clinical_cols]

# Sort the matches so the row order does not depend on the file system.
pediatric_files = sorted(glob.glob(pediatric_paths))
if not pediatric_files:
    raise FileNotFoundError(
        "No pediatric signature weights found matching {!r}".format(pediatric_paths)
    )

# Read every cohort and concatenate once. The frame is rebuilt from scratch on
# every execution, so re-running this cell does not duplicate rows.
df_pediatric = pd.concat(
    [pd.read_csv(weights_file, sep='\t') for weights_file in pediatric_files],
    ignore_index=True,
)
df_pediatric = df_pediatric.merge(clinical_primary, how='left', left_on='sample_id',
                                  right_on='PATIENT')
# Samples without clinical annotation (no SUBTYPE after the left merge) are discarded.
df_pediatric = df_pediatric.dropna(subset=['SUBTYPE'])

## ADULT FITTING
# all primary leukemias

# Path to the results from the signature fitting of the deconstructSigs run. Here we expect the results
# from all primary samples of the adult T-ALL cohort. According to
# ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb it should correspond to a path
# pointing to a run with a folder named run_all_primary/
adult_weights_path = ""  # must be filled in before running this cell
if not adult_weights_path:
    raise FileNotFoundError(
        "Set `adult_weights_path` to the signature weights file of the adult run (run_all_primary/)"
    )
df_w = pd.read_csv(adult_weights_path, sep='\t')

df_w = df_w.merge(clinical_primary, how='left', left_on='sample_id', right_on='PATIENT')
df_w = df_w.dropna(subset=['SUBTYPE'])

# append both; a signature not fitted in one of the cohorts is missing there and becomes 0
df_signatures = pd.concat([df_pediatric, df_w], ignore_index=True)
df_signatures = df_signatures.fillna(0)

In [None]:
# Per-subtype mean and standard deviation of the exposures, stored as nested dicts for the plot

all_sig = ["SBS1", "SBS2","SBS5", "SBS6","SBS9", "SBS13", "SBS17a", "SBS17b","SBS18","SBS34","SBS36", "SBS37"]

# Signatures of interest that are present in the fitted table, followed by the
# 'unknown' column and the grouping key.
sub_sig = [sig for sig in all_sig if sig in df_signatures.columns] + ['unknown', 'SUBTYPE_LABEL']

exposures_by_subtype = df_signatures[sub_sig].groupby('SUBTYPE_LABEL')

# After transposing: rows are signatures, columns are subtypes.
mean_exposures = exposures_by_subtype.mean().rename(columns={'unknown': 'unassigned'}).T
mean_exposures.index.name = None
dicc_expo = mean_exposures.to_dict()  # {subtype: {signature: mean exposure}}

sd_exposures = exposures_by_subtype.std().rename(columns={'unknown': 'unassigned'}).T
sd_exposures.index.name = None
dicc_sd = sd_exposures.to_dict()  # {subtype: {signature: standard deviation}}

# Count of unique (subtype, comparison) pairs per subtype.
subtype_comparisons = df_signatures[['SUBTYPE_LABEL', 'COMPARISON']].drop_duplicates()
dicc_pats = subtype_comparisons.groupby('SUBTYPE_LABEL')['COMPARISON'].count().to_dict()

In [None]:
# Custom orders used by the figure

# Order in which the signatures are laid out along the bar axis of every panel.
order_sig = [
    'unassigned',
    'SBS6',
    'SBS36',
    'SBS34',
    'SBS17b',
    'SBS17a',
    'SBS13',
    'SBS37',
    'SBS9',
    'SBS2',
    'SBS18',
    'SBS1',
    'SBS5',
]

# Order of the panels, one per subtype (same as in mutation burden).
order_subtype = [
    'DUX4-ERG', 'Ph-like', 'Hypodiploid',
    'Infant MLL-R', 'TALL Pediatric', 'Hyperdiploid',
    'TALL Adult', 'iAMP21', 'Ph positive',
]

# Output path for the figure
out_path = ""

In [None]:
# One panel of horizontal bars per subtype; all panels share the signature (y) axis.
# squeeze=False always returns a 2-D array of axes, so indexing also works when
# there is a single subtype.
fig, axs = plt.subplots(1, len(order_subtype), figsize=(15, 4), frameon=False,
                        gridspec_kw={'wspace': 0.1, 'hspace': 0.5}, sharey=True,
                        squeeze=False)
axs = axs[0]

labels = []
for ax, sub in zip(axs, order_subtype):
    mean_by_sig = dicc_expo[sub]
    sd_by_sig = dicc_sd[sub]

    # Only signatures that have a mean, a standard deviation and a colour are drawn.
    # The membership test replaces a blanket `except KeyError`, which would also
    # have hidden any unrelated KeyError raised while drawing.
    labels = [sig for sig in order_sig
              if sig in mean_by_sig and sig in sd_by_sig and sig in COLORS_SIGNATURES]

    # Bars are stacked at consecutive integer positions, following `order_sig`.
    for position, sig in enumerate(labels):
        ax.barh(y=position, width=mean_by_sig[sig], color=COLORS_SIGNATURES[sig],
                xerr=sd_by_sig[sig])

    ax.tick_params(axis='x', which='major', labelsize=12)
    ax.set_xlabel(sub, fontsize=14)

# The y axis is shared, so the signature names are only set on the first panel.
axs[0].set_yticks(range(len(labels)))
axs[0].set_yticklabels(labels, fontsize=14)
axs[0].set_ylabel('Mean of exposures', fontsize=16)


fig.savefig(os.path.join(out_path,"barplot_signatures_subtypes_all.svg"), dpi=300, bbox_inches='tight')