This notebook performs a dimensionality reduction with UMAP. It takes the 96 mutation types (channels) and projects them onto 2 dimensions (components) to see how patients cluster by mutation profile.

Only primary samples are used

The dot plots correspond to Figure 1b of the paper

In [None]:
import os
import pandas as pd
import numpy as np
import glob
from io import StringIO

import umap
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from collections import OrderedDict
from aux_data_in_pyvar import CHANNELS,config_rcparams, COLORS_AGES, COLORS_AGES_TALL, COLORS_SUBTYPES


# Show every column/row and never truncate cell contents when displaying frames.
# None is the supported "no limit" value; the former -1 for max_colwidth has been
# deprecated since pandas 1.0.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Apply the shared plotting configuration. The helper is imported from aux_data_in_pyvar;
# presumably it updates matplotlib rcParams -- confirm in that module.
config_rcparams()

In [None]:
# Normalize counts

def get_context_counts(df_pat_count):
    """Convert per-sample mutation counts into per-sample proportions.

    Every value is divided by the total of its own row, so the channel values
    of each sample add up to 1. The input frame is left untouched.

    Parameters
    ----------
    df_pat_count : pandas.DataFrame
        Numeric table with one row per sample (sample identifiers in the index)
        and one column per mutation channel.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the proportions, with the former index moved into a
        regular column. That column is called 'SAMPLE' when the index was
        unnamed (pandas then names it 'index', which is renamed here).
        Rows whose total is 0 yield NaN values.
    """
    row_totals = df_pat_count.sum(axis=1)
    # Row-wise division aligned on the index; replaces the cell-by-cell loop.
    proportions = df_pat_count.div(row_totals, axis=0)
    return proportions.reset_index().rename(columns={'index': 'SAMPLE'})

In [None]:
# Output directory for the SVG figures saved by the plotting cells.
# With an empty string, os.path.join() returns the bare file name, so the
# figures are written to the current working directory.
dire_plot = ""

In [None]:
# Get age ranges and ALL subtypes from clinical data
clinical_raw = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')

damaged = ['AE6518_vs_AE6519', 'AE6521_vs_AE6522'] # PAT3, PAT4 primary samples are damaged
black_list = ['SJBALL021893_D1_vs_SJBALL021893_G1', 'SJBALL021894_D1_vs_SJBALL021894_G1',
              'SJBALL021895_D1_vs_SJBALL021895_G1', 'SJBALL021896_D1_vs_SJBALL021896_G1',
              'SJBALL021897_D1_vs_SJBALL021897_G1', 'SJTALL014_D_vs_SJTALL014_G',
              'SJPHALL020041_D1_vs_SJPHALL020041_G1'] # those lack some clinical information

# Keep primary samples only and drop the damaged / incompletely annotated comparisons.
# A single mask plus .copy() gives an independent frame, so the column assignment
# below does not act on a slice of the raw table (no SettingWithCopyWarning).
keep = (~clinical_raw['COMPARISON'].isin(damaged + black_list)) & (clinical_raw['STAGE'] == 'primary')
clinical = clinical_raw.loc[keep].copy()

# Literal (non-regex) substitution of the subtype label
clinical['SUBTYPE_LABEL'] = clinical['SUBTYPE_LABEL'].str.replace("PHALL", 'Ph positive', regex=False)
clinical.head()

In [None]:
## INITIATE COLLECTOR DF
# Empty frame that accumulates the normalized 96-channel counts of all cohorts
# (filled by the loading cells that follow).
dff_channels = pd.DataFrame()

In [None]:
## ADULT TALL
# Path to the results of the signature fitting done with deconstructSigs. Here we expect the results from
# all primary samples of the adult T-ALL cohort. If you check ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb
# it should correspond to a path pointing to a run with a folder named run_all_primary/
path_adult = ""
df_channels = pd.read_csv(os.path.join(path_adult, "mut_count_96_ch.tsv"), sep='\t')
df_channels = get_context_counts(df_channels)

## PEDIATRIC COHORTS
# Path to the results of the signature fitting done with deconstructSigs. Here we expect the results from
# all primary samples per cohort from the pediatric projects that we have downloaded.
# ../ext_runs/run_deconstructSig/make_inputs_fitting_adults.ipynb explains how to obtain
# the signature weights for the primary mutations of all the pediatric cohorts.
path_pediatric = ""

# Collect the per-cohort frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0 and growing a frame inside a loop is quadratic.
pediatric_frames = []
for file in glob.glob(os.path.join(path_pediatric, '*', 'mut_count_96_ch.tsv')):# *asterisk to catch all the pediatric cohort folders
    # Cohort name = name of the folder containing the file (separator-independent)
    cohort = os.path.basename(os.path.dirname(file))
    print(cohort)
    df = pd.read_csv(file, sep='\t')
    pediatric_frames.append(get_context_counts(df))

dff_channels = pd.concat([dff_channels] + pediatric_frames, ignore_index=True, sort=False)

In [None]:
# append both (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
dff_channels = pd.concat([dff_channels, df_channels], ignore_index=True, sort=False)
dff_channels = dff_channels.rename(columns={'SAMPLE':'PATIENT'})

# merge clinical info to counts
dff_channels = dff_channels.merge(clinical[['COHORT', 'PATIENT','COMPARISON', 'SUBTYPE_LABEL', 'AGE_RANGE']], how='left', on='PATIENT')

# sort them by customized subtype order and age range
dff_channels = dff_channels.sort_values('AGE_RANGE')

# Samples without a clinical match have a missing SUBTYPE_LABEL after the left merge;
# groupby leaves missing keys out, so those samples are not carried forward.
grps = dff_channels.groupby('SUBTYPE_LABEL')

subtype_order = ['iAMP21','Hyperdiploid', 'Hypodiploid',
                 'Ph positive', 'Ph-like','Infant MLL-R','DUX4-ERG', 'TALL Pediatric','TALL Adult']

# get_group raises KeyError if one of the expected subtypes is absent from the data
dff_channels = pd.concat([grps.get_group(g) for g in subtype_order], ignore_index=True, sort=False)

In [None]:
# Feature matrix for UMAP: the mutation-channel columns listed in CHANNELS
array_channels = dff_channels[CHANNELS]

# Per-sample annotations as numpy arrays, in the same row order as array_channels
array_cohorts, array_ages, array_subtypes, array_patients = (
    np.array(dff_channels[column].tolist())
    for column in ('COHORT', 'AGE_RANGE', 'SUBTYPE_LABEL', 'COMPARISON')
)

## UMAP

Uniform Manifold Approximation and Projection (manifold learning and dimension reduction algorithm)

In [None]:
# UMAP hyper-parameters; random_state is fixed so the embedding is the same on every run
umap_settings = dict(random_state=53, n_neighbors=20, min_dist=0.2)

reducer = umap.UMAP(**umap_settings)
embedding = reducer.fit_transform(array_channels)
embedding.shape

In [None]:
fig = plt.figure(figsize=(10,10))
outer = gridspec.GridSpec(1,1, wspace=0, hspace=0)
ax = plt.subplot(outer[0,0])

# One scatter call per sample, coloured by subtype.
# T-ALL samples are drawn as larger triangles, every other subtype as circles.
for i, label in enumerate(array_subtypes):
    if "TALL" in label:
        marker_style = dict(s=200, marker="^", edgecolors='#000000')
    else:
        marker_style = dict(s=140, edgecolors='#252525', alpha=0.9)
    ax.scatter(embedding[i, 0], embedding[i, 1], c=COLORS_SUBTYPES[label], label=label,
               linewidths=0.8, **marker_style)

# Axis decoration does not depend on the sample, so it is done once, outside the loop
ax.set_yticks([])
ax.set_xticks([])
ax.set_xlabel("Component 1", fontsize=14, labelpad=10)
ax.set_ylabel("Component 2",fontsize=14,labelpad=10)
ax.set_title("UMAP of ALL subtypes", fontsize=18,pad=10)

# Legend: every point carries a label, so collapse the duplicates to one entry per subtype
handles, labels = ax.get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys(),prop={'size': 14}, loc='center right', bbox_to_anchor=(1.3, 0.5))
fig.savefig(os.path.join(dire_plot,"umap_all_subtypes.svg"), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
fig = plt.figure(figsize=(10,10))
outer = gridspec.GridSpec(1,1, wspace=0, hspace=0)
ax = plt.subplot(outer[0,0])

# One scatter call per sample, coloured by age range.
# Comparisons whose identifier contains 'AE' or 'SJTALL' are drawn as larger triangles
# with the T-ALL age palette; all other samples as circles with the general palette.
for i, label in enumerate(array_ages):
    com = array_patients[i]
    if ('AE' in com) or ('SJTALL' in com):
        marker_style = dict(s=200, c=COLORS_AGES_TALL[label], marker="^", edgecolors='#000000')
    else:
        marker_style = dict(s=140, c=COLORS_AGES[label], edgecolors='#252525')
    ax.scatter(embedding[i, 0], embedding[i, 1], label=label, linewidths=0.8, **marker_style)

# Axis decoration does not depend on the sample, so it is done once, outside the loop
ax.set_yticks([])
ax.set_xticks([])
ax.set_xlabel("Component 1", fontsize=14, labelpad=10)
ax.set_ylabel("Component 2",fontsize=14,labelpad=10)
ax.set_title("UMAP of ALL age groups", fontsize=18,pad=10)

# Legend: collapse the per-point entries to one entry per age-range label.
# NOTE(review): if COLORS_AGES and COLORS_AGES_TALL share keys, triangle and circle
# samples with the same label end up in a single legend entry -- confirm this is intended.
handles, labels = ax.get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys(),prop={'size': 14}, loc='center right', bbox_to_anchor=(1.3, 0.5))
fig.savefig(os.path.join(dire_plot,"umap_all_age.svg"), dpi=300, bbox_inches='tight')
plt.show()