In [None]:
import os
import sys
# Record interpreter version, working directory and executable for reproducibility.
# The working directory is printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("Python version " + sys.version)
print("Working directory: " + os.getcwd())
print(sys.executable)

In [None]:
import numpy as np
np.random.seed(123)
import pandas as pd
import scipy
import itertools

import umap
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scanpy as sc
import anndata as ad
import scvelo as scv
from tqdm.notebook import tqdm

from pathlib import Path

In [None]:
import venn

In [None]:
import cellrank as cr

In [None]:
cr.__version__

In [None]:
from cellrank.kernels import PseudotimeKernel

In [None]:
from cellrank.kernels import CytoTRACEKernel

In [None]:
import scanpy.external as sce

In [None]:
from pysankey import sankey

In [None]:
import upsetplot
from upsetplot import from_contents
from upsetplot import UpSet
from upsetplot import plot

In [None]:
sc.settings.verbosity = 1
sc.logging.print_header()
sc.settings.set_figure_params(dpi=150, facecolor='white')

In [None]:
# remove weird grid from scvelo
plt.rcParams['axes.grid'] = False

In [None]:
plt.rcParams.keys()

In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg', dpi = 300)

In [None]:
# Font handling for exported figures: 'svg.fonttype' = 'none' writes text as text
# elements instead of paths; family and size are set once for all figures.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'font.family': 'DejaVu Sans',
    'font.size': '8',
})


In [None]:
# revised from Stefan's cell type signature
# NOTE(review): `signatures_path_` is not referenced again in the visible part of this
# notebook — confirm whether it is still needed.
signatures_path_ = '../cell_type_from_stefan/scrnaseq_signature_collection/'
# NOTE(review): wildcard import from a project-local module; it is not visible from here
# which names it adds to (or overrides in) the notebook namespace. Prefer importing the
# required names explicitly once they are known.
from score_and_classify import *

In [None]:
# Root folder of the pre-processed datasets. Defaults to the original cluster location;
# set the CRC_DATA_DIR environment variable to run the notebook on another machine.
new_data_folder = os.environ.get(
    'CRC_DATA_DIR', '/fast/users/twei_m/work/crc/datasets_new_preprocessing'
)

In [None]:
adata_epi = sc.read(Path(new_data_folder)/'202306_CB_epi_Numbat_Scitcem_inferCNV_icms_Uhlitz_scanvi.h5')

In [None]:
tsamples = ['p007t', 'p008t', 'p009t1','p009t2', 'p013t', 'p014t', 'p016t', 
           'p020t', 'p021t', 'p026t', 'p035t'] 

In [None]:
samples = ['p007n', 'p008n', 'p009n1', 'p009n2','p013n', 'p014n', 'p016n', 
           'p020n', 'p021n'] + tsamples

### Numbat and Scitcem

In [None]:
[(adata_epi.obs['numbat'] == 'tumour\n(tumour sample)') & (adata_epi.obs['scitcem_call'] == 'tumour\n(tumour sample)')]

In [None]:
# Per-cell verdicts of the two genomic callers.
numbat_tumour = adata_epi.obs['numbat'] == 'tumour\n(tumour sample)'
scitcem_tumour = adata_epi.obs['scitcem_call'] == 'tumour\n(tumour sample)'
numbat_normal = adata_epi.obs['numbat'] == 'normal\n(tumour sample)'
scitcem_normal = adata_epi.obs['scitcem_call'] == 'normal\n(tumour sample)'
# NOTE(review): this label carries leading/trailing newlines, unlike the 'tumour\nsample'
# label used in other cells — confirm it matches the values stored in `sample_origin`.
from_normal_sample = adata_epi.obs['sample_origin'] == '\nnormal\nsample\n'

# True where both callers agree on "tumour".
adata_epi.obs['Numbat_Scitcem_tumour'] = (numbat_tumour & scitcem_tumour).to_numpy()

# Vectorised replacement of the former per-row loop (which wrote through chained
# indexing, `obs[col][i] = ...`). np.select returns the choice of the FIRST matching
# condition, reproducing the if/elif priority:
#   normal sample > both callers tumour > both callers normal > anything else.
consensus_labels = np.select(
    [
        from_normal_sample.to_numpy(),
        (numbat_tumour & scitcem_tumour).to_numpy(),
        (numbat_normal & scitcem_normal).to_numpy(),
    ],
    ['normal_sample', 'genomically_tumour', 'genomically_normal'],
    default='no confident assignment',
)
adata_epi.obs['tumour_normal_normal'] = pd.Categorical(consensus_labels)

In [None]:
adata_epi.obs['tumour_normal_normal'].value_counts()

In [None]:
adata_epi.obs['tumour_normal_normal'] = adata_epi.obs['tumour_normal_normal'
                                                     ].cat.reorder_categories([
    'genomically_tumour', 'no confident assignment',
    'genomically_normal', 'normal_sample'])

In [None]:
adata_epi.uns['tumour_normal_normal_colors'] = ['#ff7f0e', '#9b1ee3','#1f77b4','#d3d3d3']

In [None]:
# Lookup table: one anatomical location label per patient.
# NOTE(review): the labels are paired POSITIONALLY with the sorted unique patient IDs
# returned by np.unique, so this list must hold exactly one entry per patient, in that
# sorted order. A different number of patients raises on construction; the same number
# of different patients would silently mis-assign locations.
# NOTE(review): the numeric prefixes presumably encode an ordering along the colon — confirm.
anatomical_loc = pd.DataFrame({'patient': list(np.unique(adata_epi.obs['patient'])) ,
                               'anatomical_location':['0_Cecum', '0_Cecum', '7_Sigmoid', '2_Ascending', '2_Ascending',
                                                      '9_Rectum', '9_Rectum', '6_Descending', '0_Cecum', '2_Ascending']})

In [None]:
# Attach each cell's anatomical location by looking up its patient.
# A direct lookup (instead of DataFrame.merge + set_axis) leaves the cell index and the
# dtypes of all existing columns untouched, and makes the cell safe to re-run: merging
# a second time would create `anatomical_location_x` / `anatomical_location_y`.
# Patients missing from `anatomical_loc` get NaN, as with the former left merge.
patient_to_location = anatomical_loc.set_index('patient')['anatomical_location']
adata_epi.obs['anatomical_location'] = (
    adata_epi.obs['patient'].astype(object).map(patient_to_location)
)

### Venn

In [None]:
# Barcodes of tumour-sample cells that inferCNV labels 'CNA'.
infercnv_cna_mask = (
    (adata_epi.obs['inferCNV_result'] == 'CNA\n(tumour sample)')
    & (adata_epi.obs['sample_origin'] == 'tumour\nsample')
)
set_infercnv = set(adata_epi.obs.index[infercnv_cna_mask.to_numpy()])
len(set_infercnv)

In [None]:
# Barcodes of tumour-sample cells that Scitcem calls tumour.
scitcem_tumour_mask = (
    (adata_epi.obs['scitcem_call'] == 'tumour\n(tumour sample)')
    & (adata_epi.obs['sample_origin'] == 'tumour\nsample')
)
set_scitcem = set(adata_epi.obs.index[scitcem_tumour_mask.to_numpy()])
len(set_scitcem)

In [None]:
# Barcodes of tumour-sample cells that Numbat calls tumour.
numbat_tumour_mask = (
    (adata_epi.obs['numbat'] == 'tumour\n(tumour sample)')
    & (adata_epi.obs['sample_origin'] == 'tumour\nsample')
)
set_numbat = set(adata_epi.obs.index[numbat_tumour_mask.to_numpy()])
len(set_numbat)

# Three-way Venn diagram of the tumour calls; each region shows its cell count.
labels = venn.get_labels([set_scitcem, set_numbat, set_infercnv], fill=['number'])
fig, ax = venn.venn3(labels, names=['Scitcem', 'Numbat', 'inferCNV_new'])
fig.suptitle('agreement on tumour cells (tumour sample only)', fontsize=24)
fig.show()

In [None]:
# Barcodes of tumour-sample cells whose iCMS (scANVI) label is anything but 'normal'.
icms_not_normal_mask = (
    (adata_epi.obs['iCMS_scANVI'] != 'normal\n(tumour sample)')
    & (adata_epi.obs['sample_origin'] == 'tumour\nsample')
)
set_icms_scANVI = set(adata_epi.obs.index[icms_not_normal_mask.to_numpy()])
len(set_icms_scANVI)

# Four-way Venn diagram of the tumour calls; each region shows its cell count.
labels = venn.get_labels([set_icms_scANVI, set_scitcem, set_numbat, set_infercnv],
                         fill=['number'])
fig, ax = venn.venn4(labels, names=['iCMS', 'Scitcem', 'Numbat', 'inferCNV'])
fig.suptitle('agreement on tumour cells (tumour sample only)', fontsize=24)
fig.show()

### upset plot

In [None]:
upset_dic = from_contents({'iCMS':sorted(set_icms_scANVI),
               'Scitcem':sorted(set_scitcem),
               'Numbat':sorted(set_numbat),
               'inferCNV':sorted(set_infercnv)
    
})

In [None]:
11008/17623

In [None]:
fig = plt.figure(figsize=(12, 5))
upsetp = plot(upset_dic, show_counts=True, fig=fig, element_size=None);

In [None]:
# Metadata of tumour-sample cells only.
upset_df = adata_epi[adata_epi.obs['sample_origin'] == 'tumour\nsample'].obs.copy()

# One boolean per method ("does this method call the cell tumour?"). Together they
# replace the cell barcodes as a four-level boolean index, the membership layout
# that UpSet reads.
caller_flags = {
    'iCMS': upset_df.iCMS_scANVI.isin(['iCMS2\n(tumour sample)', 'iCMS3\n(tumour sample)']),
    'inferCNV': upset_df.inferCNV_result == 'CNA\n(tumour sample)',
    'Numbat': upset_df.numbat == 'tumour\n(tumour sample)',
    'Scitcem': upset_df.scitcem_call == 'tumour\n(tumour sample)',
}
upset_df.index = pd.MultiIndex.from_arrays(
    [flag.to_numpy() for flag in caller_flags.values()],
    names=list(caller_flags),
)

In [None]:
fig = plt.figure(figsize=(12, 6))

# intersection_plot_elements=0 switches off the default intersection-size bars;
# the stacked bars added below take their place and split each intersection by MS status.
upsetp = UpSet(upset_df, show_counts='%d', show_percentages=False,
               intersection_plot_elements=0, element_size=40)
upsetp.add_stacked_bars(by='MS_status', elements=10)
upsetp.plot()
# Legend outside the plot area. The semicolon is attached to the call (a `;` alone on a
# line is not a valid statement) and suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

In [None]:
upset_df.inferCNV_result

In [None]:
# Keep every tumour-sample cell that at least one method does NOT classify as normal
# (for inferCNV: neither copy-number-neutral nor from a failed sample).
icms_not_normal = upset_df.iCMS_scANVI != 'normal\n(tumour sample)'
numbat_not_normal = upset_df.numbat != 'normal\n(tumour sample)'
scitcem_not_normal = upset_df.scitcem_call != 'normal\n(tumour sample)'
infercnv_not_normal = ~upset_df.inferCNV_result.isin(['CNN\n(tumour sample)', 'failed_sample'])

upset_df_tumour_cells = upset_df[
    icms_not_normal | numbat_not_normal | scitcem_not_normal | infercnv_not_normal
]

In [None]:
adata_epi.obs[['MS_status', 'anatomical_location', 'patient']].value_counts().sort_index()

In [None]:
fig = plt.figure(figsize=(12, 6))

# intersection_plot_elements=0 switches off the default intersection-size bars;
# the stacked bars split each intersection by anatomical location instead.
upsetp = UpSet(upset_df_tumour_cells, show_counts=True, show_percentages=False,
               intersection_plot_elements=0, element_size=40)
upsetp.add_stacked_bars(by='anatomical_location', elements=10)
upsetp.plot()
# Semicolon attached to the call (a `;` alone on a line is not a valid statement);
# it suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

In [None]:
fig = plt.figure(figsize=(12, 6))

# intersection_plot_elements=0 switches off the default intersection-size bars;
# the stacked bars split each intersection by patient instead.
upsetp = UpSet(upset_df_tumour_cells, show_counts=True, show_percentages=False,
               intersection_plot_elements=0, element_size=40)
upsetp.add_stacked_bars(by='patient', elements=10)
upsetp.plot()
# Semicolon attached to the call (a `;` alone on a line is not a valid statement);
# it suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

In [None]:
fig = plt.figure(figsize=(12, 6))

# intersection_plot_elements=0 switches off the default intersection-size bars;
# the stacked bars split each intersection by the Uhlitz_scANVI label instead.
upsetp = UpSet(upset_df_tumour_cells, show_counts=True, show_percentages=False,
               intersection_plot_elements=0, element_size=40)
upsetp.add_stacked_bars(by='Uhlitz_scANVI', elements=10)
upsetp.plot()
# Semicolon attached to the call (a `;` alone on a line is not a valid statement);
# it suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

### Consensus cell identity

- normal sample → normal sample
- Numbat == tumour and Scitcem == tumour → tumour; same rule for normal
- Scitcem == 0.5 & Numbat > 0.5 → tumour
- MSS: Scitcem < 0.5 & Numbat > 0.5 → tumour
- MSI: Scitcem > 0.5 & WGS normal sample Mutect2 alt. allele == 0 → tumour

- cell_identity_list = pd.read_csv(Path(new_data_folder)/'conflicting_cell_identity_scitcem_high.txt', index_col=0)

In [None]:
# Per-cell verdicts of the two genomic callers.
numbat_tumour = adata_epi.obs['numbat'] == 'tumour\n(tumour sample)'
scitcem_tumour = adata_epi.obs['scitcem_call'] == 'tumour\n(tumour sample)'
numbat_normal = adata_epi.obs['numbat'] == 'normal\n(tumour sample)'
scitcem_normal = adata_epi.obs['scitcem_call'] == 'normal\n(tumour sample)'
# NOTE(review): this label carries leading/trailing newlines, unlike the 'tumour\nsample'
# label used in other cells — confirm it matches the values stored in `sample_origin`.
from_normal_sample = adata_epi.obs['sample_origin'] == '\nnormal\nsample\n'

# Tie-breakers for cells on which the callers disagree: the Numbat probability decides
# in MSS samples, the Scitcem probability in MSI samples. A missing probability compares
# as False and therefore never triggers a tie-breaker.
# (A former rule "scitcem_p == 0.5 and p_cnv > 0.5 -> tumour" stays disabled: cells
# without an observed alt. allele are left to the MSS/MSI rules.)
mss_numbat_high = (adata_epi.obs['MS_status'] == 'MSS') & (adata_epi.obs['p_cnv'] > 0.5)
msi_scitcem_high = (adata_epi.obs['MS_status'] == 'MSI') & (adata_epi.obs['scitcem_p'] > 0.5)

# Vectorised replacement of the former per-row loop (which wrote through chained
# indexing, `obs[col][i] = ...`). np.select returns the choice of the FIRST matching
# condition, reproducing the if/elif priority.
identity_labels = np.select(
    [
        from_normal_sample.to_numpy(),
        (numbat_tumour & scitcem_tumour).to_numpy(),
        (numbat_normal & scitcem_normal).to_numpy(),
        mss_numbat_high.to_numpy(),
        msi_scitcem_high.to_numpy(),
    ],
    [
        'normal_sample',
        'genomically_tumour',
        'genomically_normal',
        'genomically_tumour',
        'genomically_tumour',
    ],
    default='no confident assignment',
)
adata_epi.obs['cell_identity'] = pd.Categorical(identity_labels)
          

In [None]:
adata_epi.obs['cell_identity'].value_counts().sort_index()

In [None]:
adata_epi[adata_epi.obs['cell_identity'] == 'no confident assignment'].obs['sample'].value_counts().sort_index()

In [None]:
adata_epi.uns['cell_identity_colors'] = adata_epi.uns['tumour_normal_normal_colors']

In [None]:
adata_epi.obs['cell_identity'] = adata_epi.obs['cell_identity'].cat.reorder_categories(['genomically_tumour','no confident assignment', 
                                                             'genomically_normal','normal_sample'])

In [None]:
adata_epi.obs['tumour_normal_normal'] = adata_epi.obs['tumour_normal_normal'].cat.reorder_categories(['genomically_tumour','no confident assignment', 
                                                             'genomically_normal','normal_sample'])

adata_epi.write(Path(new_data_folder)/'202310_CB_epi_Numbat_Scitcem_inferCNV_icms_Uhlitz_scanvi_resolved_identity.h5')

adata_epi.obs['cell_identity'].to_frame().to_csv(Path(new_data_folder)/'20231108_Numbat_and_Scitcem_resolved_assignment_cellid.csv')

In [None]:
# re-run to include cell_identity col
upset_df = adata_epi[adata_epi.obs['sample_origin'] == 'tumour\nsample'].obs.copy()

# One boolean per method ("does this method call the cell tumour?"). Together they
# replace the cell barcodes as a four-level boolean index, the membership layout
# that UpSet reads.
caller_flags = {
    'iCMS': upset_df.iCMS_scANVI.isin(['iCMS2\n(tumour sample)', 'iCMS3\n(tumour sample)']),
    'inferCNV': upset_df.inferCNV_result == 'CNA\n(tumour sample)',
    'Numbat': upset_df.numbat == 'tumour\n(tumour sample)',
    'Scitcem': upset_df.scitcem_call == 'tumour\n(tumour sample)',
}
upset_df.index = pd.MultiIndex.from_arrays(
    [flag.to_numpy() for flag in caller_flags.values()],
    names=list(caller_flags),
)

In [None]:
fig = plt.figure(figsize=(12, 6))

# intersection_plot_elements=0 switches off the default intersection-size bars;
# the stacked bars split each intersection by the consensus cell identity instead.
upsetp = UpSet(upset_df, show_counts='%d', show_percentages=False,
               intersection_plot_elements=0, element_size=40)
upsetp.add_stacked_bars(by='cell_identity', elements=10)
upsetp.plot()
# Semicolon attached to the call (a `;` alone on a line is not a valid statement);
# it suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

In [None]:
fig = plt.figure(figsize=(12, 6))

# Same plot as the previous cell, restricted to intersections of at most 2000 cells
# (max_subset_size) so the small intersections become readable.
upsetp = UpSet(upset_df, show_counts='%d', show_percentages=False,
               intersection_plot_elements=0, element_size=40, max_subset_size=2000)
upsetp.add_stacked_bars(by='cell_identity', elements=10)
upsetp.plot()
# Semicolon attached to the call (a `;` alone on a line is not a valid statement);
# it suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

In [None]:
fig = plt.figure(figsize=(12, 6))

# Intersections of at most 2000 cells (max_subset_size), split by MS status.
upsetp = UpSet(upset_df, show_counts='%d', show_percentages=False,
               intersection_plot_elements=0, element_size=40, max_subset_size=2000)
upsetp.add_stacked_bars(by='MS_status', elements=10)
upsetp.plot()
# Semicolon attached to the call (a `;` alone on a line is not a valid statement);
# it suppresses the Legend repr in the cell output.
plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

In [None]:
adata_epi[(adata_epi.obs['tumour_normal_normal'] == 'genomically_tumour')].shape

In [None]:
adata_epi[(adata_epi.obs['tumour_normal_normal'] == 'genomically_tumour')].obs['filter_alt'].value_counts()

### p035t 

In [None]:
# Scitcem > 0.5 > 226 cells
p035t_cells = adata_epi[(adata_epi.obs['sample'] == 'p035t') & 
          (adata_epi.obs['tumour_normal_normal'] == 'no confident assignment') &
          (adata_epi.obs['scitcem_p'] > 0.5)].obs.sort_values(['scitcem_p', 'p_cnv'])

In [None]:
(p035t_cells['scitcem_p'] < 1).value_counts()

In [None]:
plt.figure(figsize=(12, 5))
# One point per cell (x = cell barcode, in the order of `p035t_cells`), coloured by iCMS label.
sns.scatterplot(data=p035t_cells.reset_index(), x='index', y='p_cnv', hue='iCMS_scANVI')
# One tick per barcode is unreadable, so the x ticks are removed. An empty list is the
# supported way to do this; the former `plt.xticks('')` passed a string as tick
# locations, and the rotation set just before it had no visible effect.
plt.xticks([]);

In [None]:
p035t_cells[p035t_cells['scitcem_p'] < 0.999999999999999833].shape

In [None]:
plt.figure(figsize=(12, 5))
sns.scatterplot(data = p035t_cells[p035t_cells['scitcem_p'] < 0.999999999999999833].reset_index(), 
                x = 'index', y = 'p_cnv')
plt.xticks(rotation=90);

In [None]:
p035t_cells[85:100]

In [None]:
p035t_cells.reset_index()[p035t_cells.reset_index()['filter_alt'] > 100]

In [None]:
p035t_cells.shape[0]-86

### huge heatmap of cell identity

In [None]:
sns.color_palette("blend:#1f77b4,#d5d5d5,#ff7f0e", as_cmap=True)

In [None]:
# assign tumour to 1 and normal to 0 for heatmap
# NOTE(review): `tsamples` must still hold the original sample IDs here; a later cell
# rebinds it to display labels, after which this selection would match no cells.
tumour_sample_obs = adata_epi[adata_epi.obs['sample'].isin(tsamples)].obs

# One indicator column per (caller, label); only the two "tumour" indicators are kept.
caller_dummies = pd.get_dummies(tumour_sample_obs[['numbat', 'scitcem_call']])
tumour_indicators = caller_dummies[[
    'numbat_tumour\n(tumour sample)', 'scitcem_call_tumour\n(tumour sample)'
]]

# Add sample ID, caller probabilities and coverage statistics, joined on the cell barcode.
dummy_ci_df = tumour_indicators.merge(
    tumour_sample_obs[['sample', 'p_cnv', 'scitcem_p', 'filter_alt', 'filter_dp', 'mean_VAF']],
    left_index=True, right_index=True,
)

In [None]:
dummy_ci_df.shape

In [None]:
dummy_ci_df[['numbat_tumour\n(tumour sample)','scitcem_call_tumour\n(tumour sample)']].value_counts()

In [None]:
# One two-row heatmap per tumour sample: Numbat call (top row) and Scitcem call (bottom
# row) for every cell, with cells sorted by the calls so agreement/disagreement shows
# up as blocks. A loop over `tsamples` replaces eleven copy-pasted panels with
# hard-coded indices, so the figure follows the length of `tsamples`.
with plt.style.context('./plt_style'):
    cmap = sns.color_palette("blend:#1f77b4,#ff7f0e", as_cmap=True)
    call_cols = ['numbat_tumour\n(tumour sample)', 'scitcem_call_tumour\n(tumour sample)']

    fig, axes = plt.subplots(nrows=len(tsamples))
    axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when nrows == 1

    for ax, sample_id in zip(axes, tsamples):
        sample_calls = dummy_ci_df[dummy_ci_df['sample'] == sample_id].sort_values(call_cols)
        sns.heatmap(sample_calls[call_cols].T,
                    xticklabels=False, yticklabels=False,
                    cmap=cmap, ax=ax, cbar=False)
        # Sample name as a horizontal label to the left of its panel.
        ax.set(ylabel=sample_id)
        ax.yaxis.label.set(rotation=0, ha='right', va='center', size=16)

    axes[0].set_title('Cell identity assignment\n')

In [None]:
# Imported here because this cell runs before the only other cell that imports it;
# on a fresh kernel the name would otherwise be undefined.
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# imshow version of the per-sample call heatmap, with a shared colour bar.
cmap = sns.color_palette("blend:#1f77b4,#ff7f0e", as_cmap=True)
call_cols = ['numbat_tumour\n(tumour sample)', 'scitcem_call_tumour\n(tumour sample)']

fig, axes = plt.subplots(nrows=len(tsamples), subplot_kw={'yticks': []}, figsize=(10, 10))
axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when nrows == 1

# Tall, narrow inset to the right of the first panel that hosts the colour bar.
axins = inset_axes(axes[0], width="100%", height="2000%",
                   bbox_to_anchor=(1.05, -2, .03, .4),
                   bbox_transform=axes[0].transAxes, loc=2, borderpad=0)
axins.tick_params(left=False, right=True, labelleft=False, labelright=True)

images = []
for ax, sample_id in zip(axes, tsamples):
    # Rows: Numbat call, Scitcem call; columns: cells sorted by the two calls.
    data = np.asarray(
        dummy_ci_df[dummy_ci_df['sample'] == sample_id].sort_values(call_cols)[call_cols].T
    )

    ax.set(ylabel=sample_id)
    ax.yaxis.label.set(rotation=0, ha='right', va='center')

    images.append(ax.imshow(data, cmap=cmap, aspect='auto',
                            interpolation='nearest', interpolation_stage='rgba'))

# The colour bar is taken from the first panel only.
fig.colorbar(images[0], cax=axins, fraction=.03)

fig.subplots_adjust(hspace=0.6);

In [None]:
dummy_ci_df[dummy_ci_df.p_cnv > 1] #3275

In [None]:
# Numbat probabilities that equal 1 after rounding to two decimals (this includes the
# values slightly above 1 listed in the previous cell) are set to exactly 1; all other
# values, including missing ones, are kept unchanged.
# Vectorised replacement of the former per-row loop that wrote through chained
# indexing (`df[col][i] = ...`).
p_cnv = dummy_ci_df['p_cnv']
dummy_ci_df['mod_p_cnv'] = np.where(np.round(p_cnv, 2) == 1, 1.0, p_cnv).astype('float64')

In [None]:
tsamples_original = tsamples.copy()

In [None]:
tsamples = ['P07t',
 'P08t',
 'P09t',
 'P09t',
 'P13t',
 'P14t',
 'P16t',
 'P20t',
 'P21t',
 'P26t',
 'P35t']

In [None]:
# Short display labels for the samples: 'p0' becomes 'P' (e.g. 'p007t' -> 'P07t'), and
# the two p009 tumour (resp. normal) samples are pooled under 'P09t' (resp. 'P09n').
# Vectorised replacement of the former per-row loop that wrote through chained
# indexing (`df[col][i] = ...`).
sample_ids = dummy_ci_df['sample'].astype(object)
sample_labels = sample_ids.str.replace('p0', 'P', regex=False)
sample_labels = sample_labels.mask(sample_ids.isin(['p009t1', 'p009t2']), 'P09t')
sample_labels = sample_labels.mask(sample_ids.isin(['p009n1', 'p009n2']), 'P09n')

dummy_ci_df['sample_rev'] = sample_labels.astype('category')

In [None]:
dummy_ci_df['sample_original'] = dummy_ci_df['sample'].copy()

In [None]:
dummy_ci_df['sample'] = dummy_ci_df['sample_rev'].copy()

In [None]:
# map the probability
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# One two-row heatmap per entry of `tsamples`: Numbat probability (mod_p_cnv, top row)
# and Scitcem probability (bottom row) for every cell. Cells are sorted by the binary
# calls first and by the probabilities second. A loop over `tsamples` replaces eleven
# copy-pasted panels with hard-coded indices.
# NOTE(review): a label that occurs twice in `tsamples` produces two identical panels.
# NOTE(review): no common vmin/vmax is set, so every panel is colour-scaled to its own
# data range and the colour bar (drawn for the first panel) describes that panel only —
# confirm this is intended.
with plt.style.context('./plt_style'):
    cmap = sns.color_palette("blend:#1f77b4,#333333,#ff7f0e", as_cmap=True)
    call_cols = ['numbat_tumour\n(tumour sample)', 'scitcem_call_tumour\n(tumour sample)']
    prob_cols = ['mod_p_cnv', 'scitcem_p']

    fig, axes = plt.subplots(nrows=len(tsamples))
    axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when nrows == 1

    # Tall, narrow inset to the right of the first panel that hosts the colour bar.
    axins = inset_axes(axes[0], width="100%", height="2000%",
                       bbox_to_anchor=(1.05, -2, .03, .4),
                       bbox_transform=axes[0].transAxes, loc=2, borderpad=0)
    axins.tick_params(left=False, right=True, labelleft=False, labelright=True)

    for panel_no, (ax, sample_id) in enumerate(zip(axes, tsamples)):
        sample_probs = (
            dummy_ci_df[dummy_ci_df['sample'] == sample_id]
            .sort_values(call_cols + prob_cols)[prob_cols].T
        )
        if panel_no == 0:
            # Only the first panel draws a colour bar (into the inset axes).
            sns.heatmap(sample_probs, xticklabels=False, yticklabels=False,
                        cmap=cmap, ax=ax, cbar_ax=axins)
        else:
            sns.heatmap(sample_probs, xticklabels=False, yticklabels=False,
                        cmap=cmap, ax=ax, cbar=False)

        # Sample name as a horizontal label to the left of its panel.
        ax.set(ylabel=sample_id)
        ax.yaxis.label.set(rotation=0, ha='right', va='center', size=16)

In [None]:
tsamples = ['P07t',
 'P08t',
 'P09t',
 'P13t',
 'P14t',
 'P16t',
 'P20t',
 'P21t',
 'P26t',
 'P35t']

In [None]:
dummy_ci_df['sample'].value_counts().sort_index()

In [None]:
adata_epi.obs['sample'].value_counts().sort_index()

In [None]:
data.shape[1]/2

In [None]:
# Imported locally so the cell does not depend on an import hidden inside another cell.
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# imshow version of the per-sample probability heatmap (figure size 7 cm x 10 cm).
# The number of panels follows `tsamples` instead of a hard-coded 10.
cmap = sns.color_palette("blend:#1f77b4,#333333,#ff7f0e", as_cmap=True)
call_cols = ['numbat_tumour\n(tumour sample)', 'scitcem_call_tumour\n(tumour sample)']
prob_cols = ['mod_p_cnv', 'scitcem_p']

fig, axes = plt.subplots(nrows=len(tsamples), subplot_kw={'xticks': [], 'yticks': []},
                         figsize=(7/2.54, 10/2.54))
axes = np.atleast_1d(axes)  # plt.subplots returns a bare Axes when nrows == 1

# Tall, narrow inset to the right of the first panel that hosts the colour bar.
axins = inset_axes(axes[0], width="100%", height="2000%",
                   bbox_to_anchor=(1.05, -1, .03, .4),
                   bbox_transform=axes[0].transAxes, loc=2, borderpad=0)
axins.tick_params(left=False, right=True, labelleft=False, labelright=True)

images = []
for ax, sample_id in zip(axes, tsamples):
    # Rows: Numbat probability, Scitcem probability; columns: cells sorted by the
    # binary calls first and by the probabilities second.
    data = np.asarray(
        dummy_ci_df[dummy_ci_df['sample'] == sample_id]
        .sort_values(call_cols + prob_cols)[prob_cols].T
    )

    ax.set(ylabel=sample_id)
    ax.yaxis.label.set(rotation=0, ha='right', va='center')
    ax.set_frame_on(False)

    images.append(ax.imshow(data, cmap=cmap, aspect='auto',
                            interpolation='nearest', interpolation_stage='rgba'))

# Label the x axis of the bottom panel as the fraction of that sample's cells
# (`data` still holds the last sample drawn).
axes[-1].set_xticks([0, data.shape[1]/2, data.shape[1]-1], ['0.0', '0.5', '1.0'])
axes[-1].set(xlabel='Proportion')

# The colour bar is taken from the first panel only.
cbar = fig.colorbar(images[0], cax=axins, fraction=.03)
cbar.solids.set_edgecolor('face')

fig.subplots_adjust(hspace=0.1);

import matplotlib.gridspec as gridspec

height = 6
width = 8

test = np.array(dummy_ci_df[dummy_ci_df['sample'] == tsamples[6]].sort_values([
                                                               'p_cnv','scitcem_p']
                                                                             )[['p_cnv']])

test2 = np.array(dummy_ci_df[dummy_ci_df['sample'] == tsamples[6]].sort_values([
                                                               'p_cnv','scitcem_p']
                                                                             )[['scitcem_p']])

fig = plt.figure(figsize=(width, height))
axs = gridspec.GridSpec(nrows = 2, ncols =1)

heatmap_ax1 = fig.add_subplot(axs[0, :])
heatmap_ax1.imshow(test.reshape(1,-1), 
                  aspect='auto', interpolation='nearest')
heatmap_ax1.set_ylim(0,0.5)
heatmap_ax1.tick_params(axis="both", which = 'both',labelleft=False, left=False, bottom = False,
                       labelbottom=False)

heatmap_ax2 = fig.add_subplot(axs[1, :])
heatmap_ax2.imshow(test2.reshape(1,-1), 
                  aspect='auto', interpolation='nearest')
heatmap_ax2.set_ylim(0,0.5)
heatmap_ax2.tick_params(axis="both", which = 'both',labelleft=False, left=False, bottom = False,
                       labelbottom=False)



plt.show()

p014
with plt.rc_context({'ytick.major.size':16, 'axes.titlesize':18}):
    plt.figure(figsize = (10,5))
    sns.heatmap(dummy_ci_df[(dummy_ci_df['sample'] == tsamples[6])&
                           (dummy_ci_df['numbat_tumour\n(tumour sample)'] != dummy_ci_df['scitcem_call_tumour\n(tumour sample)'])].sort_values(
        ['scitcem_p', 'p_cnv'])[['p_cnv','scitcem_p',
                                 'numbat_tumour\n(tumour sample)',
                                 'scitcem_call_tumour\n(tumour sample)']].T,
                xticklabels = False,
                cmap='tab20', #cbar=False, 
            yticklabels = False)
;

p035
with plt.rc_context({'ytick.major.size':16, 'axes.titlesize':18}):
    plt.figure(figsize = (10,5))
    sns.heatmap(dummy_ci_df[(dummy_ci_df['sample'] == tsamples[10])&
                           (dummy_ci_df['numbat_tumour\n(tumour sample)'] != dummy_ci_df['scitcem_call_tumour\n(tumour sample)'])].sort_values(
        ['scitcem_p', 'p_cnv'])[['p_cnv','scitcem_p',
                                 'numbat_tumour\n(tumour sample)',
                                 'scitcem_call_tumour\n(tumour sample)']].T,
                xticklabels = False,
                cmap='tab20', #cbar=False, 
            yticklabels = False)
;

In [None]:
# Cells for which the Numbat column and the Scitcem call column hold different values.
numbat_call = dummy_ci_df['numbat_tumour\n(tumour sample)']
scitcem_call = dummy_ci_df['scitcem_call_tumour\n(tumour sample)']
dummy_ci_df_conflicting = dummy_ci_df[numbat_call.ne(scitcem_call)]

In [None]:
dummy_ci_df_conflicting

In [None]:
dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]['filter_alt'].value_counts(dropna= False)

In [None]:
dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]['filter_dp'].value_counts(dropna= False)

In [None]:
dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]['mean_VAF'].value_counts(dropna= False)

In [None]:
dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]['sample'].value_counts(
    dropna= False).sort_index()

In [None]:
dummy_ci_df_conflicting['sample'].value_counts(
    dropna= False).sort_index()

In [None]:
# proportion of conflicting cells 
(dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]['sample'].value_counts(
    dropna= False).sort_index())/(dummy_ci_df_conflicting['sample'].value_counts(
    dropna= False).sort_index())

In [None]:
# proportion of all cells 
(dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]['sample'].value_counts(
    dropna= False).sort_index())/(dummy_ci_df['sample'].value_counts(
    dropna= False).sort_index())

In [None]:
# One figure per tumour sample: p_cnv (left axis) and scitcem_p (right axis,
# firebrick) for the conflicting cells, ordered by scitcem_p and then p_cnv.
for i in tsamples:
    plt.figure(figsize = (6,2))
    ax1 = plt.subplot()
    ax2 = ax1.twinx()
    df = dummy_ci_df_conflicting[dummy_ci_df_conflicting['sample'] == i].reset_index().sort_values([
        'scitcem_p', 'p_cnv'])
    sns.scatterplot(data = df,
                    x = 'index',
                    y = 'p_cnv', ax = ax1, s = 10)
    sns.scatterplot(data = df,
                    x = 'index',
                    y = 'scitcem_p', ax = ax2, color = 'firebrick', s = 10)
    plt.title(f'{i} ({df.shape[0]} cells)')
    ax1.set_ylim(-0.1,1.1)
    ax2.set_ylim(-0.1,1.1)
    # The x axis has one position per cell: remove the tick marks as well as the
    # labels, as the other per-sample scatter cells in this notebook do.
    ax1.set_xticklabels('')
    ax1.set_xticks([])

In [None]:
# number of alt
# One figure per tumour sample for the conflicting cells: p_cnv on the left axis,
# scitcem_p (firebrick) and filter_alt (orange) on two twin axes.
for i in tsamples:
    plt.figure(figsize = (10/2.54,5/2.54))
    ax1 = plt.subplot()
    ax2, ax3 = ax1.twinx(), ax1.twinx()

    in_sample = dummy_ci_df_conflicting['sample'] == i
    df = dummy_ci_df_conflicting[in_sample].reset_index().sort_values(['scitcem_p', 'p_cnv'])

    # Same draw order as before: p_cnv, then scitcem_p, then filter_alt.
    for axis, column, point_style in (
        (ax1, 'p_cnv', {'s': 10}),
        (ax2, 'scitcem_p', {'color': 'firebrick', 's': 10}),
        (ax3, 'filter_alt', {'color': 'orange', 's': 8}),
    ):
        sns.scatterplot(data = df, x = 'index', y = column, ax = axis, **point_style)

    plt.title(f'{i} ({df.shape[0]} cells)')

    # Only the two probability axes get fixed limits.
    for axis in (ax1, ax2):
        axis.set_ylim(-0.1, 1.1)

    ax1.set_xticklabels('')
    ax1.set_xticks([])
    ax1.set_ylabel('probability')

    # Hide the scitcem_p axis decorations.
    ax2.set_yticklabels('')
    ax2.set_yticks([])
    ax2.set_ylabel('')

In [None]:
# mean_VAF
# One figure per tumour sample for the conflicting cells: p_cnv on the left axis,
# scitcem_p (firebrick) and mean_VAF (orange) on two twin axes.
for i in tsamples:
    plt.figure(figsize = (10/2.54,5/2.54))
    ax1 = plt.subplot()
    ax2, ax3 = ax1.twinx(), ax1.twinx()

    in_sample = dummy_ci_df_conflicting['sample'] == i
    df = dummy_ci_df_conflicting[in_sample].reset_index().sort_values(['scitcem_p', 'p_cnv'])

    # Same draw order as before: p_cnv, then scitcem_p, then mean_VAF.
    for axis, column, point_style in (
        (ax1, 'p_cnv', {'s': 10}),
        (ax2, 'scitcem_p', {'color': 'firebrick', 's': 10}),
        (ax3, 'mean_VAF', {'color': 'orange', 's': 8}),
    ):
        sns.scatterplot(data = df, x = 'index', y = column, ax = axis, **point_style)

    plt.title(f'{i} ({df.shape[0]} cells)')

    # All three axes share the same fixed limits here.
    for axis in (ax1, ax2, ax3):
        axis.set_ylim(-0.1, 1.1)

    ax1.set_xticklabels('')
    ax1.set_xticks([])
    ax1.set_ylabel('probability')

    # Hide the scitcem_p axis decorations.
    ax2.set_yticklabels('')
    ax2.set_yticks([])
    ax2.set_ylabel('')

In [None]:
# Conflicting cells whose scitcem_p is exactly 0.5.
# NOTE(review): 0.5 looks like the value given to cells without an informative
# Scitcem call — confirm against the code that produced scitcem_p.
dummy_ci_df_conflicting_NA = dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] == 0.5]

In [None]:
# Remaining conflicting cells: every row whose scitcem_p is not exactly 0.5.
# NOTE(review): a missing (NaN) scitcem_p also satisfies `!= 0.5`, so such rows
# would land in this frame — confirm that scitcem_p has no missing values.
dummy_ci_df_conflicting_NA_solved = dummy_ci_df_conflicting[dummy_ci_df_conflicting['scitcem_p'] != 0.5]

In [None]:
dummy_ci_df_conflicting_NA_solved.shape

In [None]:
# Counts of each (filter_dp, filter_alt, mean_VAF) combination among the P26t
# conflicting cells with scitcem_p below 0.5.
# The sample label uses the same 'P26t' form as `tsamples`, which the other cells
# compare against this `sample` column; the former 'p026t' spelling would select
# no rows. NOTE(review): confirm against dummy_ci_df['sample'].unique().
dummy_ci_df_conflicting_NA_solved[(dummy_ci_df_conflicting_NA_solved['sample'] == 'P26t')&
                                 (dummy_ci_df_conflicting_NA_solved['scitcem_p'] < 0.5)][[
    'filter_dp','filter_alt', 'mean_VAF']].value_counts().sort_index()

In [None]:
dummy_ci_df_conflicting_NA_solved[(dummy_ci_df_conflicting_NA_solved['scitcem_p'] < 0.5)]['mean_VAF'].value_counts()

In [None]:
dummy_ci_df_conflicting_NA_solved[(dummy_ci_df_conflicting_NA_solved['scitcem_p'] < 0.5)].shape

In [None]:
1209/2562

In [None]:
dummy_ci_df_conflicting_NA_solved[(dummy_ci_df_conflicting_NA_solved['scitcem_p'] < 0.5)]['p_cnv'].hist()
plt.xlim(0,1)

In [None]:
dummy_ci_df_conflicting_NA_solved[(dummy_ci_df_conflicting_NA_solved['scitcem_p'] < 0.5) &
                                 (dummy_ci_df_conflicting_NA_solved['p_cnv'] > 0.5)].shape

In [None]:
dummy_ci_df_conflicting_NA_solved_high_scitcem = dummy_ci_df_conflicting_NA_solved[(dummy_ci_df_conflicting_NA_solved['scitcem_p'] > 0.5)]

In [None]:
dummy_ci_df_conflicting_NA_solved_high_scitcem

dummy_ci_df_conflicting_NA_solved_high_scitcem.to_csv(Path(new_data_folder)/'conflicting_cell_identity_scitcem_high.txt')

In [None]:
646/2562

### After removing the Scitcem == 0.5 cells

In [None]:
# number of alt
# One figure per tumour sample for the conflicting cells with scitcem_p != 0.5:
# p_cnv on the left axis, scitcem_p (firebrick) and filter_alt (orange) on twin axes.
for i in tsamples:
    plt.figure(figsize = (6,2))
    ax1 = plt.subplot()
    ax2, ax3 = ax1.twinx(), ax1.twinx()

    in_sample = dummy_ci_df_conflicting_NA_solved['sample'] == i
    df = dummy_ci_df_conflicting_NA_solved[in_sample].reset_index().sort_values(['scitcem_p', 'p_cnv'])

    # Same draw order as before: p_cnv, then scitcem_p, then filter_alt.
    for axis, column, point_style in (
        (ax1, 'p_cnv', {'s': 10}),
        (ax2, 'scitcem_p', {'color': 'firebrick', 's': 10}),
        (ax3, 'filter_alt', {'color': 'orange', 's': 8}),
    ):
        sns.scatterplot(data = df, x = 'index', y = column, ax = axis, **point_style)

    plt.title(f'{i} ({df.shape[0]} cells)')

    # Only the two probability axes get fixed limits.
    for axis in (ax1, ax2):
        axis.set_ylim(-0.1, 1.1)

    ax1.set_xticklabels('')
    ax1.set_xticks([])
    ax1.set_ylabel('probability')

    # Hide the scitcem_p axis decorations.
    ax2.set_yticklabels('')
    ax2.set_yticks([])
    ax2.set_ylabel('')

In [None]:
# number of dp
# One figure per tumour sample for the conflicting cells with scitcem_p != 0.5:
# p_cnv on the left axis, scitcem_p (firebrick) and filter_dp (orange) on twin axes.
for i in tsamples:
    plt.figure(figsize = (6,2))
    ax1 = plt.subplot()
    ax2, ax3 = ax1.twinx(), ax1.twinx()

    in_sample = dummy_ci_df_conflicting_NA_solved['sample'] == i
    df = dummy_ci_df_conflicting_NA_solved[in_sample].reset_index().sort_values(['scitcem_p', 'p_cnv'])

    # Same draw order as before: p_cnv, then scitcem_p, then filter_dp.
    for axis, column, point_style in (
        (ax1, 'p_cnv', {'s': 10}),
        (ax2, 'scitcem_p', {'color': 'firebrick', 's': 10}),
        (ax3, 'filter_dp', {'color': 'orange', 's': 8}),
    ):
        sns.scatterplot(data = df, x = 'index', y = column, ax = axis, **point_style)

    plt.title(f'{i} ({df.shape[0]} cells)')

    # Only the two probability axes get fixed limits.
    for axis in (ax1, ax2):
        axis.set_ylim(-0.1, 1.1)

    ax1.set_xticklabels('')
    ax1.set_xticks([])
    ax1.set_ylabel('probability')

    # Hide the scitcem_p axis decorations.
    ax2.set_yticklabels('')
    ax2.set_yticks([])
    ax2.set_ylabel('')

In [None]:
# mean_VAF
# One figure per tumour sample for the conflicting cells with scitcem_p != 0.5:
# p_cnv on the left axis, scitcem_p (firebrick) and mean_VAF (orange) on twin axes.
for i in tsamples:
    plt.figure(figsize = (6,2))
    ax1 = plt.subplot()
    ax2, ax3 = ax1.twinx(), ax1.twinx()

    in_sample = dummy_ci_df_conflicting_NA_solved['sample'] == i
    df = dummy_ci_df_conflicting_NA_solved[in_sample].reset_index().sort_values(['scitcem_p', 'p_cnv'])

    # Same draw order as before: p_cnv, then scitcem_p, then mean_VAF.
    for axis, column, point_style in (
        (ax1, 'p_cnv', {'s': 10}),
        (ax2, 'scitcem_p', {'color': 'firebrick', 's': 10}),
        (ax3, 'mean_VAF', {'color': 'orange', 's': 8}),
    ):
        sns.scatterplot(data = df, x = 'index', y = column, ax = axis, **point_style)

    plt.title(f'{i} ({df.shape[0]} cells)')

    # All three axes share the same fixed limits here.
    for axis in (ax1, ax2, ax3):
        axis.set_ylim(-0.1, 1.1)

    ax1.set_xticklabels('')
    ax1.set_xticks([])
    ax1.set_ylabel('probability')

    # Hide the scitcem_p axis decorations.
    ax2.set_yticklabels('')
    ax2.set_yticks([])
    ax2.set_ylabel('')

In [None]:
dummy_ci_df_conflicting_NA.index

In [None]:
# Cells listed in dummy_ci_df_conflicting_NA are relabelled 'no observed variant';
# every other cell keeps its tumour_normal_normal label. Stored as a categorical.
is_unresolved = adata_epi.obs.index.isin(dummy_ci_df_conflicting_NA.index)
resolved_labels = np.where(is_unresolved, 'no observed variant', adata_epi.obs['tumour_normal_normal'])
adata_epi.obs['cell_identity_NA_resolved'] = pd.Categorical(resolved_labels)

In [None]:
adata_epi.obs['cell_identity_NA_resolved'].value_counts()

### Check the cellrank ipynb for the following analysis

#### try diffmap

In [None]:
sc.tl.diffmap(adata_epi, n_comps=15)

In [None]:
# try different components 
scv.pl.scatter(adata_epi, basis='diffmap', color=['Uhlitz_scANVI'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc=None, size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['normal_vs_MS'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc=None, size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['cell_identity'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc='right', size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['tumour_normal_normal'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc=None, size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['iCMS_scANVI'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc=None, size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['BMP4'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc=None, size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['TGFB1'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=2, dpi=300, legend_loc=None, size = 2,
               title = ['DC 1,2', 'DC 2,3', 'DC 1,3', 'DC 2,4', 'DC 1,4'])

In [None]:
adata_epi.obsm['X_diffmap'].shape

### a force-directed graph

In [None]:
adata_copy = adata_epi.copy()

In [None]:
sc.tl.draw_graph(adata_copy)

In [None]:
sc.pl.draw_graph(adata_copy, color='Uhlitz_scANVI')

#### try denoise

In [None]:
# Rebuild the neighbour graph of the copy on the diffusion-map representation
# (20 neighbours). Original note on the outcome: looks awful.
sc.pp.neighbors(adata_copy, n_neighbors=20, use_rep='X_diffmap')

In [None]:
sc.tl.draw_graph(adata_copy)

In [None]:
sc.pl.draw_graph(adata_copy, color='Uhlitz_scANVI')

#### PAGA

In [None]:
sc.tl.paga(adata_epi, groups='leiden')

In [None]:
sc.pl.paga(adata_epi, color=['louvain', 'leiden', 'Uhlitz_scANVI'], node_size_scale=3)

In [None]:
# check them
sc.pl.paga(adata_epi, color=['TFF3', 'FABP1', 'ATOH1', 'MKI67', 'KRT19', 'KRT14', 'KRT20', 'AXIN2', 
                             'ALDOA', 'ETV4', 'TRPM2', 'EREG', 'EGR1', 'OLFM4', 'LGR5', 'EPHB2'])


In [None]:
sc.pl.paga(adata_epi, color=['TGFB1', 'BMP2', 'BMP4', 'CTNNB1', 'SMAD4', 'TP53'])

In [None]:
sc.tl.paga(adata_epi, groups='Uhlitz_scANVI')

In [None]:
sc.pl.paga(adata_epi, color=['louvain', 'leiden', 'Uhlitz_scANVI'], node_size_scale=3)

In [None]:
sc.tl.draw_graph(adata_epi, init_pos='paga')

In [None]:
sc.pl.draw_graph(adata_epi, color=['louvain', 'leiden', 'Uhlitz_scANVI'])

In [None]:
# diffusion pseudotime e.g. DC2 > stem cells 0  to enterocytes 1 > visual 
# tumour cells before the stem cells > <0
# normal vs tumour normal cells > differentiation vs dedifferentiation 

In [None]:
sample_color_map = dict(zip(adata_epi.obs['Uhlitz_scANVI'].cat.categories, adata_epi.uns['Uhlitz_scANVI_colors']))

In [None]:
# Per-cell colour looked up from the cell-type -> colour dictionary built from the
# categories of 'Uhlitz_scANVI' and the matching entries of uns['Uhlitz_scANVI_colors'].
# `.map` is used instead of `.replace`: the column is categorical, and replacing
# values of a categorical to rename its categories is deprecated in pandas.
row_color = adata_epi.obs['Uhlitz_scANVI'].map(sample_color_map)

### CytoTRACE pseudotime


In [None]:
# format hack
# The expression matrix is registered under both the "spliced" and the
# "unspliced" layer names before scVelo's moment computation is run;
# presumably scv.pp.moments looks for these two layers — TODO confirm.
# NOTE(review): both layers are assigned the same `adata_epi.X` object (no copy
# is made here) — confirm nothing downstream modifies a layer in place.
adata_epi.layers["spliced"] = adata_epi.X
adata_epi.layers["unspliced"] = adata_epi.X

# calculate 
# Moments with 15 principal components and 20 neighbours.
scv.pp.moments(adata_epi, n_pcs=15, n_neighbors=20)

In [None]:
ctk = CytoTRACEKernel(adata_epi).compute_cytotrace()

In [None]:
ctk.compute_transition_matrix(threshold_scheme="soft", nu=0.5)

In [None]:
ctk.plot_projection(basis="diffmap", recompute=False, density = 1, legend_loc = 'right', color = 'Uhlitz_scANVI',
                   stream = True,
                   size = 2, alpha = 1, components = '1,2')

### dpt pseudotime

In [None]:
# try DC
# Choose a root cell for diffusion pseudotime.

adata_epi[adata_epi.obs['Uhlitz_scANVI'] == 'Stem'].obsm['X_diffmap'][:, 2].argmin()

In [None]:
adata_epi[adata_epi.obs['Uhlitz_scANVI'] == 'Stem'].obsm['X_diffmap'][:, 3].argmin()

In [None]:
adata_epi[adata_epi.obs['Uhlitz_scANVI'] == 'Stem'][1449].obs.index

In [None]:
row_number_to_index = adata_epi.obs.reset_index(inplace=False)['index']

In [None]:
row_number_to_index[row_number_to_index == 'p013t:CTCATCGCAGTCCGTG']

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['Uhlitz_scANVI', 32999], 
               components = ['1,2'],
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
# Root cell for diffusion pseudotime, addressed by barcode instead of by a fixed
# row number so that the choice survives any filtering or re-ordering of cells.
# `get_loc` raises KeyError when the barcode is absent, and `int()` fails loudly
# if the barcode were not unique.
# NOTE(review): the previous value was the literal row 32999; the barcode is the
# one whose row number is looked up a few cells earlier — confirm both refer to
# the same cell.
ROOT_CELL_BARCODE = 'p013t:CTCATCGCAGTCCGTG'
adata_epi.uns['iroot'] = int(adata_epi.obs_names.get_loc(ROOT_CELL_BARCODE))

#### run dpt with branching or without

In [None]:
sc.tl.dpt(adata_epi, n_branchings = 1, n_dcs = 7)
# DC1-5 cannot run

In [None]:
sc.pl.dpt_groups_pseudotime(adata_epi)

In [None]:
# Independent copy holding only the cells labelled 'normal_sample' or
# 'genomically_normal' in tumour_normal_normal.
normal_labels = ['normal_sample', 'genomically_normal']
adata_normal = adata_epi[adata_epi.obs['tumour_normal_normal'].isin(normal_labels)].copy()

In [None]:
adata_epi.obs['dpt_groups'].value_counts()

In [None]:
scv.pl.scatter(adata_epi, basis='umap', color=['dpt_pseudotime', 'dpt_groups'], 
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
scv.pl.scatter(adata_normal, basis='umap', color=['dpt_pseudotime', 'dpt_groups'], 
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['dpt_pseudotime', 'dpt_groups'], 
               components = ['1,2'],
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
adata_normal.obs['dpt_groups'].value_counts()

In [None]:
scv.pl.scatter(adata_normal, basis='diffmap', color=['dpt_pseudotime', 'dpt_groups'], 
               components = ['1,2'],
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

subset = adata_epi[:, adata_epi.var["highly_variable"]]

sc.pl.dpt_timeseries(subset)

In [None]:
# DC 2 + other to visualise

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['dpt_pseudotime'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['dpt_groups'], 
               components = ['1,2', '2,3', '1,3', '2,4', '1,4'],
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
sc.pl.violin(adata_epi, keys=["dpt_pseudotime"], groupby="Uhlitz_scANVI", rotation=90)

In [None]:
scv.pl.scatter(adata_epi, basis='diffmap', color=['Uhlitz_scANVI', 'dpt_pseudotime', 'ct_pseudotime'], 
               components = ['1,2'],
               ncols=3, dpi=300, legend_loc='right margin', size = 2)

In [None]:
# heatmap pseudotime vs normal normal, normal tumour, tumour vs cell type
with plt.rc_context({'ytick.major.size':16}):
    sc.pl.heatmap(adata_epi, ['dpt_pseudotime','ct_pseudotime'],
              groupby='tumour_normal_normal', swap_axes=True, figsize=[16,4])


In [None]:
with plt.rc_context({'ytick.major.size':16}):
    sc.pl.heatmap(adata_epi, ['dpt_pseudotime','ct_pseudotime'],
              groupby='Uhlitz_scANVI', swap_axes=True, figsize=[16,4])


In [None]:
# Pseudotime kernel built on the 'dpt_pseudotime' column, followed by its
# transition matrix (the kernel class is the one imported at the top of the notebook).
pk = PseudotimeKernel(adata_epi, time_key="dpt_pseudotime")
pk.compute_transition_matrix()


In [None]:
pk.plot_projection(basis="umap", recompute=False, density = 1, legend_loc = 'right', color = 'Uhlitz_scANVI',
                  size = 2, alpha = 1)

In [None]:
# could work if remove the Tuft and the number 
# desity of the string 

In [None]:
# something breaks when more than one components
pk.plot_projection(basis="diffmap", recompute=False, density = 1, legend_loc = 'right', color = 'Uhlitz_scANVI',
                   stream = True,
                   size = 2, alpha = 1, components = '1,2')

In [None]:
# something breaks when more than one components
pk.plot_projection(basis="diffmap", recompute=False, density = 1, legend_loc = 'right', color = 'Uhlitz_scANVI',
                   stream = True,
                   size = 2, alpha = 1, components = '2,3')

In [None]:
# something breaks when more than one components
pk.plot_projection(basis="diffmap", recompute=False, density = 1, legend_loc = 'right', color = 'Uhlitz_scANVI',
                   stream = True,
                   size = 2, alpha = 1, components = '1,3')

In [None]:
# something breaks when more than one components
pk.plot_projection(basis="diffmap", recompute=False, density = 1, legend_loc = 'right', color = 'Uhlitz_scANVI',
                   stream = True,
                   size = 2, alpha = 1, components = '1,4')

In [None]:
# something breaks when more than one components
pk.plot_projection(basis="draw_graph_fa", recompute=False, density = 2, legend_loc = 'right', 
                   color = 'Uhlitz_scANVI',
                   stream = True,
                   size = 2, alpha = 1)

### Score different pathways


In [None]:
CRC_sig = pd.read_excel("../datasets_new_preprocessing/Signatures_Single_cells.xlsx", 
                        skiprows=[1], index_col=None, sheet_name = 'fixed_var_name')


In [None]:
CRC_sig.shape

### lots of var_names to be fixed!!!

- WARNING: genes are not in var_names and ignored: ['G6PC1']
- WARNING: genes are not in var_names and ignored: ['MPTX1', 'ANG4', 'BHLHB8', 'C4BP', 'CCL6', 'CCL9', 'DEFA1', 'DEFCR20', 'DEFCR6', 'PNLIPRP2']
- WARNING: genes are not in var_names and ignored: ['BLR1', 'CTSL1', 'GNB2L1', 'IL8', 'IL8RB', 'UGCGL1']
- WARNING: genes are not in var_names and ignored: ['C17ORF97', 'C1ORF116', 'C3ORF52', 'MIR22']
- WARNING: genes are not in var_names and ignored: ['CCDC46', 'FAM64A', 'LOC285141']
- WARNING: genes are not in var_names and ignored: ['OSTALPHA']
- WARNING: genes are not in var_names and ignored: ['ARMCX5-GPRASP2', 'CASP12', 'SYCN']
- WARNING: genes are not in var_names and ignored: ['GGTA1', 'LY6A', 'LY6C1']
- WARNING: genes are not in var_names and ignored: ['C19ORF33', 'C1ORF116']
- WARNING: genes are not in var_names and ignored: ['AC087721.2', 'AC087721.2']
- WARNING: genes are not in var_names and ignored: ['AIRN ', 'ANG4 ', 'ATP5G2 ', 'AY761184 ', 'BAALC ', 'BCL6 ', 'BOC ', 'C76336 ', 'CAD ', 'CBX1 ', 'CCDC55 ', 'CCDC66 ', 'CCDC91 ', 'CCL27A ', 'CD44 ', 'CD86 ', 'CENPB ', 'CEP97 ', 'CLEC2I ', 'CLU ', 'CNNM1 ', 'CSNK1E ', 'CWF19L2 ', 'D16ERTD472E ', 'DDX24 ', 'DEFA21', 'DEFA22', 'DEFA-RS1 ', 'DGKD ', 'DYNLT1F ', 'EDN1 ', 'EIF4G1 ', 'EMP2 ', 'EPHA4 ', 'EXOC6B ', 'EXOSC10 ', 'EXT2 ', 'FAM115A ', 'FAM222A ', 'FANCG ', 'FGFR4 ', 'FGFRL1 ', 'FOXN3 ', 'GM10071', 'GM15247 ', 'GM16740 ', 'GM17250 ', 'GM8096', 'GM996 ', 'GTF2F1 ', 'HMGCS2 ', 'HS2ST1 ', 'HSD17B14 ', 'IFT27 ', 'IGF1R ', 'INPP5F ', 'KCNQ1OT1 ', 'KDM5A ', 'KIF12 ', 'KIF1C ', 'KIF2A ', 'LAMC1 ', 'LBH ', 'LIMK2 ', 'LOC101243624 ', 'LPHN1 ', 'LYZ1 ', 'LYZ2 ', 'MAD2L2 ', 'MAN2A2 ', 'MBD4 ', 'MED24 ', 'MFGE8 ', 'MID1 ', 'MSI1 ', 'MTR ', 'MYL6 ', 'NAV2 ', 'NOLC1 ', 'NR2E3 ', 'PACS1 ', 'PDAP1 ', 'PDE7A ', 'PHF14 ', 'PHF3 ', 'PHGDH ', 'PITPNC1 ', 'PNLIPRP2 ', 'PPIG ', 'PTPRS ', 'RAD50 ', 'RASSF10 ', 'RGS12 ', 'RPRD1A ', 'RSRC2 ', 'SERF1 ', 'SIRT5 ', 'SLC29A1 ', 'SLC41A1 ', 'SLC44A2 ', 'SORCS2 ', 'SOX4 ', 'SPICE1 ', 'STK3 ', 'TCF3 ', 'TCTN3 ', 'TLR2 ', 'TLX2 ', 'TMEM107 ', 'TMEM132A ', 'TMEM206 ', 'TNFRSF19 ', 'TRIM44 ', 'TYROBP ', 'UBN1 ', 'WWOX ', 'ZDHHC14 ', 'ZFP316 ', 'ZFP553 ', 'ZFP703 ', 'ZFP787 ', 'ZFP865 ', 'ZSCAN2 ']
- WARNING: genes are not in var_names and ignored: ['FAM46A', 'IGF2', 'NGFRAP1', 'PVRL1', 'RP11-401P9.4']
- WARNING: genes are not in var_names and ignored: ['FAM64A', 'SGOL1', 'SGOL2', 'KIAA0101']

The Paneth-Sato and Barriga et al. signatures are from mouse...
- CCL6 and CCL9 exist only in mice... the human ortholog is CCL15
- DEFA1, DEFCR6 and DEFCR20 have no human orthologs
- LY6A and LY6C1 are mouse genes
- CCL27A > human CCL27

In [None]:
# Score every signature (one column of CRC_sig per gene set) on adata_epi; the
# score is stored under the column name.
for i in CRC_sig.columns:
    # Strip stray whitespace from the gene symbols before scoring: entries such
    # as 'AIRN ' do not match var_names and are otherwise ignored.
    genes = CRC_sig[i].dropna().astype(str).str.strip()
    genes = genes[genes != '']
    score_genes(adata_epi, np.array(genes), score_name=i)

In [None]:
new_order = ['TC1', 'TC2', 'TC3', 'TC4', 'Stem', 'Stem/TA',
             'Immature Goblet', 'Goblet',
             'Enterocyte progenitor', 'Enterocytes', 'Tuft']

In [None]:
# One heatmap per signature: median score for each tumour_normal_normal group
# (rows) and each cell type (columns, in `new_order`).
for i in CRC_sig.columns:
    with plt.rc_context({'ytick.major.size':16, 'axes.titlesize':18}):
        plt.figure(figsize = (10,5))
        median_scores = adata_epi.obs.pivot_table(
            index='tumour_normal_normal',
            columns='Uhlitz_scANVI',
            values=i,
            aggfunc='median',
        )
        # fixed colour limits left switched off: vmin = -0.015, vmax = 0.015
        sns.heatmap(median_scores[new_order], square=True)
        plt.title(f'{i} (median)\n')
        plt.ylabel('Numbat and Scitcem assignment\n', size = 18)
        plt.xlabel('Cell type',size = 18)

In [None]:
# One heatmap per signature: mean score for each tumour_normal_normal group
# (rows) and each cell type (columns, in `new_order`).
for i in CRC_sig.columns:
    with plt.rc_context({'ytick.major.size':16, 'axes.titlesize':18}):
        plt.figure(figsize = (10,5))
        mean_scores = adata_epi.obs.pivot_table(
            index='tumour_normal_normal',
            columns='Uhlitz_scANVI',
            values=i,
            aggfunc='mean',
        )
        # fixed colour limits left switched off: vmin = -0.015, vmax = 0.015
        sns.heatmap(mean_scores[new_order], square=True)
        plt.title(f'{i} (mean)\n')
        plt.ylabel('Numbat and Scitcem assignment\n', size = 18)
        plt.xlabel('Cell type',size = 18)