In [None]:
import os
import sys
# Report the interpreter and the working directory so the relative data paths
# used further down can be checked against where the kernel is running.
# The working directory is printed explicitly: a bare `os.getcwd()` in the
# middle of a cell is evaluated and discarded, so it was never shown.
print("Python version " + sys.version)
print(os.getcwd())
print(sys.executable)

In [None]:
import numpy as np
np.random.seed(123)
import pandas as pd
import scipy
import itertools

import umap
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scanpy as sc
import anndata as ad
import scvelo as scv
from tqdm.notebook import tqdm

from pathlib import Path

In [None]:
from IPython.display import clear_output

In [None]:
from muon import prot as pt
from joblib import dump, load

In [None]:
import scrublet as scr

In [None]:
# Scanpy session setup: logging verbosity, package-version header, and the
# default figure parameters used by subsequent plots.
sc.settings.verbosity = 1
sc.logging.print_header()
sc.settings.set_figure_params(
    facecolor="white",
    dpi=80,
)

In [7]:
# Turn the axes grid off globally (the grid is attributed to scvelo's defaults
# by the original note).
plt.rcParams["axes.grid"] = False

In [8]:
import itertools

In [9]:
from vcf_functions import *

In [10]:
from CCISM import *

### data input and variables

In [12]:
# Input data directory, relative to the notebook's working directory.
data_folder = "../data"

In [13]:
# Directory holding processed inputs and receiving this notebook's outputs.
new_data_folder = "../processed_data"

In [14]:
# Tumour sample identifiers (suffix 't', optionally followed by a replicate number).
tsamples = [
    "p007t",
    "p008t",
    "p009t1",
    "p009t2",
    "p013t",
    "p014t",
    "p016t",
    "p020t",
    "p021t",
    "p026t",
    "p035t",
]

In [15]:
# All samples: the normal samples (suffix 'n') first, followed by every tumour
# sample from `tsamples`.
samples = [
    "p007n",
    "p008n",
    "p009n1",
    "p009n2",
    "p013n",
    "p014n",
    "p016n",
    "p020n",
    "p021n",
    *tsamples,
]

### data

In [None]:
# Load the AnnData object that every later cell annotates.
adata_epi = sc.read(Path(new_data_folder).joinpath("CB_epi_cells_iCMS.h5"))

### CCISM calls: run in this notebook, or use `script/run_CCISM.sh`

In [71]:
# Empty accumulators for the cellSNP/CCISM loop.
# snv_sc: one row per sample (donor, sample id, tumour/normal flag and the mean
# alt / total read counts per cell).
snv_sc = pd.DataFrame(columns=['donor', 'sample', 't/n',
                               'alt_per_cell', 'dp_per_cell',
                              ])
# filter_var: one row per cell. The call / probability columns are named
# CCISM_call / CCISM_p because those are the names the per-cell tables are
# written with and the names read back when annotating adata_epi.obs; the
# former scitcem_* names only produced two extra all-NaN columns.
filter_var = pd.DataFrame(columns=['cell_id', 'filter_dp', 'filter_alt',
                                   'mean_VAF', 'CCISM_call', 'CCISM_p'])

In [None]:
# For every sample that has cellSNP output: read the allele-count matrices,
# record per-sample and per-cell read-count summaries and, for tumour samples
# only, run CCISM to call each cell. Rows are gathered in lists and
# concatenated once after the loop instead of growing the DataFrames on every
# iteration.
sample_summaries = []
per_cell_tables = []

for sample in samples:
    cellsnp_path = os.path.join(new_data_folder, 'cellSNP', sample)
    if not os.path.exists(cellsnp_path):
        # No cellSNP output for this sample: skip it without a message.
        continue

    print(sample + ' exists')
    cell, AD, DP, cellsnp_vcf = read_cellsnp(cellsnp_path)

    # Densify each matrix once and reuse it. Summing over axis 0 gives one
    # value per column -- presumably one per cell; confirm against read_cellsnp.
    ad_dense = AD.toarray()
    dp_dense = DP.toarray()
    alt_counts = np.nansum(ad_dense, axis=0)
    depth_counts = np.nansum(dp_dense, axis=0)

    # per sample average
    sample_summaries.append({'donor': sample[:4],
                             'sample': sample,
                             't/n': sample[4:5],
                             'alt_per_cell': np.nanmean(alt_counts),
                             'dp_per_cell': np.nanmean(depth_counts)})

    # per-cell table: alt reads, total reads and mean variant allele frequency
    alt = pd.DataFrame({'filter_alt': alt_counts})
    # cell ids are '<sample>:<barcode>' with the '-1' marker removed
    alt['cell_id'] = sample + ":" + cell
    alt['cell_id'] = alt['cell_id'].str.replace('-1', '', regex=False)
    alt['filter_dp'] = depth_counts
    # Positions without coverage give 0/0 = NaN, which nanmean ignores; only
    # the element-wise division warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        alt['mean_VAF'] = np.nanmean(ad_dense / dp_dense, axis=0)

    if sample[4:5] == 'n':
        # skip normal samples: no CCISM run, fixed label, CCISM_p stays missing
        alt['CCISM_call'] = 'normal_sample'
    else:
        CCISM_em = CCISM.EMoptimize(AD, DP, 0.4, 1.e-4)
        # Assign the labels in a single step. The previous
        # `alt['CCISM_call'].replace(..., inplace=True)` modified an
        # intermediate Series, which does not update `alt` under pandas
        # Copy-on-Write.
        alt['CCISM_call'] = np.where(CCISM_em['p'] > .5,
                                     "genomically tumour\n(tumour sample)",
                                     "genomically normal\n(tumour sample)")
        alt['CCISM_p'] = CCISM_em['p']

        # plot distribution of p
        fig, ax = plt.subplots(figsize=(6, 3))
        ax.hist(CCISM_em['p'])
        ax.set_title('CCISM p value distribution of ' + sample)

    per_cell_tables.append(alt)

# sum into one df
snv_sc = pd.concat([snv_sc, pd.DataFrame(sample_summaries)], ignore_index=True)
filter_var = pd.concat([filter_var, *per_cell_tables], ignore_index=True)

In [124]:
# Index the per-cell table by cell id. verify_integrity=True makes pandas raise
# if a cell id occurs more than once; `filter_var` itself is left unchanged
# because set_index returns a new frame.
filter_var_copy = filter_var.set_index(
    keys="cell_id",
    drop=True,
    verify_integrity=True,
)

In [None]:
# Persist the per-cell table for all processed samples, before it is restricted
# to the cells present in adata_epi.
filter_var_copy.to_csv(Path(new_data_folder).joinpath("CCISM_all_output.csv"))

In [127]:
# Align rows to the cells (and row order) of adata_epi.obs; cells without an
# entry in the table become all-NaN rows.
filter_var_copy = filter_var_copy.reindex(index=adata_epi.obs.index)

In [219]:
# Copy the per-cell read counts, mean VAF and CCISM results into the cell
# annotation table.
adata_epi.obs[
    ["filter_dp", "filter_alt", "CCISM_call", "mean_VAF", "CCISM_p"]
] = filter_var_copy[
    ["filter_dp", "filter_alt", "CCISM_call", "mean_VAF", "CCISM_p"]
]

In [222]:
# Store the CCISM call as a categorical column.
adata_epi.obs["CCISM_call"] = adata_epi.obs["CCISM_call"].astype(dtype="category")

In [223]:
# Fix the category order: tumour call, normal call, then cells from normal samples.
adata_epi.obs["CCISM_call"] = adata_epi.obs["CCISM_call"].cat.reorder_categories(
    new_categories=[
        "genomically tumour\n(tumour sample)",
        "genomically normal\n(tumour sample)",
        "normal_sample",
    ]
)

In [224]:
# Colours for the CCISM_call plot -- presumably matched positionally to the
# category order of obs['CCISM_call']; confirm against the plotting library.
adata_epi.uns["CCISM_call_colors"] = [
    "#ff7f0e",
    "#1f77b4",
    "#d3d3d3",
]

In [None]:
# UMAP coloured by the CCISM call and by the CCISM probability, one panel per row.
scv.pl.scatter(
    adata_epi,
    basis="umap",
    color=["CCISM_call", "CCISM_p"],
    ncols=1,
    dpi=300,
    legend_loc="right margin",
    size=2,
)

### Numbat
The input table is generated by `script/numbatcollect_numbat_result.ipynb`.

In [135]:
# Combined Numbat output; the first CSV column becomes the index.
numbat_all = pd.read_csv(
    Path(new_data_folder) / "numbat/numbat_all_output_clone_post_combined9.csv",
    index_col=0,
)

In [137]:
# Align the Numbat table to the cells (and row order) of adata_epi.obs; cells
# absent from the table become all-NaN rows.
numbat_results_copy = numbat_all.reindex(index=adata_epi.obs.index)

In [235]:
# Create the 'numbat' annotation column with no value for any cell yet.
adata_epi.obs["numbat"] = None

In [234]:
# Display the category labels of 'sample_origin'; the next cell compares this
# column against the label 'normal'.
adata_epi.obs['sample_origin'].cat.categories

Index(['normal', 'tumour'], dtype='object')

In [236]:
# Label every cell with its Numbat compartment ('compartment_opt'); cells from
# normal samples get the fixed label 'normal_sample' instead.
#
# Done as one vectorised assignment: the earlier per-row form
# `adata_epi.obs['numbat'][i] = ...` wrote through an intermediate Series
# (chained assignment), which is not propagated to the DataFrame under pandas
# Copy-on-Write, and relied on positional `[i]` access to a label-indexed
# Series, which is deprecated.
#
# numbat_results_copy carries the same index as adata_epi.obs (it was reindexed
# to it), so the boolean mask lines up row by row.
from_normal_sample = (adata_epi.obs['sample_origin'] == 'normal').to_numpy()
adata_epi.obs['numbat'] = (
    numbat_results_copy['compartment_opt']
    .astype(object)
    .mask(from_normal_sample, 'normal_sample')
)

In [238]:
# Copy the remaining Numbat per-cell columns into the cell annotation table.
adata_epi.obs[
    ["clone_opt", "GT_opt", "p_cnv"]
] = numbat_results_copy[
    ["clone_opt", "GT_opt", "p_cnv"]
]

### inferCNV

In [241]:
# Tab-separated inferCNV clone scores.
infercnv_result = pd.read_csv(
    Path(new_data_folder) / "inferCNV/infercnv_clone_scores.tsv",
    sep="\t",
)

In [244]:
# Index the inferCNV table by cell id.
infercnv_result = infercnv_result.set_index(keys="cell_id")

In [245]:
# Align the inferCNV table to the cells (and row order) of adata_epi.obs; cells
# absent from the table become all-NaN rows.
infercnv_result = infercnv_result.reindex(index=adata_epi.obs.index)

In [None]:
# Copy the inferCNV score and clone assignments into the cell annotation table.
adata_epi.obs[
    ["cna_score", "cna_clone", "clone"]
] = infercnv_result[
    ["cna_score", "cna_clone", "clone"]
]

In [311]:
# Create the 'inferCNV_result' annotation column with no value for any cell yet.
adata_epi.obs["inferCNV_result"] = None

In [312]:
# Derive 'inferCNV_result' from 'cna_clone': the clone label 'Normal' is
# replaced by 'normal_sample'; every other value (including missing ones) is
# carried over unchanged.
#
# Done as one vectorised assignment: the earlier per-row form
# `adata_epi.obs['inferCNV_result'][i] = ...` wrote through an intermediate
# Series (chained assignment), which is not propagated to the DataFrame under
# pandas Copy-on-Write, and relied on deprecated positional `[i]` access.
cna_clone = adata_epi.obs['cna_clone']
adata_epi.obs['inferCNV_result'] = (
    cna_clone
    .astype(object)
    .mask((cna_clone == 'Normal').to_numpy(), 'normal_sample')
)

### fix colors and UMAP

In [254]:
# Replace the 'sample_origin' category labels with multi-line versions.
# Later comparisons against this column must use the new labels.
adata_epi.obs["sample_origin"] = adata_epi.obs["sample_origin"].cat.rename_categories(
    {
        "normal": "\nnormal\nsample\n",
        "tumour": "tumour\nsample",
    }
)

In [328]:
# Colours for the sample_origin plot -- presumably matched positionally to its
# category order (normal, tumour); confirm against the plotting library.
adata_epi.uns["sample_origin_colors"] = [
    "#d3d3d3",
    "#ff7f0e",
]

In [321]:
# Colours for the MS_status plot -- presumably matched positionally to its
# category order; confirm against the plotting library.
adata_epi.uns["MS_status_colors"] = [
    "#D45B6E",
    "#EAA945",
]

In [323]:
# normal vs MSS vs MSI
# Cells from tumour samples keep their MS_status value; all other cells get the
# label 'normal\nsample'. 'sample_origin' is compared against its renamed
# category label 'tumour\nsample'.
#
# Done as one vectorised assignment: the earlier per-row form
# `adata_epi.obs['normal_vs_MS'][i] = ...` wrote through an intermediate Series
# (chained assignment), which is not propagated to the DataFrame under pandas
# Copy-on-Write, and relied on deprecated positional `[i]` access.
from_tumour_sample = (adata_epi.obs['sample_origin'] == 'tumour\nsample').to_numpy()
adata_epi.obs['normal_vs_MS'] = (
    adata_epi.obs['MS_status']
    .astype(object)  # plain objects so a label outside MS_status' categories can be set
    .where(from_tumour_sample, 'normal\nsample')
)

In [324]:
# Colours for the normal_vs_MS plot -- presumably matched positionally to its
# category order; confirm against the plotting library.
adata_epi.uns["normal_vs_MS_colors"] = [
    "#D45B6E",
    "#EAA945",
    "#d3d3d3",
]

In [310]:
# Cast the read-count columns to integers.
# NOTE(review): astype(int) raises if a column still holds NaN, i.e. if any
# cell in adata_epi had no row in the per-cell table -- confirm this cannot happen.
for count_column in ("filter_dp", "filter_alt"):
    adata_epi.obs[count_column] = adata_epi.obs[count_column].astype(int)

In [None]:
# Save the annotated AnnData object (CCISM, Numbat and inferCNV columns added).
adata_epi.write(Path(new_data_folder).joinpath("CB_epi_Numbat_CCISM_inferCNV_iCMS.h5"))