In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import ray
import subprocess

In [None]:
# Random seed passed as `random_state` to triku, PCA, neighbors, UMAP and Leiden below,
# so that repeated runs of the notebook give the same embedding and clustering.
seed = 10

In [None]:
# Output locations for the downloaded/processed data and for the figures.
data_dir = 'data/proietti_2021/'
fig_dir = 'figures/'

# Index directory handed to `loompy fromfq` (the path name suggests mouse GRCm38 / GENCODE v31).
# The location is machine-specific, so it can be overridden with the MOUSE_GENCODE_DIR
# environment variable; the original hard-coded path is kept as the default.
mouse_gencode_dir = os.environ.get(
    'MOUSE_GENCODE_DIR',
    '/media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31',
)

# Create the working directories up front; `tmp` is the scratch directory used by parallel-fastq-dump.
for required_dir in (data_dir, os.path.join(data_dir, 'tmp'), fig_dir):
    os.makedirs(required_dir, exist_ok=True)

In [None]:
# Colormap for gene-expression UMAPs: 80 evenly spaced samples of matplotlib's 'magma',
# with the lowest colour replaced by light grey, truncated to the first 65 samples.
magma_base = plt.get_cmap('magma')
magma_samples = [magma_base(position) for position in np.linspace(0, 1, 80)]
magma_samples[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma_samples[:65])

In [None]:
# Global matplotlib setting: applies to every figure created after this cell runs.
mpl.rcParams['figure.dpi'] = 150  # Set this to make higher quality pictures

In [None]:
def assign_cats(adata, dict_cats, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05,
                key_added='assigned_cats', min_score=0.6, others_name='unassigned'):
    """
    Assign every cluster of cells to one of several marker-defined categories.

    For each category, each marker gene is smoothed over the neighbour graph
    (``connectivities @ expression / n_neighbors``), the positive values are replaced by their
    rank scaled to [0, 1], and the per-cell category score is the mean over the category's genes.
    Scores are then summarised per cluster with a quantile, and each cluster receives the
    category with the highest summarised score. Working per cluster rather than per cell
    makes the assignment less sensitive to the noise of single-cell data.

    Parameters
    ----------
    adata
        Object exposing ``obs``, ``var_names``, ``obsp['connectivities']``,
        ``uns['neighbors']['params']['n_neighbors']`` and ``adata[:, gene].X``
        (an AnnData on which neighbours have been computed).
    dict_cats : dict
        Maps category name -> list of marker gene names.
    column_groupby : str
        Column of ``adata.obs`` holding the cluster labels.
    quantile_gene_sel : float
        Quantile of the per-cell scores used as the score of a cluster.
    do_return : bool
        If True, return the cluster x category score table.
    intermediate_states : bool
        If True, a cluster is labelled with every category whose score is within ``diff``
        of the best one, joined with '/'.
    diff : float
        Tolerance used when ``intermediate_states`` is True.
    key_added : str
        Name of the ``adata.obs`` column receiving the labels; also the prefix of the
        auxiliary columns.
    min_score : float
        Clusters whose best score is below this value are labelled ``others_name``.
    others_name : str
        Label given to clusters that do not reach ``min_score``.

    Returns
    -------
    pandas.DataFrame or None
        The cluster x category score table when ``do_return`` is True, otherwise None.

    Side effects
    ------------
    Writes to ``adata.obs``: one column per category (per-cell score), ``{key_added}``,
    ``{key_added}_std``, ``{key_added}_mean``, ``{key_added}_max``, ``{key_added}_CV``
    (stored as mean / std, i.e. the inverse of the usual coefficient of variation) and
    ``{key_added}_{category}`` (cluster score broadcast to the cells).

    Notes
    -----
    Genes missing from ``adata.var_names`` are reported and contribute a score of 0; they
    still count in the category mean. Any other error is raised instead of being hidden.
    """
    connectivities = adata.obsp['connectivities']
    n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
    cat_names = list(dict_cats.keys())

    for cat in cat_names:
        genes = dict_cats[cat]
        mat_cat = np.zeros((len(adata), len(genes)), dtype=float)

        for gene_idx, gene in enumerate(genes):
            if gene not in adata.var_names:
                print(f'Gene {gene} is not on the list')
                continue

            # `@` is a true matrix product for both sparse and dense expression matrices.
            smoothed = connectivities @ adata[:, gene].X
            if hasattr(smoothed, 'toarray'):
                smoothed = smoothed.toarray()
            column = np.asarray(smoothed, dtype=float).ravel() / n_neighbors

            # Replace positive values by their rank (0 = lowest positive value).
            expressed = column > 0
            column[expressed] = np.argsort(np.argsort(column[expressed]))

            # Scale ranks to [0, 1]; skip when there is nothing to scale to avoid 0/0 -> NaN.
            if column.size and column.max() > 0:
                column /= column.max()

            mat_cat[:, gene_idx] = column

        adata.obs[cat] = mat_cat.mean(axis=1)

    score_per_cluster = adata.obs[[column_groupby] + cat_names].groupby(column_groupby).quantile(quantile_gene_sel)
    cell_clusters = list(adata.obs[column_groupby])

    def _per_cell(cluster_values):
        # Broadcast one value per cluster to one value per cell, in cell order.
        lookup = dict(zip(cluster_values.index, cluster_values.values))
        return [lookup[cluster] for cluster in cell_clusters]

    adata.obs[f'{key_added}_std'] = _per_cell(score_per_cluster.std(axis=1))
    adata.obs[f'{key_added}_mean'] = _per_cell(score_per_cluster.mean(axis=1))
    adata.obs[f'{key_added}_max'] = _per_cell(score_per_cluster.max(axis=1))
    adata.obs[f'{key_added}_CV'] = adata.obs[f'{key_added}_mean'] / adata.obs[f'{key_added}_std']

    for cat in score_per_cluster.columns:
        adata.obs[f'{key_added}_{cat}'] = _per_cell(score_per_cluster[cat])

    if intermediate_states:
        # For each cluster, keep every category whose score is close to the best one and merge the names.
        final_cat_dict = {}
        for cluster in score_per_cluster.index:
            scores_cluster = score_per_cluster.loc[cluster]
            close_to_top = scores_cluster[scores_cluster > scores_cluster.max() - diff]
            final_cat_dict[cluster] = '/'.join(close_to_top.index)
    else:
        top_cat = score_per_cluster.idxmax(axis=1)
        final_cat_dict = dict(zip(top_cat.index, top_cat.values))

    adata.obs[f'{key_added}'] = [str(final_cat_dict[cluster]) for cluster in cell_clusters]

    # Single .loc assignment: chained indexing would write to a temporary copy under pandas copy-on-write.
    low_score = adata.obs[f'{key_added}_max'] < min_score
    adata.obs.loc[low_score, f'{key_added}'] = others_name

    if do_return:
        return score_per_cluster

# Data download

In [None]:
# Run accessions (SRR...) downloaded with parallel-fastq-dump below, grouped by the sample
# label (DEN / CTRL) that is used in the renamed FASTQ files. The position of a run in its
# list determines the lane number (L001, L002, ...) given to its files.
SRA_list_ITGA_muscle_PRJNA626530_DEN = ['SRR11574458', 'SRR11574462', 'SRR11574463', 'SRR11574464']
SRA_list_ITGA_muscle_PRJNA626530_CTRL = ['SRR11574459', 'SRR11574460', 'SRR11574461', 'SRR11574465']

In [None]:
# Sample sheet read by `loompy fromfq` (passed as `metadata.tab`): one row per sample,
# tab-separated, without the DataFrame index.
sample_names = ['PRJNA626530_DEN', 'PRJNA626530_CTRL']
df = pd.DataFrame({
    'name': sample_names,
    'technology': ['10xv3' for _ in sample_names],
    'targetnumcells': [6000 for _ in sample_names],
})

df.to_csv(data_dir + '/metadata.tab', sep='\t', index=False)

In [None]:
# Download every SRA run and rename its paired reads to the
# <sample>_L00<lane>_R<read>_001.fastq.gz names used by the `loompy fromfq` calls.
# - subprocess.run(check=True) stops the notebook if a download fails (os.system ignored exit codes).
# - Runs whose two renamed files already exist are skipped, so re-running the cell is cheap.
fastq_dump_threads = 16
sra_runs_per_sample = {
    'PRJNA626530_DEN': SRA_list_ITGA_muscle_PRJNA626530_DEN,
    'PRJNA626530_CTRL': SRA_list_ITGA_muscle_PRJNA626530_CTRL,
}

for sample_name, sra_runs in sra_runs_per_sample.items():
    for idx, name in enumerate(sra_runs):
        renamed = {read: os.path.join(data_dir, f'{sample_name}_L00{idx+1}_R{read}_001.fastq.gz')
                   for read in (1, 2)}
        if all(os.path.exists(path) for path in renamed.values()):
            continue

        subprocess.run(
            ['parallel-fastq-dump', '-s', name, '-t', str(fastq_dump_threads),
             '--split-files', '-O', '.', '--tmpdir', 'tmp', '--gzip'],
            cwd=data_dir, check=True,
        )
        for read, target in renamed.items():
            os.replace(os.path.join(data_dir, f'{name}_{read}.fastq.gz'), target)

In [None]:
# Show the `loompy fromfq` command line for the DEN sample.
# NOTE(review): the string contains literal newlines; a shell treats each line as a separate
# command, so the lines must be joined before pasting this into a terminal.
print(f'''cd {data_dir} && loompy fromfq PRJNA626530_DEN.loom PRJNA626530_DEN {mouse_gencode_dir} metadata.tab 
      PRJNA626530_DEN_L001_R1_001.fastq.gz PRJNA626530_DEN_L001_R2_001.fastq.gz 
      PRJNA626530_DEN_L002_R1_001.fastq.gz PRJNA626530_DEN_L002_R2_001.fastq.gz 
      PRJNA626530_DEN_L003_R1_001.fastq.gz PRJNA626530_DEN_L003_R2_001.fastq.gz 
      PRJNA626530_DEN_L004_R1_001.fastq.gz PRJNA626530_DEN_L004_R2_001.fastq.gz''')

In [None]:
# Run `loompy fromfq` for the DEN sample inside data_dir.
# The command is passed as an argument list: the previous multi-line shell string contained
# newlines, so the shell ended the loompy command after `metadata.tab` and never passed the FASTQs.
# check=True raises if loompy exits with an error.
fastqs_DEN = [f'PRJNA626530_DEN_L00{lane}_R{read}_001.fastq.gz'
              for lane in range(1, 5) for read in (1, 2)]
subprocess.run(
    ['loompy', 'fromfq', 'PRJNA626530_DEN.loom', 'PRJNA626530_DEN', mouse_gencode_dir, 'metadata.tab',
     *fastqs_DEN],
    cwd=data_dir, check=True,
)

In [None]:
# Show the `loompy fromfq` command line for the CTRL sample.
# NOTE(review): the string contains literal newlines; a shell treats each line as a separate
# command, so the lines must be joined before pasting this into a terminal.
print(f'''cd {data_dir} && loompy fromfq PRJNA626530_CTRL.loom PRJNA626530_CTRL {mouse_gencode_dir} metadata.tab 
      PRJNA626530_CTRL_L001_R1_001.fastq.gz PRJNA626530_CTRL_L001_R2_001.fastq.gz 
      PRJNA626530_CTRL_L002_R1_001.fastq.gz PRJNA626530_CTRL_L002_R2_001.fastq.gz 
      PRJNA626530_CTRL_L003_R1_001.fastq.gz PRJNA626530_CTRL_L003_R2_001.fastq.gz 
      PRJNA626530_CTRL_L004_R1_001.fastq.gz PRJNA626530_CTRL_L004_R2_001.fastq.gz''')

In [None]:
# Run `loompy fromfq` for the CTRL sample inside data_dir.
# The command is passed as an argument list: the previous multi-line shell string contained
# newlines, so the shell ended the loompy command after `metadata.tab` and never passed the FASTQs.
# check=True raises if loompy exits with an error.
fastqs_CTRL = [f'PRJNA626530_CTRL_L00{lane}_R{read}_001.fastq.gz'
               for lane in range(1, 5) for read in (1, 2)]
subprocess.run(
    ['loompy', 'fromfq', 'PRJNA626530_CTRL.loom', 'PRJNA626530_CTRL', mouse_gencode_dir, 'metadata.tab',
     *fastqs_CTRL],
    cwd=data_dir, check=True,
)

# Preprocess dataset

In [None]:
# Load the count matrix and make duplicated gene names unique.
# NOTE(review): the cells above only produce PRJNA626530_DEN.loom and PRJNA626530_CTRL.loom;
# no visible cell creates PRJNA626530.loom. Presumably the two looms were combined outside
# this notebook - confirm, or add the combining step so that "Restart & Run All" works.
adata_proietti = sc.read(data_dir+'/PRJNA626530.loom')
adata_proietti.var_names_make_unique()

In [None]:
# Basic QC filtering
# Flag mitochondrial genes (names starting with 'mt-') and compute per-cell / per-gene QC
# metrics in place; the plots below read n_genes_by_counts, total_counts and pct_counts_mt.
adata_proietti.var['mt'] = adata_proietti.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
# percent_top=None: skip the "percentage of counts in the top N genes" metrics.
sc.pp.calculate_qc_metrics(adata_proietti, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual QC before filtering: distributions of genes per cell, counts per cell and
# mitochondrial percentage, one panel each.
sc.pl.violin(adata_proietti, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# Mitochondrial percentage against sequencing depth, per cell.
sc.pl.scatter(adata_proietti, x='total_counts', y='pct_counts_mt')

In [None]:
# Keep cells with at least 250 detected genes, then genes detected in at least 5 cells.
# Both calls modify adata_proietti in place, so re-running this cell filters the already
# filtered object.
sc.pp.filter_cells(adata_proietti, min_genes=250)
sc.pp.filter_genes(adata_proietti, min_cells=5)

In [None]:
# Normalise library size on the raw counts first, then log-transform.
# The previous order (log1p, then normalize_per_cell) rescaled log values instead of counts,
# and made the `n_counts` column written by normalize_per_cell a sum of log values rather
# than the per-cell count depth that is plotted on the UMAP below.
sc.pp.normalize_per_cell(adata_proietti)
sc.pp.log1p(adata_proietti)

In [None]:
# Feature selection with triku, then PCA and the neighbour graph.
tk.tl.triku(adata_proietti, n_procs=1, random_state=seed)
sc.pp.pca(adata_proietti, random_state=seed, n_comps=30)
# Neighbourhood size = floor(sqrt(number of cells) / 2), as an integer.
# It must be passed as `n_neighbors`: `knn` is a boolean switch in sc.pp.neighbors, so the
# previous call silently kept the default number of neighbours.
sc.pp.neighbors(adata_proietti, random_state=seed,
                n_neighbors=int(len(adata_proietti) ** 0.5 // 2), metric='cosine')

In [None]:
# 2-D embedding and clustering on the neighbour graph, both seeded for repeatability.
sc.tl.umap(adata_proietti, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_proietti, resolution=1.5, random_state=seed)
# Clusters labelled on the plot, next to two QC covariates.
# `n_counts` is presumably the obs column added by sc.pp.normalize_per_cell - TODO confirm.
sc.pl.umap(adata_proietti, color=['leiden', 'n_counts', 'pct_counts_mt'], legend_loc='on data')

In [None]:
# Marker genes per population, used by assign_cats to label the Leiden clusters.
# The MuSC and SMMC gene lists were swapped: Pax7 / Myf5 / Vcam1 / Sdc4 are muscle stem
# (satellite) cell markers, while Myh11 / Rgs5 / Myl9 / Pln mark smooth-muscle / mural cells.
dict_cats_paper = {'glial cells': ['Plp1', 'Mpz', 'Kcna1', 'S100b'],
                   'MuSCs': ['Pax7', 'Myf5', 'Vcam1', 'Sdc4'],
                   'SMMCs': ['Myh11', 'Rgs5', 'Myl9', 'Pln'],
                   'Residual (FAPs)': ['Mfap5'],
                   'Residual (Teno)': ['Scx', 'Tnmd', 'Col1a1'],
                   'Residual (Others)': ['Ly6a', 'Pecam1', 'Tek']}
# Clusters whose best category score is below 0.4 are labelled 'Residual cells';
# the labels are written to adata_proietti.obs['cats_paper'].
assign_cats(adata_proietti, dict_cats=dict_cats_paper, key_added='cats_paper', others_name='Residual cells', min_score=0.4)

In [None]:
# Leiden clusters side by side with the category assigned to each cluster.
sc.pl.umap(adata_proietti, color=['leiden', 'cats_paper'])

In [None]:
# One figure per category: expression of its marker genes plus the assigned labels.
# Replaces six copy-pasted cells; iterating the dictionary keeps the plots in sync with
# dict_cats_paper and preserves the original order of the figures.
for cat_name, marker_genes in dict_cats_paper.items():
    sc.pl.umap(adata_proietti, color=marker_genes + ['cats_paper'], cmap=magma, ncols=3)

In [None]:
# Ngfr expression next to the assigned labels (Ngfr is not part of dict_cats_paper).
sc.pl.umap(adata_proietti, color=['Ngfr', 'cats_paper'], cmap=magma, ncols=3)

There are some interesting minor populations, so we are going to extract differentially expressed genes (DEGs) for each Leiden cluster. There are two minor populations within MuSCs, the Residual (Others).

In [None]:
# Rank marker genes for each Leiden cluster, using scanpy's default test and reference group.
# Results are stored in adata_proietti.uns['rank_genes_groups'] and read by the plot below.
sc.tl.rank_genes_groups(adata_proietti, groupby='leiden')

In [None]:
# Tracks plot of the top-ranked genes per cluster; dendrogram=False keeps the clusters in
# their existing order instead of reordering them by a dendrogram.
sc.pl.rank_genes_groups_tracksplot(adata_proietti, dendrogram=False)