In [None]:
from bokeh.io import show, output_notebook, reset_output
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
from scipy.sparse import csr_matrix
import subprocess
from tqdm import tqdm
import triku as tk
import bbknn

# Reset Bokeh's output configuration, then send Bokeh plots to the notebook.
reset_output()
output_notebook()

In [None]:
from cellassign import assign_cats

In [None]:
# Root folders for the data files and for figures.
data_dir, fig_dir = 'data/', 'figures/'

In [None]:
# Colormap for gene expression on UMAPs: the first 65 of 80 evenly spaced
# magma colours, with the lowest colour replaced by light grey.
_magma_base = plt.get_cmap('magma')
_magma_colors = [(0.88, 0.88, 0.88, 1)]
_magma_colors.extend(_magma_base(position) for position in np.linspace(0, 1, 80)[1:65])
magma = mpl.colors.LinearSegmentedColormap.from_list("", _magma_colors)

# Seed passed as random_state to the scanpy calls in this notebook.
seed = 0

In [None]:
# Marker genes per cell population; passed to assign_cats to label clusters.
_fap_shared_markers = ['Ly6a', 'Pdgfra', 'Mfap5', 'Dcn']  # listed for both FAP populations

dict_cats_general = {
    'FAP Lum+': ['Apod', 'Lum'] + _fap_shared_markers,
    'FAP Prg4+': ['Prg4', 'Fbn1'] + _fap_shared_markers,
    'Endothelial': ['Pecam1', 'Kdr', 'Fabp4', 'Cav1', 'Cdh5', 'Tek'],
    'Pericyte': ['Rgs5', 'Notch3', 'Myl9', 'Ndufa4l2', 'Itga7', 'Myh11', 'Pln', 'Abcc9'],
    'Satellite cell': ['Pax7', 'Myod1', 'Chodl', 'Vcam1', 'Sdc4', 'Myf5'],
    'Myonuclei': ['Tnnc2', 'Myh4', 'Acta1', 'Ckm', 'Tpm2', 'Eno3', 'Slc25a4'],
    'Tenocyte': ['Scx', 'Tnmd', 'Mkx', 'Col12a1', 'Col1a1', 'Tnc', 'Fmod', 'Comp'],
    'Neural cell': ['Mpz', 'Ptn', 'S100b'],
    'Glial cell': ['Plp1', 'Kcna1', 'S100b', 'Mbp', 'Mpz'],
    'Guide cell': ['Ncam2'],
    'Immune': ['H2-Aa', 'Cd74'],
    'APC': ['H2-Eb1', 'H2-Ab1'],
    'B cell': ['Cd19', 'Cd22', 'Ms4a1', 'Ptprc'],
    'T cell': ['Cd3d', 'Cd3e', 'Cd3g', 'Cd8a', 'Cd4', 'Ptprc', 'Cd28'],
    'Monocyte': ['Csf1r', 'Adgre1'],
    'Macrophage': ['Itgam', 'Csf1r', 'Adgre1', 'Itgb1', 'Cd68'],
    'Neutrophil': ['S100a8', 'S100a9', 'Itgam', 'Cd14'],
}

# Oprescu 2020

In [None]:
# Folder holding the Oprescu files (built from data_dir).
oprescu_dir = f'{data_dir}/oprescu'

In [None]:
# GEO supplementary file with the GSE138826 expression matrix.
link = (
    'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE138nnn/GSE138826/suppl/'
    'GSE138826%5Fexpression%5Fmatrix%2Etxt%2Egz'
)

In [None]:
# Download the expression matrix into oprescu_dir with aria2c
# (-x 16: up to 16 connections per server, -d: target directory).
!aria2c -x 16 {link} -d {oprescu_dir}

In [None]:
# Read the text matrix, transpose it, and cache it as a loom file so that
# later runs can skip the text parsing.
oprescu_matrix_path = oprescu_dir + '/GSE138826_expression_matrix.txt.gz'
oprescu_loom_path = oprescu_dir + '/adata_oprescu.loom'

adata_oprescu = sc.read_text(oprescu_matrix_path).transpose()
adata_oprescu.write_loom(oprescu_loom_path)

In [None]:
# Reload the cached loom file.
adata_oprescu = sc.read_loom(f'{oprescu_dir}/adata_oprescu.loom')

In [None]:
# The batch label is the part of each cell name before the first underscore.
adata_oprescu.obs['batch'] = [cell_name.partition('_')[0] for cell_name in adata_oprescu.obs_names]

In [None]:
# Basic QC metrics: flag genes whose name starts with 'mt-' as mitochondrial
# and compute per-cell / per-gene QC columns in place.
oprescu_is_mito = adata_oprescu.var_names.str.startswith('mt-')
adata_oprescu.var['mt'] = oprescu_is_mito
sc.pp.calculate_qc_metrics(
    adata_oprescu,
    qc_vars=['mt'],
    percent_top=None,
    inplace=True,
)

In [None]:
# QC overview: violin plots of the three QC metrics, then two scatter plots.
oprescu_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_oprescu, oprescu_qc_keys, jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_oprescu, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(
    adata_oprescu,
    x='total_counts',
    y='n_genes_by_counts',
    color='batch',
)

In [None]:
# Drop cells with fewer than 150 detected genes (in place).
sc.pp.filter_cells(adata_oprescu, min_genes=150)

In [None]:
# Replace the sparse count matrix with a dense numpy array.
adata_oprescu.X = adata_oprescu.X.toarray()

In [None]:
# One independent AnnData copy per time point, selected by batch label.
(
    adata_oprescu_d0,
    adata_oprescu_d05,
    adata_oprescu_d2,
    adata_oprescu_d35,
    adata_oprescu_d5,
    adata_oprescu_d10,
    adata_oprescu_d21,
) = (
    adata_oprescu[adata_oprescu.obs['batch'] == batch_label].copy()
    for batch_label in (
        'Noninjured',
        'X0.5.DPI',
        'X2.DPI',
        'X3.5.DPI',
        'X5.DPI',
        'X10.DPI',
        'X21.DPI',
    )
)

In [None]:
# Give every time point its own freshly allocated numpy copy of X.
for adata in (
    adata_oprescu_d0,
    adata_oprescu_d05,
    adata_oprescu_d2,
    adata_oprescu_d35,
    adata_oprescu_d5,
    adata_oprescu_d10,
    adata_oprescu_d21,
):
    adata.X = np.array(adata.X).copy()

In [None]:
# Show the distinct batch labels present in the dataset.
set(adata_oprescu.obs['batch'])

## Individual adata processing

In [None]:
def _process_oprescu_timepoint(adata):
    """Normalise, embed and cluster one time point in place, then plot its UMAP."""
    sc.pp.filter_genes(adata, min_counts=1)
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    # Neighbourhood size: sqrt(number of cells) floor-divided by 5.
    n_neighbors = int(len(adata) ** 0.5 // 5)
    sc.pp.pca(adata, random_state=seed, n_comps=30)
    sc.pp.neighbors(adata, random_state=seed, n_neighbors=n_neighbors, metric='cosine')
    tk.tl.triku(adata)

    sc.tl.umap(adata, min_dist=0.25, random_state=seed)
    sc.tl.leiden(adata, resolution=3, random_state=seed)
    sc.pl.umap(adata, color=['leiden', 'n_counts'], legend_loc='on data')


for adata in (
    adata_oprescu_d0,
    adata_oprescu_d05,
    adata_oprescu_d2,
    adata_oprescu_d35,
    adata_oprescu_d5,
    adata_oprescu_d10,
    adata_oprescu_d21,
):
    _process_oprescu_timepoint(adata)

### Subpopulation characterization (based on paper markers)

In [None]:
# Label the Leiden clusters of every time point with the marker dictionary;
# the result is stored under the 'cell_type' key.
oprescu_assign_kwargs = dict(
    column_groupby='leiden',
    quantile_gene_sel=0.7,
    do_return=False,
    intermediate_states=False,
    diff=0.05,
    key_added='cell_type',
    min_score=0.45,
    others_name='unassigned',
)
for adata in (
    adata_oprescu_d0,
    adata_oprescu_d05,
    adata_oprescu_d2,
    adata_oprescu_d35,
    adata_oprescu_d5,
    adata_oprescu_d10,
    adata_oprescu_d21,
):
    assign_cats(adata, dict_cats_general, **oprescu_assign_kwargs)

In [None]:
# One UMAP per time point, coloured by the assigned cell type.
oprescu_timepoints = (
    adata_oprescu_d0,
    adata_oprescu_d05,
    adata_oprescu_d2,
    adata_oprescu_d35,
    adata_oprescu_d5,
    adata_oprescu_d10,
    adata_oprescu_d21,
)
for adata in oprescu_timepoints:
    sc.pl.umap(adata, color=['cell_type'])

### Adata saving

In [None]:
# Tag every Oprescu time point with its dataset label and save it as h5ad.
# The output folder is created first: nothing earlier in the notebook creates
# 'data/processed', so a fresh run would otherwise have nowhere to write.
os.makedirs('data/processed', exist_ok=True)

for adata, day in zip([adata_oprescu_d0, adata_oprescu_d05, adata_oprescu_d2, adata_oprescu_d35, adata_oprescu_d5, adata_oprescu_d10, adata_oprescu_d21], ['D0', 'D0.5', 'D2', 'D3.5', 'D5', 'D10', 'D21']):
    adata.obs['dataset'] = f'Oprescu {day}'
    day_save = day.replace('.', '').lower()  # e.g. 'D0.5' -> 'd05'
    adata.write_h5ad(f'data/processed/oprescu_{day_save}.h5')

# Scott 2019

In [None]:
# Download the barcodes, genes and matrix files of GEO sample GSM2976778 (qsnt) into the scott folder.
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976778/suppl/GSM2976778%5Fqsnt%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976778/suppl/GSM2976778%5Fqsnt%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976778/suppl/GSM2976778%5Fqsnt%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
# Build the D0 AnnData from its three files: the matrix is transposed after
# reading, cell names come from column 0 of the barcodes file and gene names
# from column 1 of the genes file.
scott_d0_prefix = data_dir + 'scott/GSM2976778_qsnt_'

adata_scott_d0 = sc.read_mtx(scott_d0_prefix + 'matrix.mtx.gz').transpose()
barcodes = pd.read_csv(scott_d0_prefix + 'barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(scott_d0_prefix + 'genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d0.obs_names = barcodes
adata_scott_d0.var_names = features

In [None]:
# Make duplicated gene names unique (scanpy appends a numeric suffix).
adata_scott_d0.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GEO sample GSM2976779 (d1) into the scott folder.
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976779/suppl/GSM2976779%5Fd1%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976779/suppl/GSM2976779%5Fd1%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976779/suppl/GSM2976779%5Fd1%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
# Build the D1 AnnData from its matrix, barcodes and genes files.
adata_scott_d1 = sc.read_mtx(data_dir+'scott/GSM2976779_d1_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976779_d1_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976779_d1_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d1.var_names = features
adata_scott_d1.obs_names = barcodes
# Same treatment as the D0 sample: make gene names unique so that selecting a
# gene by name is unambiguous.
adata_scott_d1.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GEO sample GSM2976780 (d2) into the scott folder.
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976780/suppl/GSM2976780%5Fd2%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976780/suppl/GSM2976780%5Fd2%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976780/suppl/GSM2976780%5Fd2%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
# Build the D2 AnnData from its matrix, barcodes and genes files.
adata_scott_d2 = sc.read_mtx(data_dir+'scott/GSM2976780_d2_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976780_d2_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976780_d2_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d2.var_names = features
adata_scott_d2.obs_names = barcodes
# Same treatment as the D0 sample: make gene names unique so that selecting a
# gene by name is unambiguous.
adata_scott_d2.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GEO sample GSM2976781 (d4) into the scott folder.
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976781/suppl/GSM2976781%5Fd4%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976781/suppl/GSM2976781%5Fd4%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976781/suppl/GSM2976781%5Fd4%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
# Build the D4 AnnData from its matrix, barcodes and genes files.
adata_scott_d4 = sc.read_mtx(data_dir+'scott/GSM2976781_d4_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976781_d4_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976781_d4_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d4.var_names = features
adata_scott_d4.obs_names = barcodes
# Same treatment as the D0 sample: make gene names unique so that selecting a
# gene by name is unambiguous.
adata_scott_d4.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GEO sample GSM2976782 (d14) into the scott folder.
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976782/suppl/GSM2976782%5Fd14%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976782/suppl/GSM2976782%5Fd14%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976782/suppl/GSM2976782%5Fd14%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
# Build the D14 AnnData from its matrix, barcodes and genes files.
adata_scott_d14 = sc.read_mtx(data_dir+'scott/GSM2976782_d14_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976782_d14_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976782_d14_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d14.var_names = features
adata_scott_d14.obs_names = barcodes
# Same treatment as the D0 sample: make gene names unique so that selecting a
# gene by name is unambiguous.
adata_scott_d14.var_names_make_unique()

## Adata Scott D0

In [None]:
# Replace the sparse count matrix with a dense numpy array.
adata_scott_d0.X = adata_scott_d0.X.toarray()

In [None]:
# Basic QC metrics: flag genes whose name starts with 'mt-' as mitochondrial
# and compute the QC columns in place.
scott_is_mito = adata_scott_d0.var_names.str.startswith('mt-')
adata_scott_d0.var['mt'] = scott_is_mito
sc.pp.calculate_qc_metrics(
    adata_scott_d0,
    qc_vars=['mt'],
    percent_top=None,
    inplace=True,
)

In [None]:
# QC overview: violin plots of the three QC metrics and counts vs. mito percentage.
scott_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_scott_d0, scott_qc_keys, jitter=0.4, multi_panel=True)

sc.pl.scatter(
    adata_scott_d0,
    x='total_counts',
    y='pct_counts_mt',
)


In [None]:
# Keep cells with fewer than 2500 detected genes and less than 10 % mitochondrial counts.
# .copy() turns the subset into a real AnnData, so the in-place scanpy calls
# that follow do not operate on a view of the unfiltered object.
scott_keep_cells = (adata_scott_d0.obs.n_genes_by_counts < 2500) & (adata_scott_d0.obs.pct_counts_mt < 10)
adata_scott_d0 = adata_scott_d0[scott_keep_cells.values, :].copy()

In [None]:
# Drop cells with fewer than 250 detected genes (in place).
sc.pp.filter_cells(adata_scott_d0, min_genes=250)

In [None]:
# Remove genes without any count, normalise each cell's total counts, then log1p-transform.
# The three steps modify adata_scott_d0 in place and must run in this order.
sc.pp.filter_genes(adata_scott_d0, min_counts=1)
sc.pp.normalize_per_cell(adata_scott_d0)
sc.pp.log1p(adata_scott_d0)

In [None]:
# PCA, cosine kNN graph and triku. The neighbourhood size is
# sqrt(number of cells) floor-divided by 4.
scott_n_neighbors = int(len(adata_scott_d0) ** 0.5 // 4)

sc.pp.pca(adata_scott_d0, random_state=seed, n_comps=30)
sc.pp.neighbors(
    adata_scott_d0,
    random_state=seed,
    n_neighbors=scott_n_neighbors,
    metric='cosine',
)
tk.tl.triku(adata_scott_d0)


In [None]:
# UMAP embedding and Leiden clustering (both seeded), then a UMAP coloured by
# cluster and by the 'n_counts' column.
sc.tl.umap(adata_scott_d0, min_dist=0.7, random_state=seed)
sc.tl.leiden(adata_scott_d0, resolution=3, random_state=seed)
sc.pl.umap(adata_scott_d0, color=['leiden', 'n_counts'], legend_loc='on data')

### Subpopulation characterization

In [None]:
# Label the Leiden clusters with the marker dictionary; the result is stored
# under the 'cell_type' key.
assign_cats(
    adata_scott_d0,
    dict_cats_general,
    column_groupby='leiden',
    quantile_gene_sel=0.7,
    do_return=False,
    intermediate_states=False,
    diff=0.05,
    key_added='cell_type',
    min_score=0.4,
    others_name='unassigned',
)

In [None]:
# For each population, plot the assigned cell types next to those of its
# markers that are present in the dataset.
for cat, cat_markers in dict_cats_general.items():
    print(cat)
    present_markers = [gene for gene in cat_markers if gene in adata_scott_d0.var_names]
    sc.pl.umap(adata_scott_d0, color=['cell_type'] + present_markers, ncols=3, cmap=magma)

In [None]:
# Leiden clusters next to a set of additional genes of interest.
scott_extra_genes = ['Bricd5', 'Col9a2', 'Dlk1', 'Grin2b', 'Mpzl2', 'Saa1', 'Shisa3', 'Tenm2']
sc.pl.umap(
    adata_scott_d0,
    color=['leiden'] + scott_extra_genes,
    legend_loc='on data',
    cmap=magma,
    ncols=3,
)

In [None]:
# Second set of additional genes of interest.
scott_extra_genes_2 = [
    'Nipal1', 'Trpm6', 'S100b', 'Gpld1',
    'Plxdc1', 'Gfra2', 'Cd38', 'Cd300lg',
]
sc.pl.umap(adata_scott_d0, color=scott_extra_genes_2, legend_loc='on data', cmap=magma, ncols=3)

### Adata saving

In [None]:
# Tag the dataset and save it. The output folder is created if it is missing,
# so this cell also works when run on its own.
os.makedirs('data/processed', exist_ok=True)
adata_scott_d0.obs['dataset'] = 'Scott D0'
adata_scott_d0.write_h5ad('data/processed/scott_d0.h5')

# De Micheli et al 2020 (mouse)

## Adata download and preprocessing

In [None]:
# Download the raw count matrix and the metadata table of GSE143437 into the demicheli_mouse folder.
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143437/suppl/GSE143437%5FDeMicheli%5FMuSCatlas%5Frawdata%2Etxt%2Egz -P {data_dir}/demicheli_mouse
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143437/suppl/GSE143437%5FDeMicheli%5FMuSCatlas%5Fmetadata%2Etxt%2Egz -P {data_dir}/demicheli_mouse

In [None]:
# Read the raw matrix and transpose it. Cell names are split on '_': the first
# two fields form the batch label, the last field becomes the cell name.
adata_de_micheli_mouse = sc.read_text(f'{data_dir}demicheli_mouse/GSE143437_DeMicheli_MuSCatlas_rawdata.txt.gz').transpose()

mouse_name_fields = [cell_name.split('_') for cell_name in adata_de_micheli_mouse.obs_names]
adata_de_micheli_mouse.obs['batch'] = ['_'.join(fields[:2]) for fields in mouse_name_fields]
adata_de_micheli_mouse.obs_names = [fields[-1] for fields in mouse_name_fields]
adata_de_micheli_mouse.obs_names_make_unique()

In [None]:
# Cache the parsed matrix as a loom file.
adata_de_micheli_mouse.write_loom(f'{data_dir}demicheli_mouse/adata_demicheli_mouse.loom')

In [None]:
# Reload the cached loom file.
adata_de_micheli_mouse = sc.read_loom(f'{data_dir}demicheli_mouse/adata_demicheli_mouse.loom')

In [None]:
# Genes to keep: every gene whose name contains neither 'Rps' nor 'Rpl'
# (case-sensitive substring match).
not_RPS = [
    gene
    for gene in adata_de_micheli_mouse.var_names
    if 'Rps' not in gene and 'Rpl' not in gene
]

In [None]:
# Keep only the non-ribosomal genes. .copy() materialises the subset, so the
# QC annotations added next are written to a real AnnData and not to a view.
adata_de_micheli_mouse = adata_de_micheli_mouse[:, not_RPS].copy()

In [None]:
# Basic QC metrics: flag genes whose name starts with 'mt-' as mitochondrial
# and compute the QC columns in place.
mouse_is_mito = adata_de_micheli_mouse.var_names.str.startswith('mt-')
adata_de_micheli_mouse.var['mt'] = mouse_is_mito
sc.pp.calculate_qc_metrics(
    adata_de_micheli_mouse,
    qc_vars=['mt'],
    percent_top=None,
    inplace=True,
)

In [None]:
# QC overview before filtering.
mouse_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_de_micheli_mouse, mouse_qc_keys, jitter=0.4, multi_panel=True)

for x_key, y_key in (('n_genes_by_counts', 'pct_counts_mt'), ('total_counts', 'n_genes_by_counts')):
    sc.pl.scatter(adata_de_micheli_mouse, x=x_key, y=y_key)

In [None]:
# Keep cells below all three QC thresholds (detected genes, total counts,
# mitochondrial percentage). .copy() materialises the subset so the in-place
# scanpy calls that follow do not operate on a view.
mouse_obs = adata_de_micheli_mouse.obs
mouse_keep_cells = (
    (mouse_obs.n_genes_by_counts < 6000)
    & (mouse_obs.total_counts < 30000)
    & (mouse_obs.pct_counts_mt < 10)
)
adata_de_micheli_mouse = adata_de_micheli_mouse[mouse_keep_cells.values, :].copy()

In [None]:
# Same QC overview, after filtering.
mouse_qc_keys_after = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_de_micheli_mouse, mouse_qc_keys_after, jitter=0.4, multi_panel=True)

for x_key, y_key in (('n_genes_by_counts', 'pct_counts_mt'), ('total_counts', 'n_genes_by_counts')):
    sc.pl.scatter(adata_de_micheli_mouse, x=x_key, y=y_key)

In [None]:
# Remove genes without any count in the remaining cells (in place).
sc.pp.filter_genes(adata_de_micheli_mouse, min_counts=1)

In [None]:
# Replace the sparse count matrix with a dense numpy array.
adata_de_micheli_mouse.X = adata_de_micheli_mouse.X.toarray()

In [None]:
# Show the distinct batch labels present in the dataset.
set(adata_de_micheli_mouse.obs['batch'])

In [None]:
# One independent AnnData copy per day, pooling the batches of that day.
(
    adata_de_micheli_mouse_d0,
    adata_de_micheli_mouse_d2,
    adata_de_micheli_mouse_d5,
    adata_de_micheli_mouse_d7,
) = (
    adata_de_micheli_mouse[adata_de_micheli_mouse.obs['batch'].isin(day_batches)].copy()
    for day_batches in (
        ['D0_A', 'D0_B', 'D0_Cv3'],
        ['D2_C', 'D2_D'],
        ['D5_A', 'D5_B', 'D5_C'],
        ['D7_C', 'D7_D'],
    )
)

In [None]:
# Per day: remove genes without counts, normalise total counts per cell and
# log1p-transform. All steps modify each AnnData in place, in this order.
for adata in [adata_de_micheli_mouse_d0, adata_de_micheli_mouse_d2, 
              adata_de_micheli_mouse_d5, adata_de_micheli_mouse_d7]:

    sc.pp.filter_genes(adata, min_counts=1)
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

In [None]:
# Per day: PCA, then a bbknn neighbour graph (angular metric) instead of
# sc.pp.neighbors, then triku.
# NOTE(review): bbknn is called without batch_key, so it presumably relies on
# its default column name matching the 'batch' column set above; confirm.
for adata in [adata_de_micheli_mouse_d0, adata_de_micheli_mouse_d2, 
              adata_de_micheli_mouse_d5, adata_de_micheli_mouse_d7]:
    
    sc.pp.pca(adata, random_state=seed, n_comps=30)
    sce.pp.bbknn(adata, metric='angular')
    tk.tl.triku(adata)

In [None]:
# Per day: UMAP embedding and Leiden clustering (both seeded), then a UMAP
# coloured by cluster, batch and total counts.
for adata in [adata_de_micheli_mouse_d0, adata_de_micheli_mouse_d2, 
              adata_de_micheli_mouse_d5, adata_de_micheli_mouse_d7]:

    sc.tl.umap(adata, min_dist=0.4,  random_state=seed)
    sc.tl.leiden(adata, resolution=2.5, random_state=seed)
    sc.pl.umap(adata, color=['leiden', 'batch', 'total_counts'], legend_loc='on data', ncols=2)

## Population characterization

In [None]:
# Per day: rank genes per Leiden cluster (Wilcoxon), label the clusters with
# the marker dictionary and plot clusters next to the assigned cell types.
mouse_assign_kwargs = dict(
    column_groupby='leiden',
    quantile_gene_sel=0.7,
    do_return=False,
    intermediate_states=False,
    diff=0.05,
    key_added='cell_type',
    min_score=0.4,
    others_name='unassigned',
)
for adata in (
    adata_de_micheli_mouse_d0,
    adata_de_micheli_mouse_d2,
    adata_de_micheli_mouse_d5,
    adata_de_micheli_mouse_d7,
):
    sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
    assign_cats(adata, dict_cats_general, **mouse_assign_kwargs)
    sc.pl.umap(adata, color=['leiden', 'cell_type'], legend_loc='on data', ncols=2)

### Adata saving

In [None]:
# Tag every day with its dataset label and save it as h5ad. One loop replaces
# four copy-pasted stanzas, and the output folder is created if it is missing.
os.makedirs('data/processed', exist_ok=True)

for day, adata in (
    ('D0', adata_de_micheli_mouse_d0),
    ('D2', adata_de_micheli_mouse_d2),
    ('D5', adata_de_micheli_mouse_d5),
    ('D7', adata_de_micheli_mouse_d7),
):
    adata.obs['dataset'] = f'De Micheli mouse {day}'
    adata.write_h5ad(f'data/processed/de_micheli_mouse_{day.lower()}.h5')

# De Micheli et al 2020 (human)

This is a human dataset consisting of 10 patients. To remove batch effects, we will run kallisto on all samples at once and then analyse them together.

In [None]:
# Absolute folder for the De Micheli human files; created if missing.
de_micheli_dir = f'{os.getcwd()}/data/demicheli'
os.makedirs(de_micheli_dir, exist_ok=True)

In [None]:
# Newline-separated SRA run accessions SRR10897760..SRR10897769, with a
# leading and a trailing newline.
SRA_list = '\n'.join([''] + [f'SRR108977{run}' for run in range(60, 70)] + [''])

accession_path = de_micheli_dir + '/accession.txt'
with open(accession_path, 'w') as accession_file:
    accession_file.write(SRA_list)

# One-row, tab-separated metadata table (name, technology, target cell number).
df = pd.DataFrame([{'name': 'de_micheli', 'technology': '10xv3', 'targetnumcells': 5000}])

df.to_csv(de_micheli_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
!fastq-dump SRR10897768 --split-files --gzip  # This is just one. Run from 60 to 69

In [None]:
from time import sleep

In [None]:
# Wait four hours (presumably for the FASTQ downloads to finish; TODO confirm),
# then rename the split files of each run: '<run>_2' becomes read R1 and
# '<run>_3' becomes read R2 of lane 'L00<position in SRA_list>'.
sleep(3600 * 4)
for lane_number, SRA in enumerate(SRA_list.split('\n')[1:-1], start=1):
    for split_index, read_label in ((2, 'R1'), (3, 'R2')):
        os.rename(
            de_micheli_dir + f'/{SRA}_{split_index}.fastq.gz',
            de_micheli_dir + f'/de_micheli_L00{lane_number}_{read_label}_001.fastq.gz',
        )

In [None]:
# Build de_micheli.loom from the ten renamed FASTQ pairs with `loompy fromfq`
# (sample id 'de_micheli', human index folder, metadata.tab written above).
# NOTE(review): the index path is an absolute path on the author's machine.
!loompy fromfq {de_micheli_dir}/de_micheli.loom de_micheli /media/seth/SETH_DATA/SETH_Alex/Programs/human_GRCh38_gencode.v31.600 {de_micheli_dir}/metadata.tab \
{de_micheli_dir}/de_micheli_L001_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L001_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L002_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L002_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L003_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L003_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L004_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L004_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L005_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L005_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L006_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L006_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L007_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L007_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L008_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L008_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L009_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L009_R2_001.fastq.gz \
{de_micheli_dir}/de_micheli_L0010_R1_001.fastq.gz {de_micheli_dir}/de_micheli_L0010_R2_001.fastq.gz 

We will also load the published count matrix, because its cell names contain the batch labels. 
These labels let us annotate the batches and map them later on, which helps to remove possible small spurious clusters.

In [None]:
# GEO supplementary file with the GSE143704 raw count matrix.
link = (
    'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143704/suppl/'
    'GSE143704%5FDeMicheli%5FHumanMuscleAtlas%5Frawdata%2Etxt%2Egz'
)

In [None]:
# Download the raw count matrix into the demicheli folder.
!wget {link} -P {data_dir}/demicheli

## Adata load and preprocessing

In [None]:
# Read the published count matrix and transpose it.
human_matrix_path = f'{data_dir}/demicheli/GSE143704_DeMicheli_HumanMuscleAtlas_rawdata.txt.gz'
adata_de_micheli_batch = sc.read_text(human_matrix_path).transpose()

In [None]:
# Make gene names unique and take the batch label from the second
# '_'-separated field of each cell name.
adata_de_micheli_batch.var_names_make_unique()
adata_de_micheli_batch.obs['batch'] = [
    cell_name.split('_')[1] for cell_name in adata_de_micheli_batch.obs_names
]

In [None]:
# Cache the annotated matrix as a loom file.
adata_de_micheli_batch.write_loom(f'{de_micheli_dir}/de_micheli_human.loom')

In [None]:
# Reload the cached loom file as the working human dataset.
adata_de_micheli_human = sc.read_loom(f'{de_micheli_dir}/de_micheli_human.loom')


In [None]:
# Basic QC metrics: flag genes whose name starts with 'MT-' as mitochondrial
# and compute the QC columns in place.
human_is_mito = adata_de_micheli_human.var_names.str.startswith('MT-')
adata_de_micheli_human.var['mt'] = human_is_mito
sc.pp.calculate_qc_metrics(
    adata_de_micheli_human,
    qc_vars=['mt'],
    percent_top=None,
    inplace=True,
)

In [None]:
# QC overview before filtering.
human_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_de_micheli_human, human_qc_keys, jitter=0.4, multi_panel=True)

for x_key, y_key in (('n_genes_by_counts', 'pct_counts_mt'), ('total_counts', 'n_genes_by_counts')):
    sc.pl.scatter(adata_de_micheli_human, x=x_key, y=y_key)

In [None]:
# QC filter: keep cells with fewer than 3500 detected genes, fewer than 15000
# counts and less than 80 % mitochondrial counts, and drop the cells that
# combine a low mitochondrial percentage with few detected genes.
#
# The original second condition was `pct_counts_mt < 400`, which is true for
# every cell (a percentage cannot exceed 100), so the filter removed ALL cells
# with less than 10 % mitochondrial counts.
# NOTE(review): `n_genes_by_counts < 400` is the assumed intent (the threshold
# would be read off the n_genes_by_counts vs pct_counts_mt scatter); confirm
# it was not meant to be `total_counts < 400`.
human_obs = adata_de_micheli_human.obs
human_low_quality = (human_obs.pct_counts_mt < 10) & (human_obs.n_genes_by_counts < 400)
human_keep_cells = (
    (human_obs.n_genes_by_counts < 3500)
    & (human_obs.total_counts < 15000)
    & (human_obs.pct_counts_mt < 80)
    & ~human_low_quality
)
# .copy() materialises the subset so later in-place scanpy calls do not operate on a view.
adata_de_micheli_human = adata_de_micheli_human[human_keep_cells.values, :].copy()

In [None]:
# Same QC overview, after filtering.
human_qc_keys_after = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_de_micheli_human, human_qc_keys_after, jitter=0.4, multi_panel=True)

for x_key, y_key in (('n_genes_by_counts', 'pct_counts_mt'), ('total_counts', 'n_genes_by_counts')):
    sc.pl.scatter(adata_de_micheli_human, x=x_key, y=y_key)

In [None]:
# Remove genes without any count, normalise total counts per cell, then log1p-transform.
# The three steps modify the AnnData in place and must run in this order.
sc.pp.filter_genes(adata_de_micheli_human, min_counts=1)
sc.pp.normalize_total(adata_de_micheli_human)
sc.pp.log1p(adata_de_micheli_human)

In [None]:
# PCA, then a bbknn neighbour graph with 3 neighbours per batch, then triku.
# NOTE(review): bbknn is called without batch_key, so it presumably relies on
# its default column name matching the 'batch' column; confirm.
sc.pp.pca(adata_de_micheli_human, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_de_micheli_human, neighbors_within_batch=3)
tk.tl.triku(adata_de_micheli_human)

In [None]:
# UMAP embedding and Leiden clustering (both seeded), then UMAPs coloured by
# cluster, batch and total counts.
sc.tl.umap(adata_de_micheli_human, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_de_micheli_human, resolution=2, random_state=seed)
sc.pl.umap(adata_de_micheli_human, color=['leiden', 'batch', 'total_counts'], legend_loc='on data', ncols=1)

In [None]:
# Expression of a set of genes of interest (human symbols).
human_genes_of_interest = ['SHISA3', 'NIPAL1', 'SAA1', 'COL9A2', 'S100B', 'GFRA2', 'CD300LG']
sc.pl.umap(
    adata_de_micheli_human,
    color=human_genes_of_interest,
    legend_loc='on data',
    ncols=2,
    cmap=magma,
)

In [None]:
# The marker dictionary is written in mouse-style case (e.g. 'Pdgfra'), while
# this dataset uses upper-case human symbols (e.g. 'SHISA3', 'MT-' genes), so
# the markers are upper-cased before the assignment. This implements the
# original TODO ("this needs upper() applied").
# NOTE(review): upper-casing only approximates the mouse-to-human mapping
# (e.g. the 'H2-*' and 'Ly6a' markers will not match a human symbol); confirm
# whether a curated human marker list is needed.
dict_cats_general_human = {
    cat: [gene.upper() for gene in genes]
    for cat, genes in dict_cats_general.items()
}
assign_cats(adata_de_micheli_human, dict_cats_general_human, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05, 
                key_added='cell_type', min_score=0.4, others_name='unassigned')

In [None]:
# For each population, plot the assigned cell types next to its markers.
# The markers are upper-cased to match the human gene symbols of this dataset
# and restricted to genes present in it; passing a gene that is not in
# var_names to sc.pl.umap raises a KeyError.
for cat in dict_cats_general.keys():
    print(cat)
    human_markers = [gene.upper() for gene in dict_cats_general[cat]]
    human_markers = [gene for gene in human_markers if gene in adata_de_micheli_human.var_names]
    sc.pl.umap(adata_de_micheli_human, color=['cell_type'] + human_markers, ncols=3, cmap=magma)

### Adata saving

In [None]:
# Tag the dataset and save it. The output folder is created if it is missing,
# so this cell also works when run on its own.
os.makedirs('data/processed', exist_ok=True)
adata_de_micheli_human.obs['dataset'] = 'De Micheli human'
adata_de_micheli_human.write_h5ad('data/processed/de_micheli_human.h5')

# Giordani 2020

## Adata download and preprocessing

In [None]:
# Absolute folder for the Giordani files; created if missing.
giordani_dir = f'{os.getcwd()}/data/giordani'
os.makedirs(giordani_dir, exist_ok=True)

In [None]:
# Newline-separated SRA run accessions, with a leading and a trailing newline.
SRA_list = '\n'.join(['', 'SRR8352705', 'SRR8352706', ''])

accession_path = giordani_dir + '/accession.txt'
with open(accession_path, 'w') as accession_file:
    accession_file.write(SRA_list)

# One-row, tab-separated metadata table (name, technology, target cell number).
df = pd.DataFrame([{'name': 'bamtofastq', 'technology': '10xv2', 'targetnumcells': 5000}])

df.to_csv(giordani_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# Download the 20180917 BAM file (run SRR8352706) into giordani_dir.
!cd {giordani_dir} && aria2c -x 16 https://sra-pub-src-1.s3.amazonaws.com/SRR8352706/20180917_uninjured_wt_possorted_genome_bam.bam.1

In [None]:
# Download the 20171018 BAM file (run SRR8352705) into giordani_dir.
!cd {giordani_dir} && aria2c -x 16 https://sra-pub-src-1.s3.amazonaws.com/SRR8352705/20171018_uninjured_wt_possorted_genome_bam.bam.1

In [None]:
# Folder from which bamtofastq is launched below.
# NOTE(review): absolute path on the author's machine; adjust before re-running elsewhere.
program_dir = '/media/seth/SETH_DATA/SETH_Alex/Programs/'

In [None]:
!cd {program_dir} && bamtofastq-1.3.2 {giordani_dir}/20171018_uninjured_wt_possorted_genome_bam.bam.1 {giordani_dir}

In [None]:
!cd {program_dir} && bamtofastq-1.3.2 {giordani_dir}/20180917_uninjured_wt_possorted_genome_bam.bam.1.1 {giordani_dir}

In [None]:
def _paired_fastq_args(fastq_root):
    """Return 'R1 R2 R1 R2 ... ' with every FASTQ pair found under fastq_root.

    Every file whose name contains '_R1_' is paired with the file of the same
    name with '_R1_' replaced by '_R2_', in the same folder. Folders and files
    are visited in sorted order so that the resulting command line is the same
    on every run (os.walk itself gives no ordering guarantee).
    Returns an empty string when nothing is found.
    """
    pair_args = []
    for root, subdirs, files in os.walk(fastq_root):
        subdirs.sort()  # in-place sort steers the order in which os.walk descends
        for r1_file in sorted(files):
            if "_R1_" in r1_file:
                r2_file = r1_file.replace('_R1_', '_R2_')
                pair_args.append(f"{root}/{r1_file} {root}/{r2_file} ")
    return ''.join(pair_args)


# Space-separated FASTQ pairs of each sequencing run, used as loompy arguments.
bamdata_str_20171018 = _paired_fastq_args(giordani_dir + '/bam_20171018')
bamdata_str_20180917 = _paired_fastq_args(giordani_dir + '/bam_20180917')


In [None]:
# Build giordani_20171018.loom from the 20171018 FASTQ pairs with `loompy fromfq`
# (sample id 'bamtofastq', mouse index folder, metadata.tab written above).
# NOTE(review): the index path is an absolute path on the author's machine.
!loompy fromfq {giordani_dir}/giordani_20171018.loom bamtofastq /media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31 {giordani_dir}/metadata.tab {bamdata_str_20171018}

In [None]:
# Build giordani_20180917.loom from the 20180917 FASTQ pairs with `loompy fromfq`
# (sample id 'bamtofastq', mouse index folder, metadata.tab written above).
# NOTE(review): the index path is an absolute path on the author's machine.
!loompy fromfq {giordani_dir}/giordani_20180917.loom bamtofastq /media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31 {giordani_dir}/metadata.tab {bamdata_str_20180917}

In [None]:
# Load both runs, then make the gene names of each one unique.
adata_giordani_20180917 = sc.read_loom(f'{data_dir}/giordani/giordani_20180917.loom')
adata_giordani_20171018 = sc.read_loom(f'{data_dir}/giordani/giordani_20171018.loom')

for giordani_run in (adata_giordani_20180917, adata_giordani_20171018):
    giordani_run.var_names_make_unique()

In [None]:
# Stack the two runs (20180917 first) into a single AnnData.
adata_giordani = adata_giordani_20180917.concatenate(adata_giordani_20171018)

In [None]:
# Basic QC metrics: flag genes whose name starts with 'mt-' as mitochondrial
# and compute the QC columns in place.
giordani_is_mito = adata_giordani.var_names.str.startswith('mt-')
adata_giordani.var['mt'] = giordani_is_mito
sc.pp.calculate_qc_metrics(
    adata_giordani,
    qc_vars=['mt'],
    percent_top=None,
    inplace=True,
)

In [None]:
# Number of cells. The original expression built a dense copy of the whole
# count matrix only to take the length of its per-cell sums, which equals the
# number of rows.
adata_giordani.n_obs

In [None]:
# Histogram of total counts per cell. The row sums are computed directly on
# the sparse matrix, so the full matrix is never densified.
giordani_counts_per_cell = np.asarray(adata_giordani.X.sum(axis=1)).ravel()
h = plt.hist(giordani_counts_per_cell, bins=100)

In [None]:
# QC overview: violin plots of the three QC metrics, then two scatter plots.
giordani_qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata_giordani, giordani_qc_keys, jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_giordani, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(
    adata_giordani,
    x='total_counts',
    y='n_genes_by_counts',
    color='batch',
)

In [None]:
# Drop cells with fewer than 250 detected genes and genes seen in no cell,
# then keep cells with less than 20 % mitochondrial counts.
sc.pp.filter_cells(adata_giordani, min_genes=250)
sc.pp.filter_genes(adata_giordani, min_cells=1)
# .copy() materialises the subset so the in-place scanpy calls that follow do
# not operate on a view.
adata_giordani = adata_giordani[adata_giordani.obs['pct_counts_mt'] < 20].copy()

In [None]:
# Re-apply the gene filter after the cell filtering, normalise each cell's
# total counts, then log1p-transform. All steps run in place, in this order.
sc.pp.filter_genes(adata_giordani, min_cells=1)
sc.pp.normalize_per_cell(adata_giordani)
sc.pp.log1p(adata_giordani)

In [None]:
# PCA, then a bbknn neighbour graph, then triku.
# NOTE(review): bbknn is called without batch_key, so it presumably relies on
# the 'batch' column created by the concatenation; confirm.
sc.pp.pca(adata_giordani, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_giordani)
tk.tl.triku(adata_giordani)

In [None]:
# UMAP embedding and Leiden clustering (both seeded), then a UMAP coloured by
# cluster, batch and number of detected genes.
sc.tl.umap(adata_giordani, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_giordani, resolution=1.3, random_state=seed)
sc.pl.umap(adata_giordani, color=['leiden', 'batch', 'n_genes_by_counts'], legend_loc='on data')

## Population characterization

In [None]:
# Label the Leiden clusters with the marker dictionary; the result is stored
# under the 'cell_type' key.
assign_cats(
    adata_giordani,
    dict_cats_general,
    column_groupby='leiden',
    quantile_gene_sel=0.7,
    do_return=False,
    intermediate_states=False,
    diff=0.05,
    key_added='cell_type',
    min_score=0.3,
    others_name='unassigned',
)

In [None]:
# For each population, plot the assigned cell types next to its markers.
# Markers are restricted to genes present in this dataset, as done for the
# other datasets: a marker removed by the gene filtering would otherwise make
# sc.pl.umap raise a KeyError.
for cat in dict_cats_general.keys():
    print(cat)
    giordani_markers = [gene for gene in dict_cats_general[cat] if gene in adata_giordani.var_names]
    sc.pl.umap(adata_giordani, color=['cell_type'] + giordani_markers, ncols=3, cmap=magma)

In [None]:
# Assigned cell types next to three additional genes of interest.
giordani_extra_genes = ['Saa1', 'Shisa3', 'Col9a2']
sc.pl.umap(
    adata_giordani,
    color=['cell_type'] + giordani_extra_genes,
    ncols=3,
    cmap=magma,
)

### Adata saving

In [None]:
# Tag the dataset and save it. The output folder is created if it is missing,
# so this cell also works when run on its own.
os.makedirs('data/processed', exist_ok=True)
adata_giordani.obs['dataset'] = 'Giordani'
adata_giordani.write_h5ad('data/processed/giordani.h5')

# Proietti 2021

## Adata download and preprocessing

In [None]:
# Absolute folder for the Proietti files; created if missing.
proietti_dir = f'{os.getcwd()}/data/proietti_2021'
os.makedirs(proietti_dir, exist_ok=True)

# Index folder passed to `loompy fromfq` (absolute path on the author's machine).
mouse_gencode_dir = '/media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31'

In [None]:
# SRA run accessions of project PRJNA626530, one list per condition (DEN / CTRL).
SRA_list_ITGA_muscle_PRJNA626530_DEN = [f'SRR115744{run}' for run in (58, 62, 63, 64)]
SRA_list_ITGA_muscle_PRJNA626530_CTRL = [f'SRR115744{run}' for run in (59, 60, 61, 65)]

In [None]:
# Tab-separated metadata table with one row per sample.
df = pd.DataFrame({
    'name': ['PRJNA626530_DEN', 'PRJNA626530_CTRL'],
    'technology': ['10xv3', '10xv3'],
    'targetnumcells': [6000, 6000],
})

metadata_path = proietti_dir + '/metadata.tab'
df.to_csv(metadata_path, sep='\t', index=None)

In [None]:
# Download every SRA run of both conditions and rename the split FASTQ files
# to '<sample>_L00<lane>_R<read>_001.fastq.gz', where the lane is the position
# of the run in its list.
# subprocess.run(check=True) and os.replace raise as soon as a download or a
# rename fails; the previous os.system calls returned an exit status that was
# never inspected, so failures went unnoticed. One loop now covers both
# conditions.
for sample, sample_runs in (
    ('PRJNA626530_DEN', SRA_list_ITGA_muscle_PRJNA626530_DEN),
    ('PRJNA626530_CTRL', SRA_list_ITGA_muscle_PRJNA626530_CTRL),
):
    for idx, name in enumerate(sample_runs):
        subprocess.run(
            ['parallel-fastq-dump', '-s', name, '-t', '16', '--split-files',
             '-O', '.', '--tmpdir', 'tmp', '--gzip'],
            cwd=proietti_dir,
            check=True,
        )
        for read in (1, 2):
            os.replace(
                f'{proietti_dir}/{name}_{read}.fastq.gz',
                f'{proietti_dir}/{sample}_L00{idx+1}_R{read}_001.fastq.gz',
            )

In [None]:
# Print the loompy command for the DEN sample as ONE line, so it can be pasted
# into a shell. The original multi-line string contained raw newlines, which a
# shell treats as command separators (the FASTQ files would not be arguments).
den_fastq_files = ' '.join(
    f'PRJNA626530_DEN_L00{lane}_R{read}_001.fastq.gz'
    for lane in range(1, 5)
    for read in (1, 2)
)
print(f'cd {proietti_dir} && loompy fromfq PRJNA626530_DEN.loom PRJNA626530_DEN {mouse_gencode_dir} metadata.tab {den_fastq_files}')

In [None]:
# Build PRJNA626530_DEN.loom with `loompy fromfq`.
# The command is assembled on a single line: the original multi-line string
# contained raw newlines, so the shell ended the loompy command after
# 'metadata.tab' and then tried to execute each FASTQ line as a command.
# check=True raises if loompy exits with an error.
den_fastq_files = ' '.join(
    f'PRJNA626530_DEN_L00{lane}_R{read}_001.fastq.gz'
    for lane in range(1, 5)
    for read in (1, 2)
)
subprocess.run(
    f'loompy fromfq PRJNA626530_DEN.loom PRJNA626530_DEN {mouse_gencode_dir} metadata.tab {den_fastq_files}',
    shell=True,
    cwd=proietti_dir,
    check=True,
)

In [None]:
# Print the loompy command for the CTRL sample as ONE line, so it can be pasted
# into a shell. The original multi-line string contained raw newlines, which a
# shell treats as command separators (the FASTQ files would not be arguments).
ctrl_fastq_files = ' '.join(
    f'PRJNA626530_CTRL_L00{lane}_R{read}_001.fastq.gz'
    for lane in range(1, 5)
    for read in (1, 2)
)
print(f'cd {proietti_dir} && loompy fromfq PRJNA626530_CTRL.loom PRJNA626530_CTRL {mouse_gencode_dir} metadata.tab {ctrl_fastq_files}')

In [None]:
# Build PRJNA626530_CTRL.loom with `loompy fromfq`.
# The command is assembled on a single line: the original multi-line string
# contained raw newlines, so the shell ended the loompy command after
# 'metadata.tab' and then tried to execute each FASTQ line as a command.
# check=True raises if loompy exits with an error.
ctrl_fastq_files = ' '.join(
    f'PRJNA626530_CTRL_L00{lane}_R{read}_001.fastq.gz'
    for lane in range(1, 5)
    for read in (1, 2)
)
subprocess.run(
    f'loompy fromfq PRJNA626530_CTRL.loom PRJNA626530_CTRL {mouse_gencode_dir} metadata.tab {ctrl_fastq_files}',
    shell=True,
    cwd=proietti_dir,
    check=True,
)

## Preprocess dataset

In [None]:
# Load both conditions, make their gene names unique and stack them (DEN
# first); the 'condition' column records the origin of each cell.
adata_proietti_DEN = sc.read(f'{data_dir}/proietti_2021/PRJNA626530_DEN.loom')
adata_proietti_CTRL = sc.read(f'{data_dir}/proietti_2021/PRJNA626530_CTRL.loom')

for proietti_sample in (adata_proietti_DEN, adata_proietti_CTRL):
    proietti_sample.var_names_make_unique()

adata_proietti = adata_proietti_DEN.concatenate(
    adata_proietti_CTRL,
    batch_key='condition',
    batch_categories=['DEN', 'CTRL'],
)

In [None]:
# Basic QC metrics: flag genes whose name starts with 'mt-' as mitochondrial
# and compute the QC columns in place.
proietti_is_mito = adata_proietti.var_names.str.startswith('mt-')
adata_proietti.var['mt'] = proietti_is_mito
sc.pp.calculate_qc_metrics(
    adata_proietti,
    qc_vars=['mt'],
    percent_top=None,
    inplace=True,
)

In [None]:
sc.pl.violin(adata_proietti, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_proietti, x='total_counts', y='pct_counts_mt')

In [None]:
adata_proietti = adata_proietti[adata_proietti.obs['pct_counts_mt'] < 25]
adata_proietti = adata_proietti[adata_proietti.obs['n_genes_by_counts'] < 3000]

In [None]:
sc.pp.filter_cells(adata_proietti, min_genes=250)
sc.pp.filter_genes(adata_proietti, min_cells=5)

In [None]:
# Normalise counts per cell, then log1p-transform.
sc.pp.normalize_per_cell(adata_proietti)
sc.pp.log1p(adata_proietti)

In [None]:
# 30-component PCA, then a BBKNN neighbour graph balanced across the 'condition'
# batches, then triku (NOTE(review): presumably feature selection - confirm against
# the triku docs).
sc.pp.pca(adata_proietti, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_proietti, metric='angular', neighbors_within_batch=3, batch_key='condition')
tk.tl.triku(adata_proietti)

In [None]:
# UMAP embedding and Leiden clustering, both seeded; the plot colours the UMAP by
# cluster, counts, condition and mitochondrial percentage.
sc.tl.umap(adata_proietti, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_proietti, resolution=1.5, random_state=seed)
sc.pl.umap(adata_proietti, color=['leiden', 'n_counts', 'condition', 'pct_counts_mt'], legend_loc='on data', cmap=magma, ncols=2)

In [None]:
assign_cats(adata_proietti, dict_cats_general, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, 
            intermediate_states=False, diff=0.05, 
                key_added='cell_type', min_score=0.35,  others_name='unassigned')

In [None]:
for cat in dict_cats_general.keys():
    print(cat)
    sc.pl.umap(adata_proietti, color=['cell_type'] + [i for i in dict_cats_general[cat] if i in adata_proietti.var_names], ncols=3, cmap=magma)

### Adata saving

In [None]:
# Tag every cell with its dataset of origin and write the processed object to disk.
adata_proietti.obs['dataset'] = 'Proietti'
adata_proietti.write_h5ad('data/processed/proietti.h5')

# Ronzoni 2021

## Adata download and preprocessing

In [None]:
ronzoni_dir = os.getcwd() + '/data/ronzoni'
os.makedirs(ronzoni_dir, exist_ok=True)

In [None]:
!aria2c -x 16 -d {ronzoni_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4447nnn/GSM4447776/suppl/GSM4447776%5Fraw%5Fcounts%2Ecsv%2Egz
!aria2c -x 16 -d {ronzoni_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4447nnn/GSM4447776/suppl/GSM4447776%5Fmetadata%2Ecsv%2Egz

## Preprocess dataset

In [None]:
gene2symbol = pd.read_csv('data/mouse_gene_to_symbol.txt', sep='\t')
dictgene2symbol = dict(zip(gene2symbol['Gene stable ID'].values, gene2symbol['Gene name'].values))

In [None]:
adata_ronzoni = sc.read(ronzoni_dir+'/GSM4447776_raw_counts.csv.gz').transpose()

list_names = []
for gene in adata_ronzoni.var_names:
    try:
        list_names.append(dictgene2symbol[gene])
    except:
        list_names.append(gene)

adata_ronzoni.var_names = list_names
adata_ronzoni.var_names_make_unique()

In [None]:
# Basic QC filtering
# Genes whose name starts with 'mt-' are flagged so that calculate_qc_metrics also
# reports the per-cell mitochondrial percentage (used below as 'pct_counts_mt').
adata_ronzoni.var['mt'] = adata_ronzoni.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_ronzoni, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual inspection of the QC metrics: per-metric violins and counts vs. mitochondrial %.
sc.pl.violin(adata_ronzoni, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_ronzoni, x='total_counts', y='pct_counts_mt')

In [None]:
adata_ronzoni = adata_ronzoni[adata_ronzoni.obs['pct_counts_mt'] < 12]
adata_ronzoni = adata_ronzoni[adata_ronzoni.obs['n_genes_by_counts'] < 6000]

In [None]:
sc.pp.filter_cells(adata_ronzoni, min_genes=250)
sc.pp.filter_genes(adata_ronzoni, min_cells=5)

In [None]:
# Normalise counts per cell, then log1p-transform using base 10.
sc.pp.normalize_per_cell(adata_ronzoni)
sc.pp.log1p(adata_ronzoni, base=10)

In [None]:
# 30-component PCA, a seeded neighbour graph (no batch correction in this dataset),
# then triku (NOTE(review): presumably feature selection - confirm against the triku docs).
sc.pp.pca(adata_ronzoni, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_ronzoni, random_state=seed)
tk.tl.triku(adata_ronzoni)

In [None]:
# UMAP embedding and Leiden clustering, both seeded; the plot colours the UMAP by
# cluster and by counts.
sc.tl.umap(adata_ronzoni, min_dist=0.7, random_state=seed)
sc.tl.leiden(adata_ronzoni, resolution=0.5, random_state=seed)
sc.pl.umap(adata_ronzoni, color=['leiden', 'n_counts'], legend_loc='on data', cmap=magma, ncols=2)

In [None]:
assign_cats(adata_ronzoni, dict_cats_general, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05, 
                key_added='cell_type', min_score=0.5,  others_name='unassigned')

In [None]:
sc.pl.umap(adata_ronzoni, color=['cell_type'] +  ['Rgs5', 'Ndufa4l2', 'Pax7', 'Ncam2'], ncols=3, cmap=magma)

In [None]:
for cat in dict_cats_general.keys():
    print(cat)
    sc.pl.umap(adata_ronzoni, color=['cell_type'] + [i for i in dict_cats_general[cat] if i in adata_ronzoni.var_names], ncols=3, cmap=magma)

### Adata saving

In [None]:
# Tag every cell with its dataset of origin and write the processed object to disk.
adata_ronzoni.obs['dataset'] = 'Ronzoni'
adata_ronzoni.write_h5ad('data/processed/ronzoni.h5')

# Dell'Orso 2019

## Adata download and preprocessing

In [None]:
dellorso_dir = os.getcwd() + '/data/dellorso'
os.makedirs(dellorso_dir, exist_ok=True)

In [None]:
!aria2c -x 16 -d {dellorso_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3614nnn/GSM3614992/suppl/GSM3614992%5Ftotal%5Fmuscle%5Fwt%5Frep1%5Fbarcodes%2Etsv%2Egz
!aria2c -x 16 -d {dellorso_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3614nnn/GSM3614992/suppl/GSM3614992%5Ftotal%5Fmuscle%5Fwt%5Frep1%5Fgenes%2Etsv%2Egz
!aria2c -x 16 -d {dellorso_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3614nnn/GSM3614992/suppl/GSM3614992%5Ftotal%5Fmuscle%5Fwt%5Frep1%5Fmatrix%2Emtx%2Egz

!aria2c -x 16 -d {dellorso_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3614nnn/GSM3614993/suppl/GSM3614993%5Ftotal%5Fmuscle%5Fwt%5Frep2%5Fbarcodes%2Etsv%2Egz
!aria2c -x 16 -d {dellorso_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3614nnn/GSM3614993/suppl/GSM3614993%5Ftotal%5Fmuscle%5Fwt%5Frep2%5Fgenes%2Etsv%2Egz
!aria2c -x 16 -d {dellorso_dir} https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3614nnn/GSM3614993/suppl/GSM3614993%5Ftotal%5Fmuscle%5Fwt%5Frep2%5Fmatrix%2Emtx%2Egz

## Preprocess dataset

In [None]:
gene2symbol = pd.read_csv('data/mouse_gene_to_symbol.txt', sep='\t')
dictgene2symbol = dict(zip(gene2symbol['Gene stable ID'].values, gene2symbol['Gene name'].values))

In [None]:
adata_dellorso_1 = sc.read(dellorso_dir+'/GSM3614992_total_muscle_wt_rep1_matrix.mtx.gz').transpose()
adata_dellorso_1.obs_names = pd.read_csv(dellorso_dir+'/GSM3614992_total_muscle_wt_rep1_barcodes.tsv.gz', header=None)[0].values
adata_dellorso_1.var_names = pd.read_csv(dellorso_dir+'/GSM3614992_total_muscle_wt_rep1_genes.tsv.gz', header=None, sep='\t')[1].values
adata_dellorso_1.var_names_make_unique()

adata_dellorso_2 = sc.read(dellorso_dir+'/GSM3614993_total_muscle_wt_rep2_matrix.mtx.gz').transpose()
adata_dellorso_2.obs_names = pd.read_csv(dellorso_dir+'/GSM3614993_total_muscle_wt_rep2_barcodes.tsv.gz', header=None)[0].values
adata_dellorso_2.var_names = pd.read_csv(dellorso_dir+'/GSM3614993_total_muscle_wt_rep2_genes.tsv.gz', header=None, sep='\t')[1].values
adata_dellorso_2.var_names_make_unique()

In [None]:
adata_dellorso = sc.AnnData.concatenate(adata_dellorso_1, adata_dellorso_2)

In [None]:
# Basic QC filtering
# Genes whose name starts with 'mt-' are flagged so that calculate_qc_metrics also
# reports the per-cell mitochondrial percentage (used below as 'pct_counts_mt').
adata_dellorso.var['mt'] = adata_dellorso.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_dellorso, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual inspection of the QC metrics: per-metric violins and counts vs. mitochondrial %.
sc.pl.violin(adata_dellorso, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_dellorso, x='total_counts', y='pct_counts_mt')

In [None]:
adata_dellorso = adata_dellorso[adata_dellorso.obs['pct_counts_mt'] < 2.5]
adata_dellorso = adata_dellorso[adata_dellorso.obs['n_genes_by_counts'] < 3000]
adata_dellorso = adata_dellorso[adata_dellorso.obs['n_genes_by_counts'] > 500]

In [None]:
sc.pp.filter_cells(adata_dellorso, min_genes=250)
sc.pp.filter_genes(adata_dellorso, min_cells=5)

In [None]:
# Normalise counts per cell, then log1p-transform.
sc.pp.normalize_per_cell(adata_dellorso)
sc.pp.log1p(adata_dellorso)

In [None]:
# 30-component PCA, then a BBKNN neighbour graph. No batch_key is passed, so BBKNN's
# default batch column is used - presumably the 'batch' column produced by the
# concatenation of the two replicates (TODO confirm). triku runs afterwards
# (NOTE(review): presumably feature selection - confirm against the triku docs).
sc.pp.pca(adata_dellorso, random_state=seed, n_comps=30)
sce.pp.bbknn(adata_dellorso, neighbors_within_batch=3)
tk.tl.triku(adata_dellorso)

In [None]:
# UMAP embedding and Leiden clustering, both seeded; the plot colours the UMAP by
# cluster, batch and total counts.
sc.tl.umap(adata_dellorso, min_dist=0.7, random_state=seed)
sc.tl.leiden(adata_dellorso, resolution=0.8, random_state=seed)
sc.pl.umap(adata_dellorso, color=['leiden', 'batch', 'total_counts'], legend_loc='on data', cmap=magma, ncols=2)

In [None]:
assign_cats(adata_dellorso, dict_cats_general, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05, 
                key_added='cell_type', min_score=0.5,  others_name='unassigned')

In [None]:
sc.pl.umap(adata_dellorso, color=['cell_type'] + ['Mpz', 'S100b'], ncols=3, cmap=magma)

In [None]:
for cat in dict_cats_general.keys():
    print(cat)
    sc.pl.umap(adata_dellorso, color=['cell_type'] + [i for i in dict_cats_general[cat] if i in adata_dellorso.var_names], ncols=3, cmap=magma)

### Adata saving

In [None]:
# Tag every cell with its dataset of origin and write the processed object to disk.
adata_dellorso.obs['dataset'] = "Dell'Orso"
adata_dellorso.write_h5ad('data/processed/dellorso.h5')

# Fan et al 2021 (mouse) — endothelial cells only

In [None]:
fan_dir = os.getcwd() + '/data/fan'
os.makedirs(fan_dir, exist_ok=True)

In [None]:
SRR_dict = {'SRR12769482': 'Sedentary_1', 'SRR12769483': 'Sedentary_2', 'SRR12769484': '2weeks_run_1', 'SRR12769485': '2weeks_run_2'}

df = pd.DataFrame({'name': list(SRR_dict.values()), 'technology': ['10xv2'] * len(SRR_dict), 'targetnumcells': [5000] * len(SRR_dict)})
df.to_csv(fan_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# For every run: dump the FASTQ files, then build the sample's loom file from the
# _2 and _3 split files.
# NOTE(review): the reference index path is an absolute, machine-specific location;
# other cells pass {mouse_gencode_dir} to loompy instead - confirm whether both
# point at the same index.
for SRR, name in SRR_dict.items():
    print(SRR, name)
    dump_cmd = f"cd {fan_dir} && parallel-fastq-dump  -t 32 -s {SRR} --gzip --split-files"
    loom_cmd = (
        f"loompy fromfq {fan_dir}/{name}.loom {name} "
        f"/media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31 "
        f"{fan_dir}/metadata.tab {fan_dir}/{SRR}_2.fastq.gz {fan_dir}/{SRR}_3.fastq.gz"
    )
    os.system(dump_cmd)
    os.system(loom_cmd)

## Adata load and preprocessing

In [None]:
adata_fan_sedentary1 = sc.read(fan_dir + '/Sedentary_1.loom')
adata_fan_sedentary1.var_names_make_unique()
adata_fan_sedentary2 = sc.read(fan_dir + '/Sedentary_2.loom')
adata_fan_sedentary2.var_names_make_unique()


adata_fan_sedentary = sc.AnnData.concatenate(adata_fan_sedentary1, adata_fan_sedentary2)

In [None]:
adata_fan_2weeks_run_1 = sc.read(fan_dir + '/2weeks_run_1.loom')
adata_fan_2weeks_run_1.var_names_make_unique()
adata_fan_2weeks_run_2 = sc.read(fan_dir + '/2weeks_run_2.loom')
adata_fan_2weeks_run_2.var_names_make_unique()


adata_fan_2weeks_run = sc.AnnData.concatenate(adata_fan_2weeks_run_1, adata_fan_2weeks_run_2)

### sedentary processing

In [None]:
# Genes whose name starts with 'mt-' are flagged so that calculate_qc_metrics also
# reports the per-cell mitochondrial percentage (used below as 'pct_counts_mt').
adata_fan_sedentary.var['mt'] = adata_fan_sedentary.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_fan_sedentary, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual inspection of the QC metrics: per-metric violins and detected genes vs. mitochondrial %.
sc.pl.violin(adata_fan_sedentary, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_fan_sedentary, x='n_genes_by_counts', y='pct_counts_mt')

In [None]:
batches = sorted(list(set(adata_fan_sedentary.obs['batch'].values)))
for batch in batches:
    counts = adata_fan_sedentary.obs['n_genes_by_counts'].loc[adata_fan_sedentary.obs['batch'] == batch].values
    sns.distplot(counts)

In [None]:
adata_fan_sedentary = adata_fan_sedentary[((adata_fan_sedentary.obs.n_genes_by_counts < 1500) & 
                                    (adata_fan_sedentary.obs.n_genes_by_counts > 500)).values, :]
adata_fan_sedentary = adata_fan_sedentary[adata_fan_sedentary.obs.pct_counts_mt < 2.5, :]

In [None]:
# Drop genes with no counts left after cell filtering, normalise total counts per
# cell, then log1p-transform.
sc.pp.filter_genes(adata_fan_sedentary, min_counts=1)
sc.pp.normalize_total(adata_fan_sedentary)
sc.pp.log1p(adata_fan_sedentary)

In [None]:
# 30-component PCA, Harmony integration over 'batch', and a cosine neighbour graph
# built on the Harmony representation ('X_pca_harmony'). n_neighbors is derived from
# the square root of the number of cells. triku runs afterwards (NOTE(review):
# presumably feature selection - confirm against the triku docs).
sc.pp.pca(adata_fan_sedentary, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_fan_sedentary, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_fan_sedentary, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_fan_sedentary) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_fan_sedentary)

In [None]:
# UMAP embedding and Leiden clustering, both seeded; the plot colours the UMAP by
# cluster, batch and total counts.
sc.tl.umap(adata_fan_sedentary, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_fan_sedentary, resolution=2, random_state=seed)
sc.pl.umap(adata_fan_sedentary, color=['leiden', 'batch', 'total_counts'], legend_loc='on data', ncols=1)

In [None]:
assign_cats(adata_fan_sedentary, dict_cats_general, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05, 
                key_added='cell_type', min_score=0.4, others_name='unassigned')

In [None]:
sc.pl.umap(adata_fan_sedentary, color=['Vcam1', 'Clu', 'Car4', 'Itm2a'], ncols=3, cmap=magma)

In [None]:
# For each category, show the assigned cell types next to its marker genes.
# Markers absent from adata_fan_sedentary.var_names (e.g. dropped by the gene filter)
# are skipped, as in the other datasets: sc.pl.umap fails on a colour key that is
# neither a gene nor an obs column.
for cat in dict_cats_general.keys():
    print(cat)
    sc.pl.umap(adata_fan_sedentary, color=['cell_type'] + [i for i in dict_cats_general[cat] if i in adata_fan_sedentary.var_names], ncols=3, cmap=magma)

### 2 weeks processing

In [None]:
# Genes whose name starts with 'mt-' are flagged so that calculate_qc_metrics also
# reports the per-cell mitochondrial percentage (used below as 'pct_counts_mt').
adata_fan_2weeks_run.var['mt'] = adata_fan_2weeks_run.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_fan_2weeks_run, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual inspection of the QC metrics: per-metric violins and detected genes vs. mitochondrial %.
sc.pl.violin(adata_fan_2weeks_run, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_fan_2weeks_run, x='n_genes_by_counts', y='pct_counts_mt')

In [None]:
# Overlay the distribution of detected genes per cell, one curve per batch.
# The batch labels are taken from the 2-weeks-run object itself rather than from the
# sedentary object, so this cell does not depend on both sharing the same labels.
batches = sorted(list(set(adata_fan_2weeks_run.obs['batch'].values)))
for batch in batches:
    counts = adata_fan_2weeks_run.obs['n_genes_by_counts'].loc[adata_fan_2weeks_run.obs['batch'] == batch].values
    sns.distplot(counts)

In [None]:
# Keep cells with between 300 and 2000 detected genes (both bounds exclusive) and
# below 4 % mitochondrial counts.
n_genes = adata_fan_2weeks_run.obs['n_genes_by_counts']
adata_fan_2weeks_run = adata_fan_2weeks_run[((n_genes < 2000) & (n_genes > 300)).values, :]
adata_fan_2weeks_run = adata_fan_2weeks_run[adata_fan_2weeks_run.obs['pct_counts_mt'] < 4, :]

In [None]:
# Drop genes with no counts left after cell filtering, normalise total counts per
# cell, then log1p-transform.
sc.pp.filter_genes(adata_fan_2weeks_run, min_counts=1)
sc.pp.normalize_total(adata_fan_2weeks_run)
sc.pp.log1p(adata_fan_2weeks_run)

In [None]:
# 30-component PCA, Harmony integration over 'batch', and a cosine neighbour graph
# built on the Harmony representation ('X_pca_harmony'). n_neighbors is derived from
# the square root of the number of cells. triku runs afterwards (NOTE(review):
# presumably feature selection - confirm against the triku docs).
sc.pp.pca(adata_fan_2weeks_run, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_fan_2weeks_run, key='batch', max_iter_harmony=50)
sc.pp.neighbors(adata_fan_2weeks_run, use_rep='X_pca_harmony',  n_neighbors=int(0.5 * len(adata_fan_2weeks_run) ** 0.5 // 2), metric='cosine')
tk.tl.triku(adata_fan_2weeks_run)

In [None]:
# UMAP embedding and Leiden clustering, both seeded; the plot colours the UMAP by
# cluster, batch and total counts.
sc.tl.umap(adata_fan_2weeks_run, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_fan_2weeks_run, resolution=2, random_state=seed)
sc.pl.umap(adata_fan_2weeks_run, color=['leiden', 'batch', 'total_counts'], legend_loc='on data', ncols=1)

In [None]:
assign_cats(adata_fan_2weeks_run, dict_cats_general, column_groupby='leiden', quantile_gene_sel=0.7, do_return=False, intermediate_states=False, diff=0.05, 
                key_added='cell_type', min_score=0.4, others_name='unassigned')

In [None]:
sc.pl.umap(adata_fan_2weeks_run, color=['Vcam1', 'Clu', 'Car4', 'Itm2a'], ncols=3, cmap=magma)

In [None]:
# For each category, show the assigned cell types next to its marker genes.
# Markers absent from adata_fan_2weeks_run.var_names (e.g. dropped by the gene filter)
# are skipped, as in the other datasets: sc.pl.umap fails on a colour key that is
# neither a gene nor an obs column.
for cat in dict_cats_general.keys():
    print(cat)
    sc.pl.umap(adata_fan_2weeks_run, color=['cell_type'] + [i for i in dict_cats_general[cat] if i in adata_fan_2weeks_run.var_names], ncols=3, cmap=magma)

### Adata saving

In [None]:
# Tag every cell with its dataset of origin and write the processed object to disk.
adata_fan_sedentary.obs['dataset'] = 'Fan - Sedentary'
adata_fan_sedentary.write_h5ad('data/processed/fan_sedentary.h5')

In [None]:
# Tag and save the processed 2-weeks-run object (both replicates, filtered, clustered
# and annotated). The raw second replicate `adata_fan_2weeks_run_2` carries none of
# that processing and must not be the object written here.
adata_fan_2weeks_run.obs['dataset'] = 'Fan - 2 weeks run'
adata_fan_2weeks_run.write_h5ad('data/processed/fan_2weeks_run.h5')

# De Leinroth et al 2022 (mouse)

## Adata download and preprocessing

In [None]:
# The data was downloaded as an RDS file; a separate R script converts it to the CSV matrix read below.

In [None]:
df = pd.read_csv(data_dir+'leinroth/leinroth.csv', sep=',', index_col=0).transpose()
df_metadata = pd.read_csv(data_dir+'leinroth/leinroth_metadata.csv', sep=',', index_col=0)
df_metadata['seurat_clusters'] = df_metadata['seurat_clusters'].astype(str)

In [None]:
adata_leinroth = sc.AnnData(df)
adata_leinroth.obs = df_metadata

adata_leinroth.var_names_make_unique()

In [None]:
# Basic QC filtering
# Genes whose name starts with 'mt-' are flagged so that calculate_qc_metrics also
# reports the per-cell mitochondrial percentage (used below as 'pct_counts_mt').
adata_leinroth.var['mt'] = adata_leinroth.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_leinroth, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Visual inspection of the QC metrics before filtering.
sc.pl.violin(adata_leinroth, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_leinroth, x='n_genes_by_counts', y='pct_counts_mt')
sc.pl.scatter(adata_leinroth, x='total_counts', y='n_genes_by_counts')

In [None]:
adata_leinroth = adata_leinroth[adata_leinroth.obs.n_genes_by_counts < 5500, :]
adata_leinroth = adata_leinroth[adata_leinroth.obs.n_genes_by_counts > 2000, :]

adata_leinroth = adata_leinroth[adata_leinroth.obs.total_counts < 25000, :]
adata_leinroth = adata_leinroth[adata_leinroth.obs.pct_counts_mt < 10, :]

In [None]:
sc.pl.violin(adata_leinroth, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_leinroth, x='n_genes_by_counts', y='pct_counts_mt')
sc.pl.scatter(adata_leinroth, x='total_counts', y='n_genes_by_counts')

In [None]:
# Drop genes with no counts left after cell filtering.
sc.pp.filter_genes(adata_leinroth, min_counts=1)

In [None]:
# NOTE(review): this filter_genes call repeats the one in the previous cell; with no
# changes to the data in between it should remove nothing further.
sc.pp.filter_genes(adata_leinroth, min_counts=1)
# Normalise total counts per cell, then log1p-transform.
sc.pp.normalize_total(adata_leinroth)
sc.pp.log1p(adata_leinroth)

In [None]:
# PCA with the default number of components, a cosine neighbour graph, then triku
# (NOTE(review): presumably feature selection - confirm against the triku docs).
sc.pp.pca(adata_leinroth, random_state=seed)
sc.pp.neighbors(adata_leinroth, metric='cosine')
tk.tl.triku(adata_leinroth)

In [None]:
# UMAP embedding and Leiden clustering, both seeded; the plot colours the UMAP by
# Leiden cluster, the imported Seurat clusters and total counts.
sc.tl.umap(adata_leinroth, min_dist=0.4,  random_state=seed)
sc.tl.leiden(adata_leinroth, resolution=3.5, random_state=seed)
sc.pl.umap(adata_leinroth, color=['leiden', 'seurat_clusters', 'total_counts'], legend_loc='on data', ncols=2)

## Population characterization

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method.
sc.tl.rank_genes_groups(
    adata_leinroth,
    groupby='leiden',
    method='wilcoxon',
)

# Assign a cell type to every Leiden cluster from the marker dictionary; the result
# is stored under the 'cell_type' key.
assign_cats(
    adata_leinroth,
    dict_cats_general,
    column_groupby='leiden',
    quantile_gene_sel=0.7,
    do_return=False,
    intermediate_states=False,
    diff=0.05,
    key_added='cell_type',
    min_score=0.4,
    others_name='unassigned',
)

# Compare Leiden clusters, Seurat clusters and the assigned cell types on the UMAP.
sc.pl.umap(
    adata_leinroth,
    color=['leiden', 'seurat_clusters', 'cell_type'],
    legend_loc='on data',
    ncols=2,
)

### Adata saving

In [None]:
# Tag every cell with its dataset of origin and write the processed object to disk.
adata_leinroth.obs['dataset'] = 'Leinroth'
adata_leinroth.write_h5ad('data/processed/leinroth.h5')