In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import ray
import subprocess

In [None]:
seed = 10

In [None]:
data_dir = 'data/'

In [None]:
# Palettes for UMAP gene expression

magma = [plt.get_cmap('magma')(i) for i in np.linspace(0,1, 80)]
magma[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma[:65])

# Oprescu 2020

In [None]:
link = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE138nnn/GSE138826/suppl/GSE138826%5Fexpression%5Fmatrix%2Etxt%2Egz'

In [None]:
!wget {link} -P {data_dir}/oprescu

In [None]:
adata_oprescu = sc.read_text(data_dir+'/oprescu/GSE138826_expression_matrix.txt.gz').transpose()

In [None]:
adata_oprescu.obs['batch'] = [i.split('_')[0] for i in adata_oprescu.obs_names]

In [None]:
adata_oprescu_d0 = adata_oprescu[adata_oprescu.obs['batch'] == 'Noninjured']

In [None]:
# Basic QC filtering
adata_oprescu.var['mt'] = adata_oprescu.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_oprescu, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_oprescu, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_oprescu, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_oprescu, x='total_counts', y='n_genes_by_counts', color='batch')

In [None]:
# Optional upper thresholds on detected genes and mitochondrial percentage (currently disabled).
# adata_oprescu = adata_oprescu[adata_oprescu.obs.n_genes_by_counts < 5000, :]
# adata_oprescu = adata_oprescu[adata_oprescu.obs.pct_counts_mt < 30, :]

In [None]:
# Keep only cells with at least 250 detected genes.
sc.pp.filter_cells(adata_oprescu, min_genes=250)

In [None]:
# Remove genes without counts, then transform the matrix.
# NOTE(review): log1p is applied BEFORE the per-cell normalization; the usual scanpy order
# is normalize first, then log1p. Confirm the order is intentional before changing it,
# because it affects every downstream result.
sc.pp.filter_genes(adata_oprescu, min_counts=1)
sc.pp.log1p(adata_oprescu)
sc.pp.normalize_per_cell(adata_oprescu)

In [None]:
# Feature selection with triku, 30-component PCA and a cosine-distance neighbor graph,
# all with the shared seed.
# NOTE(review): in sc.pp.neighbors, `knn` is a boolean flag; the neighborhood size is
# `n_neighbors`. The number passed below therefore only acts as a truthy flag and
# `n_neighbors` stays at its default. If sqrt(n_cells) / 2 neighbors were intended, the call
# would need n_neighbors=int(len(adata_oprescu) ** 0.5 // 2); doing so changes the graph and
# the Leiden cluster numbering.
tk.tl.triku(adata_oprescu, n_procs=1, random_state=seed)
sc.pp.pca(adata_oprescu, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_oprescu, random_state=seed, knn=len(adata_oprescu) ** 0.5 // 2, metric='cosine')

In [None]:
# UMAP embedding and Leiden clustering (resolution 1), then a UMAP coloured by cluster,
# batch and 'n_counts' (presumably the per-cell total stored by normalize_per_cell — confirm).
sc.tl.umap(adata_oprescu, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_oprescu, resolution=1, random_state=seed)
sc.pl.umap(adata_oprescu, color=['leiden', 'batch', 'n_counts'], legend_loc='on data')

## Oprescu Day 0

In [None]:
# Keep only day-0 cells with at least 200 detected genes.
sc.pp.filter_cells(adata_oprescu_d0, min_genes=200)

In [None]:
# Remove genes without counts, then transform the matrix.
# NOTE(review): log1p is applied BEFORE the per-cell normalization; the usual scanpy order
# is normalize first, then log1p. Confirm the order is intentional.
sc.pp.filter_genes(adata_oprescu_d0, min_counts=1)
sc.pp.log1p(adata_oprescu_d0)
sc.pp.normalize_per_cell(adata_oprescu_d0)

In [None]:
# Feature selection with triku, 30-component PCA and a cosine-distance neighbor graph.
# NOTE(review): in sc.pp.neighbors, `knn` is a boolean flag; the neighborhood size is
# `n_neighbors`. The number passed below only acts as a truthy flag. Switching to
# n_neighbors=int(len(adata_oprescu_d0) ** 0.5 // 2) would change the Leiden cluster
# numbering that the cells further down refer to by label ('18', '19', ...).
tk.tl.triku(adata_oprescu_d0, n_procs=1, random_state=seed)
sc.pp.pca(adata_oprescu_d0, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_oprescu_d0, random_state=seed, knn=len(adata_oprescu_d0) ** 0.5 // 2, metric='cosine')

In [None]:
# UMAP embedding and Leiden clustering (resolution 1), then a UMAP coloured by cluster,
# batch and 'n_counts' (presumably the per-cell total stored by normalize_per_cell — confirm).
sc.tl.umap(adata_oprescu_d0, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_oprescu_d0, resolution=1, random_state=seed)
sc.pl.umap(adata_oprescu_d0, color=['leiden', 'batch', 'n_counts'], legend_loc='on data')

In [None]:
sc.pl.umap(adata_oprescu_d0, color=['leiden', 'Pdgfra', 'Lum', 'Dcn', 'Vim', 'Dpp4', 'Sfrp2', 'Apoe', 'Ccl19', 
                                    'Wisp2', 'Wif1', 'Col18a1', 'Postn', 'Col11a1', 'Crabp1', 'Coch'], legend_loc='on data', ncols=2, cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_oprescu_d0, groupby='leiden', method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_oprescu_d0, dendrogram=False, n_genes=20)

In [None]:
sc.tl.rank_genes_groups(adata_oprescu_d0, groupby='leiden', method='wilcoxon', groups=['18'], reference='rest', n_genes=2000)
sc.pl.rank_genes_groups_tracksplot(adata_oprescu_d0, dendrogram=False, n_genes=200)

We are interested in finding genes specific to cluster 18. To do that, we keep only the genes whose mean expression in cluster 18 is clearly higher than in each of the other clusters considered (0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 15).

In [None]:
# For every gene ranked for cluster 18, compute the ratio between its mean expression in
# cluster 18 and its highest mean expression among the comparison clusters.
# The per-cluster cell masks do not depend on the gene, so they are built once here
# instead of once per gene and cluster inside the loop.
X_oprescu_d0 = adata_oprescu_d0.X
leiden_oprescu_d0 = adata_oprescu_d0.obs['leiden']
mask_18 = leiden_oprescu_d0.isin(['18']).values
masks_other = [
    leiden_oprescu_d0.isin([group]).values
    for group in ['0', '1', '4', '5', '6', '9', '2', '3', '8', '13', '15', '11', '12']
]

genes_pos, ratio = [], []
for gene in tqdm(adata_oprescu_d0.uns['rank_genes_groups']['names']['18']):
    # Column position of the gene (var_names are expected to be unique).
    gene_idx = adata_oprescu_d0.var_names.get_loc(gene)

    list_means = [np.mean(X_oprescu_d0[mask, gene_idx]) for mask in masks_other]

    # A gene absent from every comparison cluster gives a zero denominator and thus an
    # infinite (or NaN) ratio, exactly as in a plain numpy division.
    r = np.mean(X_oprescu_d0[mask_18, gene_idx]) / max(list_means)

    ratio.append(r)
    genes_pos.append(gene)

In [None]:
selected = np.array(ratio) > 10**0.25

In [None]:
plt.scatter(np.random.random(len(ratio)), np.log10(ratio), color=['#007ab7' if i else '#ababab' for i in selected])

In [None]:
selected_genes_18_oprescu = np.array(genes_pos)[np.array(selected).astype(bool)]
selected_genes_18_oprescu

In [None]:
sc.tl.rank_genes_groups(adata_oprescu_d0, groupby='leiden', method='wilcoxon', groups=['19'], reference='rest', n_genes=200)
sc.pl.rank_genes_groups_tracksplot(adata_oprescu_d0, dendrogram=False, n_genes=200)

In [None]:
sc.pl.umap(adata_oprescu_d0, color=['leiden', 'Cd34', 'Pdgfrb', 
                                    'Ngfr'], legend_loc='on data', cmap=magma, ncols=2)

In [None]:
sc.pl.umap(adata_oprescu_d0, color=['leiden', 'Sox9', 'Col9a2', 'Shisa3', 'Csrp1', 'Rasgrp2', 
                                    'Gfra1', 'Glb1', 'Atxn1', 'Cspg4'], legend_loc='on data', cmap=magma, ncols=2)

# Scott 2019

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976778/suppl/GSM2976778%5Fqsnt%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976778/suppl/GSM2976778%5Fqsnt%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976778/suppl/GSM2976778%5Fqsnt%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
adata_scott_d0 = sc.read_mtx(data_dir+'scott/GSM2976778_qsnt_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976778_qsnt_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976778_qsnt_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d0.var_names = features
adata_scott_d0.obs_names = barcodes

In [None]:
adata_scott_d0.var_names_make_unique()

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976779/suppl/GSM2976779%5Fd1%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976779/suppl/GSM2976779%5Fd1%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976779/suppl/GSM2976779%5Fd1%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
adata_scott_d1 = sc.read_mtx(data_dir+'scott/GSM2976779_d1_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976779_d1_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976779_d1_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d1.var_names = features
adata_scott_d1.obs_names = barcodes

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976780/suppl/GSM2976780%5Fd2%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976780/suppl/GSM2976780%5Fd2%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976780/suppl/GSM2976780%5Fd2%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
adata_scott_d2 = sc.read_mtx(data_dir+'scott/GSM2976780_d2_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976780_d2_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976780_d2_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d2.var_names = features
adata_scott_d2.obs_names = barcodes

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976781/suppl/GSM2976781%5Fd4%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976781/suppl/GSM2976781%5Fd4%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976781/suppl/GSM2976781%5Fd4%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
adata_scott_d4 = sc.read_mtx(data_dir+'scott/GSM2976781_d4_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976781_d4_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976781_d4_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d4.var_names = features
adata_scott_d4.obs_names = barcodes

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976782/suppl/GSM2976782%5Fd14%5Fbarcodes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976782/suppl/GSM2976782%5Fd14%5Fgenes%2Etsv%2Egz -P {data_dir}/scott
!wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2976nnn/GSM2976782/suppl/GSM2976782%5Fd14%5Fmatrix%2Emtx%2Egz -P {data_dir}/scott

In [None]:
adata_scott_d14 = sc.read_mtx(data_dir+'scott/GSM2976782_d14_matrix.mtx.gz').transpose()
barcodes = pd.read_csv(data_dir+'scott/GSM2976782_d14_barcodes.tsv.gz', sep='\t', header=None)[0].values
features = pd.read_csv(data_dir+'scott/GSM2976782_d14_genes.tsv.gz', sep='\t', header=None)[1].values

adata_scott_d14.var_names = features
adata_scott_d14.obs_names = barcodes

## Adata Scott D0

In [None]:
adata_scott_d0.X = np.asarray(adata_scott_d0.X.todense())

In [None]:
# Basic QC filtering
adata_scott_d0.var['mt'] = adata_scott_d0.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_scott_d0, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
sc.pl.violin(adata_scott_d0, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

sc.pl.scatter(adata_scott_d0, x='total_counts', y='pct_counts_mt')


In [None]:
# Optional upper thresholds on detected genes and mitochondrial percentage (currently disabled).
# NOTE(review): the original commented-out lines referred to `adata_oprescu`, apparently a
# copy-paste leftover; they are written for `adata_scott_d0` here.
# adata_scott_d0 = adata_scott_d0[adata_scott_d0.obs.n_genes_by_counts < 5000, :]
# adata_scott_d0 = adata_scott_d0[adata_scott_d0.obs.pct_counts_mt < 30, :]

In [None]:
# Keep only cells with at least 250 detected genes.
sc.pp.filter_cells(adata_scott_d0, min_genes=250)

In [None]:
# Remove genes without counts, then transform the matrix.
# NOTE(review): log1p is applied BEFORE the per-cell normalization; the usual scanpy order
# is normalize first, then log1p. Confirm the order is intentional.
sc.pp.filter_genes(adata_scott_d0, min_counts=1)
sc.pp.log1p(adata_scott_d0)
sc.pp.normalize_per_cell(adata_scott_d0)

In [None]:
# Feature selection with triku, 30-component PCA and a cosine-distance neighbor graph.
# NOTE(review): in sc.pp.neighbors, `knn` is a boolean flag; the neighborhood size is
# `n_neighbors`. The number passed below only acts as a truthy flag. Switching to
# n_neighbors=int(len(adata_scott_d0) ** 0.5 // 2) would change the Leiden cluster
# numbering that the cells further down refer to by label ('5', '13', ...).
tk.tl.triku(adata_scott_d0, n_procs=1, random_state=seed)
sc.pp.pca(adata_scott_d0, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_scott_d0, random_state=seed, knn=len(adata_scott_d0) ** 0.5 // 2, metric='cosine')

In [None]:
# UMAP embedding and Leiden clustering (resolution 1.5), then a UMAP coloured by cluster
# and 'n_counts' (presumably the per-cell total stored by normalize_per_cell — confirm).
sc.tl.umap(adata_scott_d0, min_dist=0.1, random_state=seed)
sc.tl.leiden(adata_scott_d0, resolution=1.5, random_state=seed)
sc.pl.umap(adata_scott_d0, color=['leiden', 'n_counts'], legend_loc='on data')

In [None]:
sc.tl.rank_genes_groups(adata_scott_d0, groupby='leiden', method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_scott_d0, dendrogram=False, n_genes=50)

In [None]:
sc.pl.umap(adata_scott_d0, color=['leiden', 'Pdgfra', 'Lum', 'Dcn', 'Vim', 'Dpp4', 'Sfrp2', 'Apoe', 'Ccl19', 
                                    'Wisp2', 'Wif1', 'Col18a1', 'Postn', 'Col11a1', 'Crabp1', 'Coch'], legend_loc='on data', ncols=2, cmap=magma)

In [None]:
sc.tl.rank_genes_groups(adata_scott_d0, groupby='leiden', method='wilcoxon', groups=['5', '13'], reference='rest', n_genes=2000)
sc.pl.rank_genes_groups_tracksplot(adata_scott_d0, dendrogram=False, n_genes=100)

In [None]:
# For every gene ranked for cluster 5 or 13, compute the ratio between its mean expression
# in clusters 5 + 13 (pooled) and its highest mean expression among the comparison clusters.
# The per-cluster cell masks do not depend on the gene, so they are built once here
# instead of once per gene and cluster inside the loop.
X_scott_d0 = adata_scott_d0.X
leiden_scott_d0 = adata_scott_d0.obs['leiden']
mask_5_13 = leiden_scott_d0.isin(['5', '13']).values
masks_other = [
    leiden_scott_d0.isin([group]).values
    for group in ['6', '8', '2', '10', '11', '7', '9', '1', '12', '3', '4']
]

# Union of both rankings; sorted() makes the iteration order (and therefore the order of
# `genes_pos` and `ratio`) identical on every run, which a bare set does not guarantee.
scott_ranked = adata_scott_d0.uns['rank_genes_groups']['names']
candidate_genes = sorted(set(list(scott_ranked['5']) + list(scott_ranked['13'])))

genes_pos, ratio = [], []
for gene in tqdm(candidate_genes):
    # Column position of the gene (var_names were made unique after loading).
    gene_idx = adata_scott_d0.var_names.get_loc(gene)

    list_means = [np.mean(X_scott_d0[mask, gene_idx]) for mask in masks_other]

    # A gene absent from every comparison cluster gives a zero denominator and thus an
    # infinite (or NaN) ratio, exactly as in a plain numpy division.
    r = np.mean(X_scott_d0[mask_5_13, gene_idx]) / max(list_means)

    ratio.append(r)
    genes_pos.append(gene)

In [None]:
selected = np.array(ratio) > 10**0.25

In [None]:
plt.scatter(np.random.random(len(ratio)), np.log10(ratio), color=['#007ab7' if i else '#ababab' for i in selected])

In [None]:
selected_genes_9_scott = np.array(genes_pos)[np.array(selected).astype(bool)]
np.array(sorted(selected_genes_9_scott))

In [None]:
selected_genes = np.intersect1d(selected_genes_9_scott, selected_genes_18_oprescu)

In [None]:
selected_genes

In [None]:
sc.pl.umap(adata_scott_d0, color=selected_genes, legend_loc='on data', cmap=magma, ncols=3)

In [None]:
sc.pl.umap(adata_oprescu_d0, color=selected_genes, legend_loc='on data', cmap=magma, ncols=3)

In [None]:
# Genes that are exclusive of cluster 18 in Oprescu
filtered_genes = ['Bricd5', 'Col9a2', 'Dlk1', 'Grin2b', 'Mpzl2', 'Saa1', 'Shisa3', 'Tenm2']


sc.pl.umap(adata_oprescu_d0, color=['leiden'] + filtered_genes, legend_loc='on data', cmap=magma, ncols=3)
sc.pl.umap(adata_scott_d0, color=['leiden'] + filtered_genes, legend_loc='on data', cmap=magma, ncols=3)

In [None]:
sc.pl.umap(adata_oprescu_d0, color=['Nipal1', 'Trpm6', 'S100b', 'Gpld1', 'Plxdc1', 'Gfra2',
                                  'Cd38', 'Cd300lg'], legend_loc='on data', cmap=magma, ncols=3)

In [None]:
sc.pl.umap(adata_scott_d0, color=['Nipal1', 'Trpm6', 'S100b', 'Gpld1', 'Plxdc1', 'Gfra2',
                                  'Cd38', 'Cd300lg'], legend_loc='on data', cmap=magma, ncols=3)