In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scanpy as sc
import sys

In [None]:
# Scanpy logging level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Shared figure defaults for every scanpy plot in this notebook.
sc.settings.set_figure_params(
    dpi=80,
    facecolor='white',
    frameon=False,
    figsize=(5, 5),
)

In [None]:
from find_merfish_markers import *

# Script to select cell type marker genes for MERFISH

In [None]:
# Load the annotated dataset that every later cell slices from.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
adata = sc.read(
    "/faststorage/brain_aging/rna_analysis/adata_finalclusts_annot.h5ad"
)

# Known markers

In [None]:

# Curated marker gene lists. Lists may contain duplicates; they are de-duplicated
# where they are combined (np.unique / set) further down.

# Cortex major cell-type markers (first batch).
cortex_major_markers = ['Slc17a7', 'Slc32a1', 'Slc30a3', 'Cux2', 'Rorb', 'Sulf2',
                  'Ptpru', 'Car3', 'Fam84b', 'Syt6', 'Nxph4', 'Tshz2', 'Pvalb', 'Sst', 'Vip',
                 'Sncg', 'Lamp5', 'Sox10', 'Pdgfra', 'Aqp4', 'Igf2', 'Ctss', 'Cldn5', 'Flt1', 'Bgn', 'Vtn', 'Gfap',
                        'Gad1', 'Gad2', 'Fn1', 'Myh11', 'Cd24a', 'Selplg', 'Pdgfra', 'Aqp4', 'Mbp', 'Ttyh2',
                        "Crhbp", "Cnr1", "Cpne5", "Crh", "Kcnip2", "Tbr1", "Lamp5", "Rorb", "Syt6", "Aldoc", "Gfap",
                        "Serpinf1", "Mfge8", "Sox10", "Plp1", "Pdgfra", "Tmem8", "Itpr2", "Ctps", "Bmp4", "Anln",
                        "Hexb", "Mrc1", "Vtn", "Flt1", "Apln", "Acta2", "Ttr", "Foxj1"
                       ]

# Cortex major cell-type markers (second batch, overlaps with the first).
cortex_major_markers.extend([
"Acta2","Aqp4","Bgn", "Calb2","Car3", "Cd14", "Chat", "Chodl", "Chrna2", "Cldn5", "Crhr2",
"Crispld2","Cspg4","Ctss","Cux2","Egfr","Enpp6","Fam84b","Fezf2","Flt1","Foxp2","Gfap","Hpse","Igf2","Kcnj8",
"Lhx6","Lmo1","Lsp1","Mrc1","Nxph2","Nxph4","Opalin","Osr1","Otof","Pdgfra","Prox1","Rorb","Rspo1","Rxfp1",
"Satb2","Serpinf1","Slc17a6","Slc17a8","Slc30a3","Slc32a1","Sncg","Sox10","Sox6","Sulf1","Syt6","Tcap","Th","Tshz2",
"Vipr2","Vtn","Vip","Sst","Calb1","Gad2","Slc17a7","Lamp5","Gad1","Pvalb","Fezf2", "Bcl11b", "Npr3", "Otof"
    ])

# De-duplicate; sorted() (instead of list(set(...))) keeps the order identical between runs.
cortex_major_markers = sorted(set(cortex_major_markers))

# Hypothalamus major markers (not referenced again in the visible cells).
hypo_major_markers = [
    'Agtr1a', 'Pomc', 'Oxt', 'Npy', 'Agrp', 'Esr1', 'Slc17a6',
    'Meis2', 'Th', 'Gpr101', 'Hcrt', 'Nrgn', 'Sst', 'Map1b', 'Nts', 'Pmch', 'Cartpt',
    'Gpr83', 'Bdnf', 'Otp', 'Calb2', 'Tac1', 'Tac2', 'Calb1', 'Trh', 'Gal', 'Col25a1', 'Synpr'
]

# Moffitt gene panel (not referenced again in the visible cells).
# Fix: "Gabrg1" and "Gad1" were missing a separating comma, so Python's implicit
# string-literal concatenation produced the single bogus entry "Gabrg1Gad1".
moffitt_genes = [
    "Ace2", "Adora2a", "Aldh1l1", "Amigo2", "Ano3", "Aqp4", "Ar", "Arhgap36", "Avpr1a", "Avpr2",
    "Baiap2", "Bdnf", "Bmp7", "Brs3", "Calcr", "Cbln1", "Cbln2", "Cckar", "Cckbr", "Ccnd2",
    "Cd24a", "Cdkn1a", "Cenpe", "Chat", "Coch", "Col25a1", "Cplx3", "Cpne5", "Creb3l1", "Crhbp",
    "Crhr1", "Crhr2", "Cspg5", "Cxcl14", "Cyp19a1", "Cyp26a1", "Cyr61", "Dgkk", "Ebf3", "Egr2",
    "Ermn", "Esr1", "Etv1", "Fbxw13", "Fezf1", "Fn1", "Fst", "Gabra1", "Gabrg1", "Gad1",
    "Galr1", "Galr2", "Gbx2", "Gda", "Gem", "Gjc3", "Glra3", "Gpr165", "Greb1", "Grpr",
    "Htr2c", "Igf1r", "Igf2r", "Irs4", "Isl1", "Kiss1r", "Klf4", "Lepr", "Lmod1", "Lpar1",
    "Man1a", "Mc4r", "Mki67", "Mlc1", "Myh11", "Ndnf", "Ndrg1", "Necab1", "Nos1", "Npas1",
    "Npy1r", "Npy2r", "Ntng1", "Ntsr1", "Nup62cl", "Omp", "Onecut2", "Opalin", "Oprd1", "Oprk1",
    "Oprl1", "Oxtr", "Pak3", "Pcdh11x", "Pdgfra", "Pgr", "Plin3", "Pnoc", "Pou3f2", "Prlr",
    "Ramp3", "Rgs2", "Rgs5", "Rnd3", "Rxfp1", "Scgn", "Selplg", "Sema3c", "Sema4d", "Serpinb1b",
    "Serpine1", "Sgk1", "Slc15a3", "Slc17a6", "Slc17a7", "Slc17a8", "Slc18a2", "Slco1a4", "Sox4", "Sox6",
    "Sox8", "Sp9", "Synpr", "Syt2", "Syt4", "Sytl4", "Tacr1", "Tacr3", "Tiparp", "Tmem108",
    "Traf4", "Trhr", "Ttn", "Ttyh2", "Oxt", "Penk", "Sst", "Tac1", "Gal", "Cartpt",
    "Vgf", "Trh", "Nts", "Scg2", "Gnrh1", "Tac2", "Cck", "Crh", "Ucn3", "Adcyap1",
    "Nnat", "Sln", "Mbp",
    "Th"
]

# Immune cell-type markers.
t_cell_genes = [
    "Ptprc", "Rorc", "Gata3", "Foxp3", "Tbx21", "Il2ra", "Il7r", "Il2rb", "Il2rg", "Il15ra", "Pdcd1", "Ctla4", "Cd3e"
]
# Fix: "Adgr1" is not a gene symbol; the F4/80 macrophage marker is "Adgre1"
# (older alias "Emr1", which this notebook uses in senescence_high).
# NOTE(review): confirm which symbol the dataset's annotation uses.
macrophage_genes = [
    "Spi1", "Cx3cr1", "Ccr2", "Adgre1", "Aif1", "Csf1r", "Trem2", "H2-Ab1", "Itgae", "Clec10a", "Itgam", "Itgax"
]
bcell_genes = ["Ms4a1", "Cd19", "Prdm1"]
nkcell_genes = ["Klrk1", "Klrb1", "Eomes", "Klrg1"]
misc_immune = ["Cxcl9", "Cxcl10", "Ccl2", "Cd1d1", "Fcer1a", "Fcgr1", "Cr2", "Cd47"]

# Immune response programs.
innate_bacterial = ['Il1b', 'Tnf', 'Il6', 'Ptges2']
innate_viral = ['Ifna12', 'Ifna16', 'Ifna2']
th1 = ['Ifng', 'Il12a']
th2 = ['Il4', 'Il5', 'Il13']
th17 = ['Il17a', 'Il17f', 'Il22', 'Il23a']
treg = ['Il10', 'Tgfb2', 'Tgfb1', 'Tgfb3']
other_immune = ["Nfkb1", "Nfkbia", "Irf3", "Nlrp3", "Irf7", "Gsdmd", "Il18"]

# Small set of aging genes that is always included with the cell-type markers.
minimal_aging = ["C3", "C4b", "Il33","Tnf","Cdkn2a", "Cdkn2b", 'B2m', 'C1qa', 'C1qc', 'C4b', 'Ctss', 'Gbp6', 'Gbp10', 'Ifi44', 'Ifit3', 'Ifitm3', 'Itgb2', 'Parp14', 'Serpina3n', 'Tap1', 'Trim30a']

In [None]:
# Sorted, de-duplicated union of every curated list that applies to PFC.
known_markers_pfc = np.unique(np.hstack((
    cortex_major_markers,
    t_cell_genes,
    macrophage_genes,
    bcell_genes,
    nkcell_genes,
    misc_immune,
    innate_bacterial,
    innate_viral,
    th1,
    th2,
    th17,
    treg,
    other_immune,
    minimal_aging,
)))

# Select MERFISH genes for cell type markers in PFC 

In [None]:
# Keep only the cells whose `area` annotation is PFC.
is_pfc_cell = adata.obs["area"] == "PFC"
adata_pfc = adata[is_pfc_cell]

In [None]:
# Meng/Stephen approach: for pairs of clusters, compute differential expression.
# Start from the raw matrix of the PFC cells, then keep the highly variable genes only.
adata_raw = adata_pfc.raw.to_adata()
hvg_mask = adata_raw.var["highly_variable"]
adata_raw = adata_raw[:, hvg_mask]

In [None]:
# Pairwise differential expression, at sub-cluster and major cell-type resolution.
de_subclusts = compute_pairwise_de_for_clusts(
    adata_raw, "clust_label", n_de=10
)
de_majorclusts = compute_pairwise_de_for_clusts(
    adata_raw, "cell_type", n_de=10
)

# One-vs-all differential expression at the same two resolutions.
minorclusts_onevsall = compute_onevsall_de_for_clusts(
    adata_raw, 'clust_label', n_de=15
)
majorclusts_onevsall = compute_onevsall_de_for_clusts(
    adata_raw, 'cell_type', n_de=15
)


In [None]:
# Pairwise selection is currently disabled:
#de_minorclust_pairwise = greedily_select_markers(de_subclusts, 1, pairwise=True,de_marker_genes=known_markers_pfc)
#de_majorclust_pairwise = greedily_select_markers(de_majorclusts, 5, pairwise=True,de_marker_genes=known_markers_pfc)

# Greedy selection from the one-vs-all results, passing the curated markers as de_marker_genes.
de_minorclusts_onevsall = greedily_select_markers(
    minorclusts_onevsall,
    2,
    pairwise=False,
    de_marker_genes=known_markers_pfc,
)
de_majorclusts_onevsall = greedily_select_markers(
    majorclusts_onevsall,
    2,
    pairwise=False,
    de_marker_genes=known_markers_pfc,
)

# List of known markers

In [None]:
# Sorted union of the minor- and major-cluster one-vs-all selections.
de_combined = list(np.unique([*de_minorclusts_onevsall, *de_majorclusts_onevsall]))

In [None]:
# Display the combined cell-type marker list for inspection.
de_combined

In [None]:
# Rebuild the raw PFC matrix with all genes (the earlier adata_raw held highly variable genes only).
adata_raw = adata_pfc.raw.to_adata()

# Drop any selected marker that is absent from the raw matrix.
pfc_gene_names = adata_raw.var_names
de_combined = [gene for gene in de_combined if gene in pfc_gene_names]

In [None]:
# Number of cell-type markers that are present in the raw matrix.
len(de_combined)

In [None]:
# load per cluster markers and take top N
#seurat_clust_markers = pd.read_csv("gene_lists/all_clust_markers.csv")
#min_marker_genes = 4
#seurat_de_marker_genes = set()
#for n,i in enumerate(clust_labels_uniq):
#    curr_contrast = seurat_clust_markers[seurat_clust_markers.cluster==i].sort_values('avg_log2FC', ascending=False)
#    curr_genes = list(curr_contrast.gene)[:3]
#    for k in curr_genes:
#        seurat_de_marker_genes.add(k)
#seurat_de_marker_genes = list(seurat_de_marker_genes)

In [None]:
# Per-cluster mean expression: once over all genes, once over the selected markers.
marker_clust_avgs = []
clust_avgs = []
for label in adata_raw.obs.clust_label.unique():
    cluster_cells = adata_raw[adata_raw.obs.clust_label == label, :]
    clust_avgs.append(compute_mean_expression(cluster_cells))
    marker_clust_avgs.append(compute_mean_expression(cluster_cells[:, de_combined]))

In [None]:
# Clustered cell-type-by-gene plot for the selected cell-type markers, without normalization.
# NOTE(review): the helper comes from find_merfish_markers; its exact output is not visible here.
plot_clustered_celltypes_by_genes(adata_raw, de_combined,normalize=False)

In [None]:
# Sanity check: shape of the per-cell sum over the selected marker columns.
adata_raw[:,de_combined].X.sum(1).shape

In [None]:
# Size of the cell-type marker panel.
len(de_combined)


# Select MERFISH genes for aging

1. Cell types:
	1. Pairwise DE of major clusts
	2. Pairwise DE of minor clusts
	3. One-vs-all DE of major clusts
	4. One-vs-all DE of minor clusts
	5. Random forest features of major clusts
	6. Random forest features of minor clusts
2. Aging markers:
	1. Random forest features of major clusts
	2. Random forest features of minor clusts
	3. NB differential expression of major clusts
	4. NB differential expression of minor clusts
	5. TF random forests of major clusts
	6. TF random forests of minor clusts

In [None]:
# Load aging inputs: the TF feature genes and the NB-GLM age DE tables (minor and major clusters).
age_tf_feats = pd.read_csv("gene_lists/age_tf_feats.csv")["gene"].tolist()
age_de_minor = pd.read_csv("gene_lists/nb_glm_age_de_minor.csv")
age_de_major = pd.read_csv("gene_lists/nb_glm_age_de_major.csv")

In [None]:
# Keep only rows with q-value below 1e-6 in both tables.
age_de_major = age_de_major.loc[age_de_major["qval"] < 1e-6]
age_de_minor = age_de_minor.loc[age_de_minor["qval"] < 1e-6]

In [None]:
# Add a `log10fc` column holding the absolute value of the fitted coefficient.
# `.assign` returns a new frame, which avoids assigning a column onto the
# boolean-filtered frame from the previous cell (pandas SettingWithCopyWarning)
# and keeps this cell safe to re-run.
# NOTE(review): the column is called log10fc but stores |coef|; confirm the units of `coef`.
age_de_major = age_de_major.assign(log10fc=np.abs(age_de_major.coef))
# NOTE(review): the minor table gets no `log10fc` column here; confirm select_age_markers does not need it.
#age_de_minor['log10fc'] = np.abs(age_de_minor.coef)

In [None]:
def _frames_by_cell_type(de_table):
    """Split a DE table into one sub-frame per distinct value of its `cell_type` column."""
    return {
        label: de_table[de_table.cell_type == label]
        for label in de_table.cell_type.unique()
    }


# Select aging markers per group: 5 per entry of the major table, 2 per entry of the minor table.
age_de_major_markers = list(select_age_markers(_frames_by_cell_type(age_de_major), 5))
age_de_minor_markers = list(select_age_markers(_frames_by_cell_type(age_de_minor), 2))


In [None]:
# De-duplicated union of the data-driven aging markers and the TF feature genes.
combined_age_markers = list({*age_de_minor_markers, *age_de_major_markers, *age_tf_feats})

In [None]:
# Number of data-driven aging markers before the literature genes are added.
len(combined_age_markers)

In [None]:
# Sort so the list order no longer depends on set iteration order.
combined_age_markers = sorted(combined_age_markers)

# Add in markers from literature

In [None]:
# Microglia reactivity signature
# IL-6, TGFbeta1, IL10, IL-12/p40, IL-1beta, TNFalpha
# The list keeps its duplicates; de-duplication happens when the aging lists are merged later.
# NOTE(review): 'Il21a' does not look like a standard gene symbol (the comment above mentions IL-12) - confirm.
aging_microglia = [
    # first group
    'Ccl4', 'Lgals3', 'Ms4a7', 'Ifitm3',
    # cytokines
    'Il10', 'Il6', 'Il21a', 'Il12b', 'Il1b', 'Tnf',
    # second group
    "Tmem119", "Apoe", "Cst7", "Clec7a", "Lpl", "Hif1a", "Igf1",
    "Cd74", "Ifit2", "Ifit3", "Irf7", "Oasl2", "Top2a", "Mcm2",
    # third group
    "Tyrobp", "Ctsb", "Ctsd", "Fth1", "Lyz2", "Axl", "Cst7", "Trem2",
    "Cst7", "Lpl", "Cd9", "Csf1", "Ccl6", "Itgax", "Clec7a", "Lilrb4",
    "Timp2", "Marcks", "Serinc3", "P2ry12", "Cd9", "Cd63",
]

In [None]:
# Aging astrocytes -- The Aging Astrocyte Transcriptome from Multiple Regions of the Mouse Brain, Boisvert et al
# Fix: 'Sprina3m' was a transposition of 'Serpina3m'; the misspelt symbol can never
# match a gene in the data and is silently dropped by the later var_names filter.
# NOTE(review): 'Pcdh6', 'Rpk4' and 'Cdx8' also do not look like standard mouse gene
# symbols (perhaps Pcdhb6 / Ripk4?) - left unchanged, confirm against the paper.
aging_astro_allregions = ['Serpina3m', 'Serpina3n', 'C4b', 'Pcdh6', 'Pcdhb1', 'Gfap', 'Prss50', # upregulated
                          'Gpx8', 'Hspa1b', 'Hspa1a', 'Rsrp1']
# Region-specific genes; np.unique sorts and removes the entries repeated across regions.
aging_astro_regionspecific = list(np.unique(['Serpina3f', 'Rpk4', 'Timp1', 'Fbln5', 'Plin4', 'Rab20', 'Capg', 'Zc3hav1', 'Gbp2', 'Ifi35', 'Hs3st3a1', 'Mboat1', 'Psmb8', 'Cyp27a1',
                              'Serpina3f', 'Cdr1', 'Zbtb20', 'Grin2b', 'Hipk2', 'Tcp11l1', 'Ago3', 'Oasl2', 'Lnpep', 'Gan', 'Aqp2', 'Bst2', 'Hmbox1', 'Zc3hav1',
                             'Serpina3f', 'Cdr1', 'Lars2', 'Zbtb20', 'Grin2b', 'Rpk4', 'Nr5a1', 'Slc22a18', 'Timp1', 'Fcgr2b', 'Hipk2', 'C3', 'Osmr', 'Oasl2', 'Nupr1', # up
                                            'Bmp4', 'Kiss1', 'Fst', 'Cyr61', 'Tead2', 'Dnajb1', 'Banp', 'Cdx8', 'Rbm12b1', 'Ece2', 
                                            'Bmp4', 'Cd38', 'Sptbn2', 'Sptb', 'Pcdh20', 'Eif5b', 'Gm7120', 'Sptan1', 'Hmgcr', 'Trio',
                                            'Sspo', 'Wfdc2', 'Ttr', 'Ctgf', 'Thbs4', 'Bmp4', 'Prom1', 'Sptbn2', 'Bgn', 'Tnc', 'Sparc']))
# Sorted, de-duplicated union of both astrocyte lists.
combined_astro_marker = list(np.unique(aging_astro_allregions+aging_astro_regionspecific))

In [None]:
# senescence genes
senescence_high = ['Retnla', 'Tnf', 'Cdkn2a', 'Itgax', 'Il12b', 'Il18', 'Cd68', 'Fcgr1',
       'Parp14', 'Fcna', 'Cd36', 'Cd38', 'Bst1', 'Itgam', 'Emr1', 'Irg1',
       'Il1b', 'Lmnb1', 'Il10', 'Fabp4', 'Lyve1', 'Mrc1', 'Nampt', 'Nadk',
       'Bmi1', 'Sirt7']

senescence_low = ['Sirt1', 'Nfkbiz', 'Cdkn1a', 'Tiparp', 'Trp53',
       'Sirt5', 'Csf1', 'Nfkb1', 'Parp6', 'Sirt2', 'Nnmt', 'Hmgb1', 'Bcl2l2',
       'Nt5e', 'Sirt3', 'Serpine1', 'Arg1', 'Parp10', 'Ccl2', 'Il6', 'Nmnat3',
       'Cdkn2b', 'Il12a', 'Parp12', 'Parp9', 'Parp11', 'Parp8', 'Sirt6',
       'Sirt4', 'Mgl2', 'Parp3', 'Zc3hav1', 'Tnks', 'Parp4', 'Parp2', 'Sarm1',
       'Parp16', 'Nmnat2', 'Parp1', 'Nmnat1']

# Reactive astrocyte signatures.
# Fix: corrected misspelt gene symbols that could never match a gene in the data and were
# therefore silently dropped by the later var_names filter:
#   'Ligp1' -> 'Iigp1', 'Gpp2' -> 'Gbp2', 'Ekbp5' -> 'Fkbp5', 'Tms4sf1' -> 'Tm4sf1'.
# NOTE(review): 'Hsbp1' is a valid symbol but the published pan-reactive list may have meant
# 'Hspb1' - left unchanged, confirm against the source paper.
reactive_astro1 = ['C3',  'Ggta1', 'Iigp1', 'Gbp2', 'Fbln5', 'Fkbp5', 'Psmb8'] # A1 astrocytes are produced following LPS injection
reactive_astro2 = ['Clcf1', 'Tgm1', 'Ptx3', 'S100a10', 'Sphk1', 'Cd109', 'Ptgs2', 'Emp1', 'Slc10a6', 'Tm4sf1', 'B3gnt5', 'Stat3']
reactive_astro_pan = ['Lcn2', 'Steap4', 'S1pr3', 'Timp1', 'Hsbp1', 'Cxcl10', 'Cd44', 'Cp', 'Serpina3n', 'Aspg', 'Vim', 'Gfap']

# brunet aging genes
brunet_genes = ['B2m', 'C1qa', 'C1qc', 'C4b', 'Ctss', 'Gbp6', 'Gbp10', 'Ifi44', 'Ifit3', 'Ifitm3', 'Itgb2', 'Parp14', 'Serpina3n', 'Tap1', 'Trim30a']


In [None]:
# Sorted, de-duplicated union of the senescence, Brunet and reactive-astrocyte lists.
combined_senescence = list(np.unique([
    *senescence_high,
    *senescence_low,
    *brunet_genes,
    *reactive_astro1,
    *reactive_astro2,
    *reactive_astro_pan,
]))

In [None]:
# Concatenate the data-driven and literature aging lists (duplicates are removed in the next step).
orig_all_age_markers = [
    *combined_age_markers,
    *aging_microglia,
    *combined_astro_marker,
    *combined_senescence,
]

In [None]:
# Remove genes already used as cell-type markers, then keep only genes present in the raw matrix.
# - dict.fromkeys de-duplicates while preserving first-seen order; the previous list(set(...))
#   produced an order that varies between runs (string hash randomization), which carried
#   through to the exported CSV.
# - var_names is read directly from adata.raw instead of materializing a full copy of the
#   raw matrix with to_adata() just to look at gene names.
# - Set lookups replace repeated linear scans of a list.
good_genes = adata.raw.var_names
cell_type_marker_set = set(de_combined)
measured_gene_set = set(good_genes)
orig_all_age_markers = [
    gene
    for gene in dict.fromkeys(orig_all_age_markers)
    if gene not in cell_type_marker_set and gene in measured_gene_set
]

In [None]:
# Number of candidate aging markers left after removing cell-type markers and unmeasured genes.
print(len(orig_all_age_markers))

In [None]:
# Age-related expression change for each candidate aging marker.
# NOTE(review): compute_average_age_expr_change comes from find_merfish_markers; the later
# `.any(0)` filter suggests genes lie along the last axis of the result - confirm.
age_diffexp = compute_average_age_expr_change(adata_raw, orig_all_age_markers)

In [None]:
# Keep a gene if its absolute expression change exceeds the threshold in at least one row of age_diffexp.
age_threshold = 0.35 #np.log(1.5)
exceeds_threshold = np.abs(age_diffexp) > age_threshold
all_age_markers = np.array(orig_all_age_markers)[exceeds_threshold.any(0)]

In [None]:
# Display the aging markers that passed the expression-change threshold.
all_age_markers

In [None]:
# How many of the candidate aging markers come from the senescence lists.
n_senescence_candidates = np.sum([gene in combined_senescence for gene in orig_all_age_markers])
print(n_senescence_candidates)

In [None]:
# Cumulative share of total mean expression, with genes ordered from lowest to highest expression.
mean_age_expr = compute_mean_expression(adata_raw[:,all_age_markers])
cumulative_share = np.cumsum(np.sort(mean_age_expr)) / np.sum(mean_age_expr)
plt.plot(cumulative_share, 'ko-')

In [None]:
# (disabled) remove the most highly expressed age markers; note the code below keeps the first 80% (not 90%) and slices the unsorted list
#sorted_age_markers = np.array(orig_all_age_markers)[np.argsort(mean_age_expr)]
#all_age_markers = list(np.array(orig_all_age_markers[:int(0.8*len(sorted_age_markers))]))

In [None]:
# (disabled) how many senescence genes remain among the filtered age markers
#print(np.sum([1 if i in combined_senescence else 0 for i in all_age_markers ]))

In [None]:
print("Senescence genes excluded based on expression")
# (tag, gene collection) pairs in priority order; the first collection containing the gene supplies the tag.
_source_tags = (
    ('brunet', brunet_genes),
    ('senesce_high', senescence_high),
    ('senesce_low', senescence_low),
    ('combined_astro', combined_astro_marker),
    ('microglia', aging_microglia),
)
for gene in combined_senescence:
    # Only report genes that were candidates but did not survive the threshold filter.
    if gene in all_age_markers or gene not in orig_all_age_markers:
        continue
    tag = next((name for name, members in _source_tags if gene in members), None)
    if tag is None:
        print(gene)
    else:
        print(gene, tag)

In [None]:
# Clustered age-by-gene plots: first for the cell-type markers, then for the aging markers.
# NOTE(review): the helper comes from find_merfish_markers; its exact output is not visible here.
plot_clustered_ages_by_genes(adata_raw, de_combined)
plot_clustered_ages_by_genes(adata_raw, all_age_markers)

In [None]:
# Clustered cell-type-by-gene plot for the aging markers, without normalization.
plot_clustered_celltypes_by_genes(adata_raw, all_age_markers,normalize=False)

In [None]:
# Per-cell-type sparsity plot for the cell-type marker panel (helper from find_merfish_markers).
plot_per_celltype_sparsity(adata_raw, de_combined)

In [None]:
# Per-cell-type sparsity plot for the aging marker panel.
plot_per_celltype_sparsity(adata_raw, all_age_markers)

In [None]:
# Per-cell-type total expression plot for the cell-type marker panel (helper from find_merfish_markers).
plot_per_celltype_totalexpr(adata_raw, de_combined)

In [None]:
# Per-cell-type total expression plot for the aging marker panel.
plot_per_celltype_totalexpr(adata_raw, all_age_markers)

In [None]:
# Per-gene sparsity plot for the cell-type marker panel (helper from find_merfish_markers).
plot_per_gene_sparsity(adata_raw, de_combined)

In [None]:
# Per-gene sparsity plot for the aging marker panel.
plot_per_gene_sparsity(adata_raw, all_age_markers)

In [None]:
# Write each final gene panel to its own CSV (single `gene` column plus the default index).
for gene_panel, out_path in (
    (de_combined, "gene_lists/all_markers_pfc.csv"),
    (all_age_markers, "gene_lists/all_markers_pfc_aging.csv"),
):
    pd.DataFrame({'gene': gene_panel}).to_csv(out_path)

# Save out per cluster expression for these genes for bit assignment

In [None]:
# find cluster names (uses the raw matrix of the full dataset, not only the PFC cells)
adata_raw = adata.raw.to_adata()
clust_labels_uniq = adata_raw.obs.clust_label.unique()

# Genes to export. `all_markers_to_keep` was never defined in this notebook, so this cell
# raised NameError on a fresh kernel; it is now built from the two panels saved above.
# NOTE(review): assumes the intended set is cell-type markers + aging markers - confirm.
# find markers actually in adata; the filtered list is used for both the matrix columns
# and the DataFrame columns below, so the two cannot get out of sync.
all_markers_to_keep = [
    gene
    for gene in dict.fromkeys(str(g) for g in [*de_combined, *all_age_markers])
    if gene in adata_raw.var_names
]
all_markers = list(all_markers_to_keep)

# compute cluster averages: one row per cluster, one column per marker.
# np.asarray(...).ravel() gives a flat 1-D row whether X is dense or a sparse matrix
# (whose mean() returns a 2-D np.matrix).
clust_avgs = np.vstack([
    np.asarray(adata_raw[adata_raw.obs.clust_label == label, :][:, all_markers].X.mean(0)).ravel()
    for label in clust_labels_uniq
])

In [None]:
# Per-cluster mean expression table (clusters x markers), saved for bit assignment.
# Previously `clust_expr` was bound to the return value of `.to_csv(path)`, which is None;
# the DataFrame is now kept in `clust_expr` and written out in a separate step.
clust_expr = pd.DataFrame(clust_avgs, index=clust_labels_uniq, columns=all_markers_to_keep)
clust_expr.to_csv("merfish_cluster_expr.csv")

In [None]:
# save this data for bit assignment
clust_proportions = np.array([np.sum(adata_raw.obs.clust_label==i) for i in clust_labels_uniq])
clust_proportions = clust_proportions/clust_proportions.sum()
pd.DataFrame({'clust':clust_labels_uniq,'proportion':clust_proportions}).to_csv("merfish_cluster_proportions.csv")