In [None]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'


In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import seaborn as sns
from matplotlib.colors import ListedColormap

In [None]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')


In [None]:
# Eight-entry discrete palette, passed as `color_map` to the UMAP plots that
# include `mouse_id`. Built from ListedColormap directly rather than through
# `plt.cm.colors`, which is only reachable because matplotlib.cm happens to
# import the `colors` module internally.
mouse_colors = ListedColormap(['red', 'darkred', 'blue', 'darkblue', 'orange', 'darkorange', 'violet', 'darkviolet'])

# Input 10x HDF5 matrices: four replicates for every area x age combination,
# ordered by area, then age, then replicate number.
samples = [
    f"{area}_{age}_{replicate}_matrix.h5"
    for area in ("Hyp", "PFC")
    for age in ("4wk", "90wk")
    for replicate in range(1, 5)
]

# Sample index (the value stored in obs['idx']) -> mouse number. Consecutive
# pairs of samples share an id: 0, 1 -> 1; 2, 3 -> 2; ...; 14, 15 -> 8.
# NOTE(review): this pairs e.g. Hyp_4wk_1 with Hyp_4wk_2 rather than with
# PFC_4wk_1 - confirm that this matches how animals map to samples.
mouse_id = {sample_pos: sample_pos // 2 + 1 for sample_pos in range(16)}

In [None]:
# Load every sample, tag its cells with area / age / sample index, and compute
# per-cell QC metrics, collecting the per-sample objects in `all_adata`.
all_adata = []
for sample_idx, s in enumerate(samples):
    # File names look like "<area>_<age>_<replicate>_matrix.h5".
    area, age, replicate, _ = s.split("_")
    print(area, age, replicate)
    curr_adata = sc.read_10x_h5(f"/faststorage/brain_aging/aging10x/{s}")
    curr_adata.var_names_make_unique()
    curr_adata.obs['area'] = area
    curr_adata.obs['age'] = age
    # obs['idx'] is the position of the sample in `samples` (0-based), not the
    # replicate number parsed from the file name.
    curr_adata.obs['idx'] = sample_idx
    curr_adata.var['mt'] = curr_adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(curr_adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    all_adata.append(curr_adata)

In [None]:
total_cells = np.sum([a.n_obs for a in all_adata])
print('total cells:', total_cells)

In [None]:
adata = ad.concat(all_adata)

In [None]:
adata[adata.obs.area=='PFC']

In [None]:
adata.obs_names_make_unique()

In [None]:
# Cell and gene QC filters, applied to `adata` in this order. The gene filter
# runs after the first two cell filters, so its min_cells threshold is evaluated
# on the cells still present at that point.
# The min_genes filter is also presumably what adds the obs['n_genes'] column
# plotted later - confirm against the scanpy docs.
sc.pp.filter_cells(adata, min_genes=1000)
sc.pp.filter_cells(adata, max_counts=100000)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_counts=2500)

In [None]:
adata

In [None]:
# Translate each cell's sample index into its mouse number via the lookup table.
adata.obs['mouse_id'] = list(map(mouse_id.__getitem__, adata.obs['idx']))

In [None]:
# Run Scrublet on the combined matrix to score cells and flag predicted doublets.
# `scr` (scrublet) is imported with the other dependencies in the import cell
# at the top of the notebook.
scrub = scr.Scrublet(adata.X, expected_doublet_rate=0.09)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_gene_variability_pctl=85,
    n_prin_comps=30,
)



In [None]:
scrub.plot_histogram();


In [None]:
np.sum(predicted_doublets)/len(doublet_scores)

In [None]:
# Keep only cells that Scrublet did not flag as doublets. `.copy()` turns the
# subset into a standalone AnnData; without it `adata` is a view, and the
# in-place normalization that follows has to convert it implicitly.
adata = adata[~predicted_doublets, :].copy()

In [None]:
adata

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20, )

In [None]:
adata.write("adata_combined_nodoublet.h5ad")

In [None]:

sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True,size=0.25)


In [None]:
print(np.median(adata.obs.n_genes_by_counts))

In [None]:
print(np.median(adata.obs.total_counts))

In [None]:
adata

In [None]:
#adata = adata[adata.obs.n_genes_by_counts < 3000, :]
#adata = adata[adata.obs.pct_counts_mt < 5, :]


In [None]:
# Normalize every cell to a total of 1e4 counts (target_sum).
sc.pp.normalize_total(adata, target_sum=1e4)

# Log-transform the normalized matrix in place.
sc.pp.log1p(adata)

# Flag highly variable genes with the mean / dispersion cutoffs given here; the
# resulting adata.var.highly_variable mask is used below to subset the genes.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)


In [None]:
sc.pl.highly_variable_genes(adata)


In [None]:
adata.raw = adata


In [None]:
# Restrict to highly variable genes. `.copy()` makes this a real AnnData rather
# than a view, since regress_out / scale below modify the matrix in place.
adata = adata[:, adata.var.highly_variable].copy()


In [None]:
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])


In [None]:
sc.pp.scale(adata, max_value=10)


In [None]:
sc.tl.pca(adata, svd_solver='arpack')


In [None]:
sc.pl.pca(adata)


In [None]:
sc.pl.pca_variance_ratio(adata, log=True,n_pcs=50)


In [None]:
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)


In [None]:
sc.tl.umap(adata)


In [None]:
sc.tl.leiden(adata,resolution=0.2)


In [None]:
sc.pl.pca(adata,color=['leiden','age','idx'],color_map=plt.cm.rainbow)


In [None]:
sc.pl.umap(adata, color=['leiden','n_genes','total_counts'],color_map=plt.cm.viridis)

In [None]:
adata.write("adata_combined_nodoublet_normalized.h5ad")

# 1. Group clusters into neurons and non-neurons

In [None]:
# Boolean mask over the raw gene names that selects 'Snap25'.
# np.isin is used instead of np.in1d, which is deprecated in recent NumPy.
gene_ids = adata.raw.var_names
ens_idx = np.isin(gene_ids, 'Snap25')

In [None]:
adata.raw.var

In [None]:
# Per-cell Snap25 expression taken from the raw matrix. If raw.X is sparse, the
# row-wise mean comes back as an (n_cells, 1) matrix, so flatten it to a plain
# 1-D array before storing it as an obs column.
adata.obs['Snap25'] = np.asarray(adata.raw.X[:, ens_idx].mean(1)).ravel()

In [None]:
plt.hist(adata.obs.groupby('leiden')['Snap25'].apply(np.mean).to_numpy(),100)
plt.axvline(1.25)

In [None]:
# A cluster counts as neuronal when its mean Snap25 expression exceeds 1.25
# (the cutoff drawn on the histogram above).
cluster_snap25 = adata.obs.groupby('leiden')['Snap25'].mean()
is_cluster_neuronal = (cluster_snap25 > 1.25).to_numpy()
# Key the lookup by the cluster labels the groupby actually produced, instead of
# assuming they are exactly "0", "1", ... in positional order.
neuronal_map = dict(zip(cluster_snap25.index.astype(str), is_cluster_neuronal))

In [None]:
# Flag every cell as neuronal or not by looking up its Leiden cluster in neuronal_map.
adata.obs['neuronal'] = [neuronal_map[i] for i in adata.obs.leiden]


In [None]:
sc.pl.umap(adata,color=['neuronal','Snap25'])

In [None]:
# Dot plot of marker genes grouped by Leiden cluster.
# NOTE(review): an earlier cell stores an obs column named 'Snap25', and
# 'Snap25' is also requested here as a gene; some scanpy versions may reject a
# key that exists in both obs and the gene names as ambiguous - confirm this
# runs on the installed version, or rename the obs column.
sc.pl.dotplot(adata, ['Cx3cr1', 'Aldh1l1','Olig1','Cspg4', 'Snap25', 'Gad1', 'Slc17a6', 'Slc17a7'],groupby='leiden')

In [None]:
# Select the cells flagged as neuronal, then rebuild an AnnData from the `.raw`
# slot of that selection; the PFC / Hyp split happens in the next cell.
adata_neuronal = adata[adata.obs.neuronal].copy().raw.to_adata()


In [None]:
# Independent copies of the neuronal cells from each brain area.
adata_neuronal_pfc, adata_neuronal_hyp = (
    adata_neuronal[adata_neuronal.obs.area == region].copy()
    for region in ('PFC', 'Hyp')
)


# 2. Cluster neurons

## 2.1 Cluster PFC neurons

In [None]:
def reprocess_subset(A, res=0.7):
    """Re-run the HVG -> regress -> scale -> PCA -> neighbors -> UMAP -> Leiden pipeline on a subset.

    Assumes the data in ``A`` have already been normalized and log-transformed,
    and that ``A.obs`` contains ``total_counts`` and ``pct_counts_mt`` (both are
    regressed out).

    Parameters
    ----------
    A : AnnData
        Subset to reprocess. It is modified: highly-variable-gene annotations
        are added and ``A.raw`` is set to the object itself.
    res : float, default 0.7
        Resolution passed to ``sc.tl.leiden``.

    Returns
    -------
    AnnData
        A new object restricted to the highly variable genes, carrying the PCA,
        neighbor graph, UMAP and Leiden results. Callers must use the return
        value; the input object is not the one that receives these results.
    """
    print('finding highly variable genes')
    sc.pp.highly_variable_genes(A, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # Keep the full-gene matrix reachable through .raw before subsetting.
    A.raw = A
    # Copy so the in-place steps below act on a real AnnData, not on a view
    # that would have to be converted implicitly.
    A = A[:, A.var.highly_variable].copy()
    print('regressing out')
    sc.pp.regress_out(A, ['total_counts', 'pct_counts_mt'])
    print('scaling')
    sc.pp.scale(A, max_value=10)
    print('pca')
    sc.tl.pca(A, svd_solver='arpack')
    print('neighbors')
    sc.pp.neighbors(A, n_neighbors=10, n_pcs=50)
    print('umap')
    sc.tl.umap(A)
    print('leiden')
    sc.tl.leiden(A, resolution=res)
    return A

In [None]:
adata_neuronal_pfc = reprocess_subset(adata_neuronal_pfc)

In [None]:
sc.pl.umap(adata_neuronal_pfc, color=['age'])

In [None]:
sc.external.pp.bbknn(adata_neuronal_pfc,batch_key='age')
sc.tl.leiden(adata_neuronal_pfc,resolution=0.6)
sc.tl.umap(adata_neuronal_pfc)

In [None]:
sc.pl.umap(adata_neuronal_pfc, color=['leiden','age','mouse_id'],color_map=mouse_colors)

In [None]:
sc.pl.umap(adata_neuronal_pfc, color='age')

In [None]:
sc.pl.umap(adata_neuronal_pfc, 
           color=['Slc17a7','Gad1','Drd1','Drd2','Sst','Vip','Pvalb',
                  'Cux1','Tshz2','Cd44','Vegfd','Pld5','Otof','Npr3'],
          use_raw=True)

In [None]:
sc.tl.rank_genes_groups(adata_neuronal_pfc, 'leiden', method='wilcoxon')
#sc.pl.rank_genes_groups(adata_neuronal_pfc, n_genes=25, sharey=False)


In [None]:
sc.tl.filter_rank_genes_groups(adata_neuronal_pfc, min_fold_change=1.5)
sc.pl.rank_genes_groups_dotplot(adata_neuronal_pfc, key='rank_genes_groups_filtered')

In [None]:
sc.pl.rank_genes_groups_heatmap(adata_neuronal_pfc,n_genes=5,groupby='leiden',show_gene_labels=True)

## 2.2 Cluster hypothalamus neurons


In [None]:
adata_neuronal_hyp = reprocess_subset(adata_neuronal_hyp)

In [None]:
#sc.external.pp.bbknn(adata_neuronal_hyp,batch_key='mouse_id')
#sc.tl.leiden(adata_neuronal_hyp,resolution=0.2)
#sc.tl.umap(adata_neuronal_hyp)

In [None]:
sc.pl.umap(adata_neuronal_hyp, color=['leiden','age','mouse_id'],color_map=mouse_colors)

In [None]:
sc.pl.umap(adata_neuronal_hyp, color=['Gad1','Slc17a6','Slc17a7','Gal','Agtr1a','Esr1','Pomc','Agrp','Nxph4','Adcyap1','Oxt'],use_raw=True)


In [None]:
sc.tl.rank_genes_groups(adata_neuronal_hyp, 'leiden', method='t-test')


In [None]:
sc.tl.filter_rank_genes_groups(adata_neuronal_hyp, min_fold_change=1.5)

In [None]:
#sc.pl.rank_genes_groups(adata_neuronal_pfc, n_genes=25, sharey=False)

sc.pl.rank_genes_groups_heatmap(adata_neuronal_hyp,n_genes=3,key='rank_genes_groups_filtered',groupby='leiden',show_gene_labels=True)


In [None]:
sc.pl.rank_genes_groups_dotplot(adata_neuronal_hyp, key='rank_genes_groups_filtered')

# 3. Cluster non-neurons

In [None]:
# Select the cells not flagged as neuronal and rebuild an AnnData from the
# `.raw` slot of that selection.
adata_nonneuronal = adata[~adata.obs.neuronal].copy().raw.to_adata()


In [None]:
adata_nonneuronal = reprocess_subset(adata_nonneuronal)

In [None]:
#sc.external.pp.bbknn(adata_nonneuronal,batch_key='mouse_id')
#sc.tl.leiden(adata_nonneuronal,resolution=1.2)
#sc.tl.umap(adata_nonneuronal)

In [None]:
sc.tl.leiden(adata_nonneuronal,resolution=0.7)


In [None]:
sc.pl.umap(adata_nonneuronal, color=['leiden'])

In [None]:
sc.pl.umap(adata_nonneuronal, color=['leiden','area','age','mouse_id'],color_map=mouse_colors)

In [None]:
sc.pl.umap(adata_nonneuronal, color=['Cdkn2a','Aldh1l1','Cx3cr1','Plp1','Cspg4',
                                     'Gfap','Aqp4','Cldn5','Adgrf5'])



In [None]:
# Template for collapsing fine-grained clusters into broader labels. The
# mapping entries and the 'old_clusters' column name are placeholders: no cell
# visible in this notebook creates adata.obs['old_clusters'], so the relabeling
# is applied only when that column exists, rather than raising a KeyError
# during a full run.
old_to_new = dict(
    old_cluster1='new_cluster1',
    old_cluster2='new_cluster1',
    old_cluster3='new_cluster2',
)
if 'old_clusters' in adata.obs:
    adata.obs['new_clusters'] = (
        adata.obs['old_clusters']
        .map(old_to_new)
        .astype('category')
    )
else:
    print("skipping cluster relabeling: adata.obs has no 'old_clusters' column")


# Cluster whole dataset

In [None]:
sc.external.pp.bbknn(adata,batch_key='mouse_id')
sc.tl.leiden(adata,resolution=0.2)
sc.tl.umap(adata)

In [None]:
# UMAP colored by age, area and mouse id, returned as a figure and saved as PNG.
# NOTE(review): the output path is an absolute, user-specific location, while
# the input data are read from /faststorage - confirm this directory exists on
# the machine running the notebook, or make the path configurable.
fig = sc.pl.umap(adata, color=['age','area','mouse_id'],color_map=mouse_colors,return_fig=True)
fig.savefig("/Users/wea/src/tithonus/analysis/aging10x/umap.png",dpi=300,bbox_inches='tight')

In [None]:
sc.pl.umap(adata, color=['Cx3cr1', 'Aldh1l1','Olig1','Cspg4', 'Snap25', 'Gad1', 'Slc17a6', 'Slc17a7'],color_map=plt.cm.Reds)

In [None]:
sc.pl.umap(adata, color=['Vip','Gal','Sst','Cck','Npy','Oxt','Nxph4','Agtr1a','Agrp','Esr1'],cmap=plt.cm.coolwarm,vmin=-5,vmax=5)

In [None]:
sc.pl.umap(adata, color=['C1qa','C3','Itgam','Trem2'],cmap=plt.cm.coolwarm,use_raw=True,vmin=-3,vmax=3)

In [None]:
sc.pl.umap(adata, color=['Cdkn2a','C2','C4b','Tspan2','Il33','Aldh1l1','Cd4','Cd74','Agtr1a'],color_map=plt.cm.Reds,use_raw=True)

In [None]:

sc.pl.umap(adata, color=[i for i in list(adata.raw.var_names) if 'Il' in i],color_map=plt.cm.Reds,use_raw=True)

In [None]:
sc.pl.umap(adata, color=[i for i in list(adata.raw.var_names) if 'H2-' in i],color_map=plt.cm.coolwarm,use_raw=True,vmin=-3,vmax=3)

In [None]:
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)


In [None]:
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)
