In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import scanpy.api as sc
from igraph import *
from MulticoreTSNE import MulticoreTSNE as TSNE #faster TSNE alternative
from anndata import read_h5ad
import dill # for saving everything

# Log the versions of scanpy and its main dependencies so the run can be reproduced.
sc.logging.print_versions()

  from ._conv import register_converters as _register_converters


scanpy==1.3.1 anndata==0.6.10 numpy==1.14.3 scipy==1.1.0 pandas==0.23.0 scikit-learn==0.19.1 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


# Load data

## 18, 21 and 24 months

In [None]:
# Per-channel QC metadata for the 18/21/24-month droplet runs. Only the columns
# needed to annotate each channel are read, then given shorter names.
path = '/data/maca/data/'
maca10x182124metadata = pd.read_csv(
    '/data/maca/data/MACA_10x_18-21-24_qc.csv',
    usecols=['channel','tissue','tissue.notes','mouse.age','mouse.sex'],
).rename(
    columns={'mouse.age':'age','mouse.sex':'sex','tissue.notes':'subtissue'}
)

# Re-number the rows 0..n-1.
maca10x182124metadata.index = range(len(maca10x182124metadata))

In [None]:
# Preview the first rows of the renamed 18/21/24-month metadata table.
maca10x182124metadata.head()
#len(maca10x182124metadata)

In [None]:
# Build one AnnData object per channel listed in the 18/21/24-month metadata
# and annotate all of its cells with that channel's tissue, subtissue, age and sex.
adata182124aux = []
for row in maca10x182124metadata.itertuples(index=False):
    path = '/data/maca/data/' + row.channel + '/'

    # The matrix is transposed right after reading; gene names come from the
    # second column of genes.tsv and cell names from barcodes.tsv.
    channel_adata = sc.read(path + 'matrix.mtx', cache=True).transpose()
    channel_adata.var_names = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
    channel_adata.obs_names = np.genfromtxt(path + 'barcodes.tsv', dtype=str)

    # 'MUSCLE' is relabelled; every other tissue name is title-cased.
    if row.tissue == 'MUSCLE':
        channel_adata.obs['tissue'] = 'Limb_Muscle'
    else:
        channel_adata.obs['tissue'] = str(row.tissue).title()

    # A missing subtissue is kept as-is (null) instead of becoming the string 'Nan'.
    channel_adata.obs['subtissue'] = (
        row.subtissue if pd.isnull(row.subtissue) else str(row.subtissue).title()
    )

    channel_adata.obs['age'] = str(row.age) + 'm'

    # Any value other than 'M' is recorded as female.
    channel_adata.obs['sex'] = 'male' if row.sex == 'M' else 'female'

    # Drop cells with fewer than 250 detected genes.
    sc.pp.filter_cells(channel_adata, min_genes=250)

    channel_adata.obs['channel'] = row.channel

    # Placeholder: no cell-type annotation is attached at this stage.
    channel_adata.obs['cell_ontology_class'] = 'NA'

    adata182124aux.append(channel_adata)


# Merge the per-channel objects into one AnnData: the last element of the list
# is popped and all remaining channels are concatenated onto it.
first = adata182124aux.pop()
adata182124 = first.concatenate(adata182124aux)

# Per-cell identifier "<channel>_<barcode>". The barcode is cut at its first '-'
# BEFORE the channel is prepended: splitting the combined string instead would
# truncate any channel name that itself contains '-' and lose the barcode.
# The vectorised .str accessor also avoids building one pd.Series per cell.
# NOTE(review): assumes everything after the first '-' in an observation name
# is a suffix that is not part of the barcode — confirm.
barcodes = adata182124.obs_names.str.split('-').str[0]
adata182124.obs['cell'] = adata182124.obs['channel'] + '_' + barcodes
adata182124.obs['method'] = 'droplet'

adata182124.obs.head()

In [None]:
# List the distinct tissue labels present in the 18/21/24-month data.
set(adata182124.obs['tissue'])


## 3 months

In [2]:
# Per-channel QC metadata for the 3-month droplet runs, plus the droplet
# metadata table (loaded here, not used in this cell).
path = '/data/maca/data/10x/'
maca10x3metadata = pd.read_csv(
    '/data/maca/data/10x/MACA_10x_3_qc.csv',
    usecols=['channel','tissue','tissue.notes','mouse.age','mouse.sex','pool'],
)
maca10x3metadata2 = pd.read_csv(
    '/data/maca/data/10x/TM_droplet_metadata.csv',
    low_memory=False,
)

# Shorter column names, then re-number the rows 0..n-1.
maca10x3metadata = maca10x3metadata.rename(
    columns={'mouse.age':'age','mouse.sex':'sex','tissue.notes':'subtissue'}
)
maca10x3metadata.index = range(len(maca10x3metadata))

maca10x3metadata.head()

Unnamed: 0,channel,tissue,subtissue,age,sex,pool
0,10X_P4_0,Tongue,TONGUE,3,M,4
1,10X_P4_1,Tongue,TONGUE,3,M,4
2,10X_P4_2,Liver,HEPATOCYTES,3,M,4
3,10X_P4_3,Bladder,BLADDER,3,M,4
4,10X_P4_4,Bladder,BLADDER,3,M,4


In [None]:
# Build one AnnData object per channel listed in the 3-month metadata and
# annotate all of its cells with that channel's tissue, subtissue, age and sex.
adata3aux = []
for row in maca10x3metadata.itertuples(index=False):
    path = '/data/maca/data/10x/' + row.channel + '/'

    # The matrix is transposed right after reading; gene names come from the
    # second column of genes.tsv and cell names from barcodes.tsv.
    channel_adata = sc.read(path + 'matrix.mtx', cache=True).transpose()
    channel_adata.var_names = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
    channel_adata.obs_names = np.genfromtxt(path + 'barcodes.tsv', dtype=str)

    # 'Muscle' is relabelled; every other tissue name is used unchanged.
    channel_adata.obs['tissue'] = (
        'Limb_Muscle' if row.tissue == 'Muscle' else row.tissue
    )
    channel_adata.obs['subtissue'] = row.subtissue
    channel_adata.obs['age'] = str(row.age) + 'm'

    # Any value other than 'M' is recorded as female.
    channel_adata.obs['sex'] = 'male' if row.sex == 'M' else 'female'

    # Drop cells with fewer than 250 detected genes.
    sc.pp.filter_cells(channel_adata, min_genes=250)

    channel_adata.obs['channel'] = row.channel

    # Placeholder: no cell-type annotation is attached at this stage.
    channel_adata.obs['cell_ontology_class'] = 'NA'

    adata3aux.append(channel_adata)

# Merge the per-channel objects into one AnnData: the last element of the list
# is popped and all remaining channels are concatenated onto it.
first = adata3aux.pop()
adata3 = first.concatenate(adata3aux)

# Per-cell identifier "<channel>_<barcode>". The barcode is cut at its first '-'
# BEFORE the channel is prepended: splitting the combined string instead would
# truncate any channel name that itself contains '-' and lose the barcode.
# The vectorised .str accessor also avoids building one pd.Series per cell.
# NOTE(review): assumes everything after the first '-' in an observation name
# is a suffix that is not part of the barcode — confirm.
barcodes = adata3.obs_names.str.split('-').str[0]
adata3.obs['cell'] = adata3.obs['channel'] + '_' + barcodes
adata3.obs['method'] = 'droplet'
display(adata3.obs.head())
adata3

In [None]:
# List the distinct tissue labels present in the 3-month droplet data.
set(adata3.obs['tissue'])

## concatenate all time points

In [None]:
# Combine the 18/21/24-month and 3-month droplet datasets into a single object.
adata = adata182124.concatenate([adata3])
adata

## Import Tabula Muris FACS data

In [None]:
# Per-cell annotations for the FACS dataset; only the listed columns are read.
tm_facs_metadata = pd.read_csv(
    '/data/maca/data/facs3mo/TM_facs_metadata.csv',
    usecols=[
        'cell','tissue','subtissue','mouse.sex','method',
        'cell_ontology_class','cell_ontology_id','free_annotation',
    ],
).rename(columns={'mouse.sex':'sex'})
display(tm_facs_metadata.head())

# Index the table by cell identifier so it can be matched against the expression data.
tm_facs_metadata = tm_facs_metadata.set_index('cell')
tm_facs_metadata.tail()

In [None]:
# Load the FACS expression matrix stored as an AnnData .h5ad file.
tm_facs_data = read_h5ad('/data/maca/data/facs3mo/TM_facs_mat.h5ad')
tm_facs_data

In [None]:
# Inspect the last rows of the per-cell annotations shipped with the FACS matrix.
tm_facs_data.obs.tail()

In [None]:
# Attach the metadata columns to the FACS observations by inner-joining the
# observation index against the metadata's 'cell' index.
# NOTE(review): an inner join drops cells that have no metadata row; the result
# is later assigned back to `.obs`, which presumably needs one row per cell — confirm.
merged_inner = pd.merge(left=tm_facs_data.obs,right=tm_facs_metadata, left_index=True, right_on='cell',how = 'inner')
merged_inner

In [None]:
# Recode the merged FACS annotations and install them as the observations table.
# Whole columns are rewritten at once instead of looping over rows with chained
# assignment (`obs['sex'][i] = ...`), which depended on positional fallback for
# integer keys and on pandas not handing back a copy, and was slow per cell.
facs_obs = merged_inner.copy()
facs_obs['age'] = '3m'

# Same mapping as before: 'M' -> 'male', every other value -> 'female'.
facs_obs['sex'] = ['male' if sex == 'M' else 'female' for sex in facs_obs['sex']]

# Rename the 'Mammary_Gland' tissue label to 'Mammary'; other labels are untouched.
facs_obs.loc[facs_obs['tissue'] == 'Mammary_Gland', 'tissue'] = 'Mammary'

tm_facs_data.obs = facs_obs

tm_facs_data.obs.head()

In [None]:
# Show the FACS AnnData summary after its annotations were replaced.
tm_facs_data

In [None]:
# Merging FACS with the droplet data is disabled; from here on `adata` refers
# to the FACS dataset only.
# adata = adata.concatenate(tm_facs_data)
adata = tm_facs_data
adata

In [None]:
# Persist the annotated dataset so later sections can reload it from disk.
# NOTE(review): presumably the ./write directory must already exist — confirm.
adata.write('./write/maca-facs-3mo.h5ad')

In [None]:
# Force a garbage-collection pass before the saved file is reloaded below.
import gc
gc.collect()

# Import all data

In [None]:
# Reload the dataset written above and check the type used for the matrix X.
adata = read_h5ad('./write/maca-facs-3mo.h5ad')
type(adata.X)


In [None]:
# Dimensions of the loaded dataset.
adata.shape

In [None]:
# Summary of the loaded AnnData object.
adata

In [None]:
# Preview the per-cell annotations.
adata.obs.head()

# Preprocessing

In [None]:
# Basic QC filtering: genes must meet min_cells=5, cells must meet min_genes=250.
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.filter_cells(adata, min_genes=250)
adata

In [None]:
# Genes whose name starts with 'ERCC-'.
ercc_genes = [gene for gene in adata.var_names if gene.startswith('ERCC-')]

# Per-cell fraction of counts that fall in the ERCC genes versus all genes.
# `.A1` is needed because X is sparse: summing yields a matrix, which is
# flattened here to a 1-D array.
ercc_counts = np.sum(adata[:, ercc_genes].X, axis=1).A1
total_counts = np.sum(adata.X, axis=1).A1
adata.obs['percent_ercc'] = ercc_counts / total_counts

In [None]:
# Remove the ERCC genes from the dataset. Slicing an AnnData yields a view of
# the parent object; `.copy()` makes `adata` an independent object so that the
# following cells, which add annotations and transform the data, do not write
# into a view.
adata = adata[:,~adata.var_names.isin(ercc_genes)].copy()

In [None]:
# add the total counts per cell as observations-annotation to adata
# (`.A1` flattens the result of the row-wise sum into a 1-D array)
adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
adata

In [None]:
# Violin plots of the per-cell QC metrics n_genes and n_counts, one panel each.
axs = sc.pl.violin(adata, ['n_genes', 'n_counts'],
                   jitter=0.4, multi_panel=True)

In [None]:
# Scatter of total counts against number of detected genes per cell.
ax = sc.pl.scatter(adata, x='n_counts', y='n_genes')

In [None]:
# Per-cell normalization with a target of 1e4 counts per cell.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4) #simple lib size normalization?
# Keep the normalized data on `.raw` before the gene subsetting, log transform
# and scaling applied in the cells below.
adata.raw = adata

In [None]:
# Select genes by mean expression and dispersion using the thresholds below,
# then plot the outcome of the selection.
filter_result = sc.pp.filter_genes_dispersion(
    adata.X, min_mean=0.0125, max_mean=10, min_disp=0.5)
sc.pl.filter_genes_dispersion(filter_result)

In [None]:
# Keep only the genes selected by the dispersion filter. Slicing an AnnData
# yields a view of the parent object; `.copy()` makes `adata` an independent
# object before the log transform and scaling in the following cells modify it.
adata = adata[:, filter_result.gene_subset].copy()
adata.shape

In [None]:
# Apply the log1p transform to the expression values.
sc.pp.log1p(adata)

In [None]:
# Scale the expression values without zero-centering, with max_value=10.
sc.pp.scale(adata, max_value=10, zero_center=False)

# Exploration
## PCA

In [None]:
# Run PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# PCA scatter coloured by tissue.
ax = sc.pl.pca_scatter(adata, color=['tissue'], right_margin=0.5)#, save='_all_ages_droplet_facs_tissue.pdf')

In [None]:
# PCA scatter coloured by age.
ax = sc.pl.pca_scatter(adata, color=['age'], right_margin=0.5)#, save='_all_ages_droplet_facs_age.pdf')

In [None]:
# PCA scatter coloured by sex.
ax = sc.pl.pca_scatter(adata, color=['sex'], right_margin=0.5)#, save='_all_ages_droplet_facs_sex.pdf')

In [None]:
# PCA scatter coloured by total counts per cell.
ax = sc.pl.pca_scatter(adata, color='n_counts', right_margin=0.5)#, save='_all_ages_droplet_facs_counts.pdf')

In [None]:
# PCA scatter coloured by the 'method' annotation.
ax = sc.pl.pca_scatter(adata, color='method', right_margin=0.5)#, save='_all_ages_droplet_facs_method.pdf')

In [None]:
# Variance ratio of the principal components, plotted with log=True.
sc.pl.pca_variance_ratio(adata, log=True)

## Louvain clustering

In [None]:
# Build the neighbourhood graph (n_neighbors=15), then run Louvain clustering
# on it with resolution 0.3.
sc.pp.neighbors(adata, n_neighbors=15)#, method='gauss')
sc.tl.louvain(adata, resolution = 0.3)

In [None]:
# Summary of the AnnData object after clustering.
adata

## UMAP

In [None]:
# Compute the UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# UMAP coloured by tissue at 200 dpi; the `save` suffix names the output file.
# NOTE(review): the suffix says "droplet_facs", but `adata` was loaded from the
# FACS-only file above — confirm the intended file name.
sc.settings.set_figure_params(dpi=200)
sc.pl.umap(adata, color=['tissue'], right_margin=0.5, save='_3mo_droplet_facs_tissue.pdf')

In [None]:
# UMAP coloured by age at 200 dpi; the `save` suffix names the output file.
sc.settings.set_figure_params(dpi=200)
sc.pl.umap(adata, color=['age'], right_margin=0.5, save='_3mo_droplet_facs_age.pdf')

In [None]:
# UMAP coloured by sex at 200 dpi; the `save` suffix names the output file.
sc.settings.set_figure_params(dpi=200)
sc.pl.umap(adata, color=['sex'], right_margin=0.5, save='_3mo_droplet_facs_sex.pdf')

In [None]:
# UMAP coloured by Louvain cluster at 200 dpi; the `save` suffix names the output file.
sc.settings.set_figure_params(dpi=200)
sc.pl.umap(adata, color=['louvain'], right_margin=0.5, save='_3mo_droplet_facs_louvain.pdf')

In [None]:
# UMAP coloured by the 'method' annotation at 200 dpi; the `save` suffix names the output file.
sc.settings.set_figure_params(dpi=200)
sc.pl.umap(adata, color=['method'], right_margin=0.5, save='_3mo_droplet_facs_method.pdf')

## tSNE

In [None]:
# Compute the tSNE embedding with perplexity 50.
sc.tl.tsne(adata, perplexity=50)

In [None]:
# tSNE coloured by tissue at 200 dpi; the `save` suffix names the output file.
sc.settings.set_figure_params(dpi=200)
sc.pl.tsne(adata, color=['tissue'], right_margin=0.5, save='_3mo_droplet_facs_tissue.pdf')

In [None]:
# tSNE coloured by age; the `save` suffix names the output file.
sc.pl.tsne(adata, color=['age'], right_margin=0.5, save='_3mo_droplet_facs_age.pdf')

In [None]:
# tSNE coloured by sex; the `save` suffix names the output file.
sc.pl.tsne(adata, color=['sex'], right_margin=0.5, save='_3mo_droplet_facs_sex.pdf')

In [None]:
# tSNE coloured by the 'method' annotation; the `save` suffix names the output file.
sc.pl.tsne(adata, color=['method'], right_margin=0.5, save='_3mo_droplet_facs_method.pdf')

In [None]:
# tSNE coloured by Louvain cluster; the `save` suffix names the output file.
sc.pl.tsne(adata, color=['louvain'], right_margin=0.5, save='_3mo_droplet_facs_louvain.pdf')

# Save processed data

In [None]:
# Persist the processed dataset, including the annotations and embeddings computed above.
# NOTE(review): presumably the ./write directory must already exist — confirm.
adata.write('./write/maca-facs-3mo.processed.h5ad')