In [None]:
%%bash
# Download the 10x matrix/genes/barcodes files for both GEO samples and unpack them.
# The cell is safe to re-run: files that are already unpacked are skipped, and any
# failing command aborts the cell instead of being silently ignored.
set -euo pipefail

GEO_BASE='https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3762nnn'

# fetch_sample <GEO accession> <file-name stem> <output directory>
fetch_sample() {
    local accession="$1" stem="$2" outdir="$3" part gz
    mkdir -p "$outdir"
    for part in barcodes.tsv genes.tsv matrix.mtx; do
        gz="${accession}_${stem}_${part}.gz"
        # Already downloaded and decompressed on a previous run.
        if [ -f "${outdir}/${gz%.gz}" ]; then
            continue
        fi
        # -O overwrites any partial archive left behind by an interrupted run.
        wget "${GEO_BASE}/${accession}/suppl/${gz}" -O "${outdir}/${gz}"
        gzip -df "${outdir}/${gz}"
    done
}

fetch_sample GSM3762870 Car1gfp ./data/car1
fetch_sample GSM3762869 Naivebm ./data/naive

In [None]:
import numpy as np
import scanpy as sc
import pandas as pd
import bbknn
import anndata

In [None]:
def LoadData(gex_path, prefix=None):
    """Load one 10x Genomics sample into an AnnData object.

    Parameters
    ----------
    gex_path
        Either a directory holding ``matrix.mtx`` / ``genes.tsv`` (or
        ``features.tsv``) / ``barcodes.tsv`` files, or the path to a single
        10x ``.h5`` file.
    prefix
        File-name prefix shared by the three matrix files (for example
        ``'GSM3762870_Car1gfp_'``). When ``None`` and ``gex_path`` is a
        directory, the prefix is inferred from the ``*matrix.mtx[.gz]`` file
        found there.

    Returns
    -------
    The loaded AnnData, indexed by gene symbol with duplicate symbols made unique.

    Raises
    ------
    FileNotFoundError
        If ``gex_path`` is a directory without any ``*matrix.mtx[.gz]`` file.
    ValueError
        If the directory holds matrices with several different prefixes and
        ``prefix`` was not given.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    source = Path(gex_path)
    if source.is_file():
        # A single file is taken to be a 10x HDF5 matrix. read_10x_h5 does not
        # accept the var_names / cache_compression keywords.
        adata = sc.read_10x_h5(source)
    else:
        if prefix is None:
            marker = 'matrix.mtx'
            found = sorted({
                entry.name[:entry.name.rindex(marker)]
                for entry in source.glob('*' + marker + '*')
                if entry.name.endswith((marker, marker + '.gz'))
            })
            if not found:
                raise FileNotFoundError(
                    "No '*matrix.mtx' file found in %s" % source
                )
            if len(found) > 1:
                raise ValueError(
                    "Several matrix prefixes found in %s: %s; pass prefix= explicitly"
                    % (source, found)
                )
            prefix = found[0]
        adata = sc.read_10x_mtx(
            source,
            var_names='gene_symbols',
            cache_compression=None,
            prefix=prefix,
        )
    adata.var_names_make_unique()
    return adata

def BasicFiltering(adata, max_genes=5000, max_pct_mt=10):
    """Run QC filtering, normalisation, regression and scaling on one sample.

    Steps, in order: plot the most highly expressed genes, drop rarely
    detected genes and low-complexity cells, flag mitochondrial / ribosomal /
    hemoglobin genes and compute QC metrics, remove cells above the gene-count
    and mitochondrial cutoffs, normalise to 1000 counts per cell, log1p,
    flag highly variable genes, regress out the QC covariates and scale.

    Parameters
    ----------
    adata
        AnnData with raw counts. The gene/cell filters and QC annotations are
        applied to this object in place; the returned object is a separate copy.
    max_genes
        Cells with ``n_genes_by_counts`` at or above this value are removed.
    max_pct_mt
        Cells with ``pct_counts_mt`` at or above this value are removed.

    Returns
    -------
    A new AnnData holding the filtered, normalised and scaled data.
    """
    sc.pl.highest_expr_genes(adata, n_top=20)
    print("Before filtering:", adata.n_obs, adata.n_vars)
    sc.pp.filter_genes(adata, min_cells=3)    # keep genes detected in at least 3 cells
    sc.pp.filter_cells(adata, min_genes=200)  # keep cells with at least 200 detected genes
    print("After filtering:", adata.n_obs, adata.n_vars)

    # Compare upper-cased symbols so both human ('MT-', 'RPS') and mouse-style
    # ('mt-', 'Rps') gene names are recognised.
    symbols = adata.var_names.str.upper()
    adata.var['mt'] = symbols.str.startswith('MT-')
    adata.var['ribo'] = symbols.str.startswith(('RPS', 'RPL'))
    # This is a regular expression, so it needs str.contains; str.startswith
    # would look for the literal text '^HB[^(P)]' and never match.
    adata.var['hemo'] = symbols.str.contains('^HB[^(P)]', regex=True)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo', 'hemo'], percent_top=None, inplace=True, log1p=False)
    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hemo'], jitter=0.4, multi_panel=True)

    adata = adata[adata.obs.n_genes_by_counts < max_genes, :]
    print("Remaining cells after removing high number of genes %d"%adata.n_obs)

    # Materialise the view so the in-place steps below operate on real data.
    adata = adata[adata.obs.pct_counts_mt < max_pct_mt, :].copy()
    print("Remaining cells afer removing mt %d"%adata.n_obs)

    # normalize_total is the documented replacement for the deprecated normalize_per_cell.
    sc.pp.normalize_total(adata, target_sum=1000)
    sc.pp.log1p(adata)

    # Only annotates the genes; no subsetting to the variable genes happens here.
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

    # Regress out the variation associated with total counts and the
    # mitochondrial, ribosomal and hemoglobin fractions, then scale.
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hemo'])
    sc.pp.scale(adata)

    return adata
    
def Clustering(adata, n_neighbors=10, n_pcs=40):
    """Embed and cluster a preprocessed AnnData, then plot the clusters.

    Runs PCA (arpack solver), builds the neighbour graph, computes a UMAP
    embedding, runs Leiden clustering and draws the UMAP coloured by the
    resulting ``'leiden'`` labels.

    Parameters
    ----------
    adata
        AnnData to process; it is passed to each scanpy step and returned.
    n_neighbors
        Neighbourhood size passed to ``sc.pp.neighbors``.
    n_pcs
        Number of principal components passed to ``sc.pp.neighbors``.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    sc.pp.pca(adata, svd_solver="arpack")
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata)

    # Standard Leiden clustering
    sc.tl.leiden(adata)
    sc.pl.umap(adata, color=['leiden'])
    return adata

In [None]:
# Load each sample, tag its cells with the sample name, then run QC and preprocessing.
naive = LoadData('./data/naive/')
naive.obs['sample'] = 'Naive'
naive = BasicFiltering(naive)

car1 = LoadData('./data/car1/')
car1.obs['sample'] = 'Car1'
car1 = BasicFiltering(car1)

In [None]:
# Cluster the naive sample on its own and show its UMAP.
naive = Clustering(naive)

In [None]:
# Cluster the Car1 sample on its own and show its UMAP.
car1 = Clustering(car1)

In [None]:
# Restrict both samples to the genes they share before merging them.
var_names = naive.var_names.intersection(car1.var_names)
naive = naive[:, var_names]  # previously referenced the undefined name `naive_treated`
car1 = car1[:, var_names]

adata = naive.concatenate(car1)

adata = Clustering(adata)

#And an extra plot to show the batch differences
sc.pl.umap(adata, color=['sample', 'leiden'])

In [None]:
# Run BBKNN on the merged data, using the 'sample' column as the batch label.
sc.external.pp.bbknn(
    adata,
    batch_key='sample',
)

# Recompute the embedding and the clusters, then plot both colourings.
sc.tl.umap(adata)
sc.tl.leiden(adata)
sc.pl.umap(
    adata,
    color=['sample', 'leiden'],
)