In [None]:
# Derived from https://scanpy.readthedocs.io/en/stable/tutorials.html

# Count matrices were generated using CellRanger Count (see 10X Genomics website)

# import packages 
import scanpy.external as sce
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import gprofiler
from seaborn import despine
from seaborn import axes_style
from matplotlib.pyplot import suptitle
import magic # imputation tool; van Dijk et al 2018 # 

In [None]:
# Scanpy logging level: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
sc.logging.print_versions()

# Package versions used for this analysis:
# scanpy==1.4.4 anndata==0.6.22.post1 umap==0.3.9 numpy==1.17.2 scipy==1.3.0 pandas==0.23.4
# scikit-learn==0.20.2 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1

# Load the CellRanger matrices and merge them into one AnnData object.
# Each entry pairs a filtered 10x matrix with the batch category it is given on concatenation.
samples = [
    ('AM1_filtered_feature_bc_matrix.h5', "AM1.ep_YFP_neg"),
    ('AM2_filtered_feature_bc_matrix.h5', "AM2.ep_YFP_pos"),
    ('AM4_filtered_feature_bc_matrix.h5', "AM4.ep_YFP_neg"),
    ('AM5_filtered_feature_bc_matrix.h5', "AM5.ep_YFP_pos"),
    ('AM7_filtered_feature_bc_matrix.h5', "AM7.ep_control"),
    ('AM9_filtered_feature_bc_matrix.h5', "AM9.ep_control"),
]
filenames = [h5_path for h5_path, _ in samples]
batch_labels = [label for _, label in samples]

adatas = [sc.read_10x_h5(h5_path) for h5_path in filenames]

# The first sample is the base object; the remaining ones are appended to it.
adata = adatas[0].concatenate(adatas[1:], batch_categories=batch_labels)

# Make sure gene names are unique
adata.var_names_make_unique()

# Quality control: compute the mitochondrial fraction per cell and remove cells above 10%.
# Mitochondrial genes are selected by their 'mt-' name prefix.
mito_genes = adata.var_names.str.startswith('mt-')

# Per-cell count totals. Summing the sparse X returns a matrix; `.A1` flattens it to a 1-D array.
total_counts = np.sum(adata.X, axis=1).A1
mito_counts = np.sum(adata[:, mito_genes].X, axis=1).A1

# 'percent_mito' is stored as a fraction (0-1), so the 10% cut-off below is 0.1.
adata.obs['percent_mito'] = mito_counts / total_counts

# Total counts per cell as an observation annotation
adata.obs['n_counts'] = total_counts

# Plot mitochondrial content (y-axis) against read count before and after removing cells
# with > 10% mitochondrial gene expression.
# Cells with > 10% mito expression also have a low read count, indicating bad/dead cells
# rather than cells requiring more energy.
sc.settings.set_figure_params(dpi=80)
with axes_style({'axes.grid': False}):
    sc.pl.scatter(adata, y='percent_mito', x='n_counts', size=5)

# `.copy()` turns the boolean-indexed view into a real AnnData object, so the in-place
# gene filter below does not operate on a view of the unfiltered data.
adata = adata[adata.obs['percent_mito'] < 0.1, :].copy()
sc.pp.filter_genes(adata, min_cells=3)

with axes_style({'axes.grid': False}):
    sc.pl.scatter(adata, y='percent_mito', x='n_counts', size=5)

# Normalize per cell and log-transform, store that state as adata.raw, then run the
# data diffusion (imputation) tool MAGIC (van Dijk et al. 2018).
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)

# There is no consensus about gene scaling, so the data were not scaled.
# See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6582955/
# Not scaling arguably retains biological information.
# Cell cycle was not regressed out either: a population of proliferative or non-proliferative
# cells would be highly interesting in this context and is not information I want to remove!

# Save the current data (normalized and log-transformed, not yet imputed) as adata.raw
adata.raw = adata

# The data diffusion tool is in scanpy.external.
sce.pp.magic(adata, name_list='all_genes', k=5, t=15, n_pca=20) # NOTE(review): originally labelled "default settings" - confirm these match the defaults of the installed MAGIC version

# Initial filter: keep cells with at least 200 detected genes.
# NOTE(review): this runs after normalization and imputation rather than on the raw counts -
# confirm that this order is intended.
sc.pp.filter_cells(adata, min_genes=200)

# Create the nearest-neighbours graph
sc.pp.neighbors(adata, n_neighbors=5, n_pcs=20)

# Run the Louvain community detection algorithm at several resolutions.
# Each partition is stored in adata.obs under 'louvain_r<resolution>'.
louvain_keys = []
for resolution in (0.05, 0.1, 0.2, 0.3, 0.4, 0.5):
    obs_key = 'louvain_r{}'.format(resolution)
    sc.tl.louvain(adata, resolution=resolution, key_added=obs_key)
    louvain_keys.append(obs_key)

# Visualize the batches and the different Louvain resolutions on a UMAP
sc.tl.umap(adata)

sc.settings.set_figure_params(dpi=80)
sc.pl.umap(adata, color=['batch'] + louvain_keys)

In [None]:
# Create a PAGA-initialized UMAP embedding (Wolf et al. 2018 Genome Biology).
# The Louvain partition at resolution 0.05 was chosen as the PAGA grouping.
sc.tl.paga(adata, groups='louvain_r0.05')
sc.pl.paga(adata, plot=True, color=['batch'])

# sc.tl.umap(adata, init_pos='paga') was not working (August 7, 2019), see
# https://github.com/theislab/scanpy/issues/769 - the initial positions are therefore
# computed explicitly with scanpy's private helper as a workaround.
# NOTE(review): the helper presumably relies on the layout produced by sc.pl.paga above,
# so keep the plotting call before this line - confirm against the installed scanpy version.
sc.tl.umap(adata, init_pos=sc.tl._utils.get_init_pos_from_paga(adata))

# Louvain clusters, resolution = 0.05
sc.pl.umap(adata, frameon=True, color=['louvain_r0.05'], legend_loc='right margin', size=40)
# Batch (library ID)
sc.pl.umap(adata, frameon=True, color=['batch'], legend_loc='right margin', size=40)
# Distal lung epithelial markers (plotted from adata.X, not adata.raw)
sc.pl.umap(adata, frameon=True, color=['Lyz2','Scgb1a1','Scgb3a2','Foxj1','Ly6a'], color_map='RdYlBu_r', size=40, ncols=2, use_raw=False, vmin=0)


In [None]:
# Remove Louvain clusters with fewer than 300 cells.
# Show the cluster sizes so the hard-coded selection below can be checked against them.
cluster_sizes = adata.obs['louvain_r0.05'].value_counts()
print(cluster_sizes)

# Clusters to keep. NOTE(review): this list is hard-coded; confirm it still matches the
# clusters with at least 300 cells whenever the clustering is re-run.
# `.copy()` makes the subset an independent AnnData object instead of a view of `adata`,
# because later cells add columns and results to it.
adata_subset = adata[adata.obs['louvain_r0.05'].isin(['0','1','2','3','4','5'])].copy()

subset_results_file = './write/adata_subset.h5ad'  # the file that will store the analysis results
adata_subset.write(subset_results_file)  # save subset data

In [None]:
# Check how many cells each library ID (batch) contributes to the remaining Louvain clusters
rcParams['figure.figsize'] = (8, 5)
with axes_style({'axes.grid': False}):
    ax = sb.countplot(
        data=adata_subset.obs,
        x="louvain_r0.05",
        hue="batch",
        edgecolor='black',
        linewidth=0.5,
    )
    despine(right=True)
ax.set_xlabel("Louvain communities")
ax.set_ylabel("Cells")

In [None]:
# Differential expression (DE) and Enrichr analysis

# Identify DE genes per Louvain cluster using the built-in Scanpy function.
# Filtered gene list for TF/TFC identification. I wanted to be stringent.
sc.tl.rank_genes_groups(adata_subset, 'louvain_r0.05', method='wilcoxon', n_genes=1000, use_raw=True)

# Filter the ranked genes on log fold change, in-group fraction and out-group fraction.
# NOTE(review): an earlier comment here said the out-group filter had been removed, but
# max_out_group_fraction is still passed - confirm which is intended.
sc.tl.filter_rank_genes_groups(
    adata_subset,
    min_fold_change=1,              # minimum log fold change
    min_in_group_fraction=0.5,      # 50% of cells in the cluster must express the gene
    max_out_group_fraction=0.5,     # no more than 50% of cells outside a cluster can express the gene
    key_added='rank_genes_groups_filtered',  # filtered group key
)

result = adata_subset.uns['rank_genes_groups_filtered']
groups = result['names'].dtype.names

# Export filtered DE results to Excel.
# Columns are named '<cluster>_<first letter of field>': _n = names, _p = pvals_adj, _l = logfoldchanges.
df1 = pd.DataFrame({
    group + '_' + key[:1]: result[key][group]
    for group in groups
    for key in ['names', 'pvals_adj', 'logfoldchanges']
})
df1.to_excel("adata_subset_DE_top1000_filtered.xlsx", sheet_name='Top 1000 filtered')

# Visualize the data using a heatmap.
# `colors` is matplotlib.colors (imported at the top); the bare name `matplotlib` is never bound.
heatmap_cmap = colors.LinearSegmentedColormap.from_list("", ['lightblue','lightyellow','lightcoral'])  # color map

sc.pl.rank_genes_groups_heatmap(adata_subset, n_genes=25, cmap=heatmap_cmap, swap_axes=True, use_raw=False, key='rank_genes_groups_filtered', vmin=0, vmax=1.5, show_gene_labels=True)

In [None]:
# Identify all DE transcription factors (TFs)

# Import the list of mouse TFs from AnimalTFDB
# http://bioinfo.life.hust.edu.cn/AnimalTFDB/#!/species
df_tf = pd.read_csv('Mus_musculus_TF.csv', header=0)
tf_genes = set(df_tf['Symbol'].tolist())

# Find TF/TFCs in the filtered DE gene list (previously exported by the DE step)
diff_padj_filtered = pd.read_excel('adata_subset_DE_top1000_filtered.xlsx', header=0)
names = diff_padj_filtered.columns

# clust_dict maps every column of the exported table to {'tfs': [...]}.
# Only the gene-name columns ('<cluster>_n') can contain TF symbols; other columns get an empty list.
clust_dict = {}
for col in diff_padj_filtered.columns:
    column_values = set(diff_padj_filtered[col].tolist())
    # Set intersection replaces the per-gene list scan; keeping only strings drops NaN
    # placeholders, and sorting makes the result order deterministic.
    shared = tf_genes & column_values
    clust_dict[col] = {"tfs": sorted(gene for gene in shared if isinstance(gene, str))}

# Create a list of identified TFs across all clusters (may contain repeats)
merged_tf = []
for cluster_key in clust_dict:
    merged_tf += clust_dict[cluster_key]["tfs"]

# Unique TFs in first-seen order, used as the rows of the matrix plot.
# NOTE(review): `de_tfs` was not defined anywhere in the notebook; it is assumed to be
# the de-duplicated `merged_tf` - confirm.
de_tfs = list(dict.fromkeys(merged_tf))

# Visualize data as a matrix plot.
# `colors` is matplotlib.colors (imported at the top); the bare name `matplotlib` is never bound.
cmap2 = colors.LinearSegmentedColormap.from_list("", ["white",'aliceblue',"royalblue"])

sc.pl.matrixplot(adata_subset, var_names=de_tfs, cmap=cmap2, groupby='louvain_r0.05', use_raw=False, swap_axes=True, vmin=0, vmax=0.6, dendrogram=False)

# You can also check TF/TFCs per cluster. For example:
clust_dict['0_n']["tfs"]

# Unfiltered gene list. I wanted to be less stringent here, so no fold-change or
# expression-fraction filter is applied to the ranked genes.
sc.tl.rank_genes_groups(adata_subset, groupby='louvain_r0.05', method='wilcoxon', n_genes=1000, use_raw=True)

result = adata_subset.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Export DE results to Excel. One column per (cluster, field) pair, named
# '<cluster>_<first letter of field>', in cluster-major order.
de_columns = {}
for cluster_name in groups:
    for field in ('names', 'pvals_adj', 'logfoldchanges'):
        de_columns[cluster_name + '_' + field[:1]] = result[field][cluster_name]

mouse_DE_df = pd.DataFrame(de_columns)
mouse_DE_df.to_excel("adata_subset_DE_top1000_unfiltered.xlsx", sheet_name='Top 1000 unfiltered')

# Look for enriched Gene Ontology Biological Process 2018 pathways

%matplotlib inline
%config InlineBackend.figure_format='retina' # mac
import gseapy as gp
from gseapy.plot import barplot, dotplot # only needed if you want bar and dot plots

# View available reference libraries (returns the list of library names)
names = gp.get_library_name()

# Gene-name column of every cluster from the DE table in `mouse_DE_df`.
# NaNs are dropped because they cause Enrichr to break.
C0, C1, C2, C3, C4, C5 = (
    mouse_DE_df['{}_n'.format(cluster_id)].dropna() for cluster_id in range(6)
)

# GO analysis: one Enrichr query per cluster against GO Biological Process 2018.
# Results are written to 'Enricher_analysis/Cluster<i>_GOBio'.
go_results = []
for cluster_id, cluster_genes in enumerate((C0, C1, C2, C3, C4, C5)):
    run_label = 'Cluster{}_GOBio'.format(cluster_id)
    go_results.append(
        gp.enrichr(
            gene_list=cluster_genes,
            description=run_label,
            gene_sets=['GO_Biological_Process_2018'],
            outdir='Enricher_analysis/' + run_label,
            cutoff=0.05,
        )
    )

# Keep the per-cluster result names used by the rest of the notebook
(Cluster0_GOBio, Cluster1_GOBio, Cluster2_GOBio,
 Cluster3_GOBio, Cluster4_GOBio, Cluster5_GOBio) = go_results

# Find GO terms unique to, and shared between, clusters 0 and 1

# Keep only statistically significant terms (adjusted p-value < 0.05)
c0_res = Cluster0_GOBio.res2d
c1_res = Cluster1_GOBio.res2d
c0_sig = c0_res[c0_res['Adjusted P-value'] < 0.05]
c1_sig = c1_res[c1_res['Adjusted P-value'] < 0.05]

c0_list = c0_sig['Term'].tolist() # wt
c1_list = c1_sig['Term'].tolist() # cancer

# Terms found only in C0, only in C1, and in both (common terms follow the order of C1)
at2_c0_unique_terms = [term for term in c0_list if term not in c1_list]
at2_c1_unique_terms = [term for term in c1_list if term not in c0_list]
at2_common_terms = [term for term in c1_list if term in c0_list]

In [None]:
# Export unique and common GO terms to Excel.
# The lists built by the cluster 0 vs cluster 1 comparison are named `at2_*`; the
# un-prefixed names used here previously were never defined.
pd.DataFrame(at2_c0_unique_terms).to_excel("Unique cluster 0 terms.xlsx", sheet_name='Sheet_name_1')
pd.DataFrame(at2_c1_unique_terms).to_excel("Unique cluster 1 terms.xlsx", sheet_name='Sheet_name_1')

# NOTE(review): no cluster 3 comparison is computed anywhere in this notebook, so
# `c3_unique_terms` is only exported when it has been defined elsewhere - confirm what
# cluster 3 should be compared against and add that step.
if 'c3_unique_terms' in globals():
    pd.DataFrame(c3_unique_terms).to_excel("Unique cluster 3 terms.xlsx", sheet_name='Sheet_name_1')

pd.DataFrame(at2_common_terms).to_excel("common terms.xlsx", sheet_name='Sheet_name_1')

In [None]:
# Plot the overlap of significant GO terms between clusters 0 and 1 as a Venn diagram;
# the region sizes should match the lengths of the unique/common term lists.

from matplotlib_venn import venn2

c0_term_set = set(c0_list)
c1_term_set = set(c1_list)
c = venn2(subsets=[c0_term_set, c1_term_set], set_labels=('C0', 'C1'))
plt.show()

In [None]:
# Calculate gene signatures. The principle is the same for all gene lists provided in the supplementary.

# Kras score based on the Bild et al. (Nature) Kras signature, also used in Barbie et al.
bild_kras = pd.read_excel(
    'Supplementary Table 2.xlsx',
    sheet_name='Bild et al. 2006 (Kras genes)',
    header=0,
)

# The signature's gene symbols are upper-case. Lower-case them and capitalize the first
# letter so they are compatible with the gene names of the dataset.
bild_kras_list = [symbol.lower().capitalize() for symbol in bild_kras['GeneSymbol'].tolist()]

# Drop signature genes that are not present in adata.var_names
dataset_genes = adata.var_names
bild_kras_list_final = [gene for gene in bild_kras_list if gene in dataset_genes]

# Calculating z-scores for single cells
# Derived from https://github.com/theislab/scanpy/issues/181

# Create a cell-barcode column from the obs index
adata_subset.obs['index1'] = adata_subset.obs.index

# Marker dictionary: score name -> list of signature genes
marker_dict = {'Kras z-score': bild_kras_list_final}

# create the function
def evaluate_partition(anndata, marker_dict, gene_symbol_key=None, partition_key=None):
    """Mean z-scored expression of each marker set within each group of a partition.

    The data matrix is scaled with ``sc.pp.scale`` on a copy of ``anndata``; for every
    marker set and every group of ``partition_key`` the mean of the scaled values over
    the group's cells and the set's genes is computed.

    Parameters
    ----------
    anndata
        AnnData object containing the data set and the partition.
    marker_dict
        Dictionary mapping a marker-set name to a list of marker genes. The genes must
        be given as ``anndata.var_names`` entries, or as values of the ``anndata.var``
        column named by ``gene_symbol_key``.
    gene_symbol_key
        Optional key of the ``anndata.var`` column holding the gene IDs or names that
        correspond to the marker genes. ``None`` uses ``anndata.var_names``.
    partition_key
        Key of the ``anndata.obs`` column holding the group IDs. It has no usable
        default: it must name an existing column. Passing a column with one unique
        value per cell yields one score per cell.

    Returns
    -------
    pandas.DataFrame
        Rows are the marker-set names, columns are the unique partition values.

    Raises
    ------
    KeyError
        If ``partition_key`` is not a column of ``anndata.obs``, or ``gene_symbol_key``
        is given but is not a column of ``anndata.var``.
    ValueError
        If no score could be computed for any group (every per-group variance is NaN).
    """
    # Validate inputs before the scaling step
    if partition_key not in anndata.obs.columns.values:
        raise KeyError(
            'The partition key was not found in the passed AnnData object. '
            'Have you done the clustering? If so, please pass the cluster IDs '
            'with the AnnData object!'
        )

    if (gene_symbol_key is not None) and (gene_symbol_key not in anndata.var.columns.values):
        raise KeyError(
            'The provided gene symbol key was not found in the passed AnnData object. '
            'Check that your cell type markers are given in a format that your '
            'anndata object knows!'
        )

    if gene_symbol_key:
        gene_ids = anndata.var[gene_symbol_key]
    else:
        gene_ids = anndata.var_names

    clusters = np.unique(anndata.obs[partition_key])
    marker_res = np.zeros((len(marker_dict), len(clusters)))

    # Scale a copy so the caller's object is left untouched
    z_scores = sc.pp.scale(anndata, copy=True)
    partition_labels = np.asarray(z_scores.obs[partition_key])

    for i, group in enumerate(marker_dict):
        # The gene mask depends only on the marker set, so build it once per set
        marker_genes = np.isin(gene_ids, marker_dict[group])
        for j, clust in enumerate(clusters):
            cluster_cells = np.isin(partition_labels, clust)
            marker_res[i, j] = z_scores.X[np.ix_(cluster_cells, marker_genes)].mean()

    # A group's variance is NaN only if every marker set produced NaN for it
    variances = np.nanvar(marker_res, axis=0)
    if np.all(np.isnan(variances)):
        raise ValueError(
            'No variances could be computed, check if your cell markers are in the '
            'data set. Maybe the cell marker IDs do not correspond to your '
            'gene_symbol_key input or the var_names'
        )

    marker_res_df = pd.DataFrame(marker_res, columns=clusters, index=marker_dict.keys())

    return marker_res_df

   # Return the median of all the variances over the clusters
    #marker_matches = ([np.median(variances), marker_res_df])
    
   # return marker_matches

# Calculate the z-score. Using the per-cell barcode column 'index1' as the partition
# gives one score per cell instead of one per cluster.
df = evaluate_partition(adata_subset, marker_dict, gene_symbol_key=None, partition_key='index1')

# Transpose so cells are rows and the score name is the column
df_transposed = df.transpose()

# Add the score to adata_subset.obs (values are aligned on the cell barcodes)
adata_subset.obs['Kras z-score'] = df_transposed['Kras z-score']

# Save the data. `results_file` is not defined until a later cell, so write to the
# subset results path defined when the subset was created.
adata_subset.write(subset_results_file)

In [None]:
#### You can download the annotated adata_subset file and begin data analysis straight away! ####

# Load the adata_subset file shown in the figures. This replaces any `adata_subset`
# already in memory.
# NOTE(review): this path differs from './write/adata_subset.h5ad' used when the subset
# is saved earlier in the notebook - presumably it points at the downloaded file; confirm.
results_file = 'adata_subset.h5ad'  
adata_subset = sc.read(results_file)

In [None]:
# Hope the data and code were useful!