In [None]:
from matplotlib import pyplot as plt
import scanpy as sc
import numpy as np
import pandas as pd

Set the data directory to wherever you downloaded the relevant data from NCBI GEO, accession [GSE114687](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE114687).

In [None]:
# Directory prefix for this dataset's files. It is joined to file names by plain
# string concatenation (no separator is inserted), so a non-empty value must end
# with a path separator, e.g. 'data/'.
mcfaline_data_directory = ''

In [None]:
# Load the count matrix and the gene table from the configured data directory.
# Take the transpose because Seurat defines genes as rows.
counts = sc.read_mtx(mcfaline_data_directory + 'pseudospace_matrix.mtx.gz').T
# NOTE(review): a single column name is supplied together with index_col=1 --
# confirm the gene table ends up indexed by the intended (presumably gene-name) column.
genes = pd.read_csv(
    mcfaline_data_directory + 'pseudospace_genes.tsv.gz',
    sep='\t',
    names=['gene_ids'],
    index_col=1,
)


In [None]:
# Load the per-cell metadata and index it by the cell identifier.
# FIXME(review): `humec_metadata` is not defined in any visible cell, so the column
# names depend on state from elsewhere. Replace it with an explicit list of column
# names; it must include 'cell' as well as the 'treatment_id' and 'spatial_id'
# columns that later cells read.
metadata = pd.read_csv(
    mcfaline_data_directory + 'pseudospace_metadata.tsv.gz',
    sep='\t',
    names=humec_metadata.columns[1:],
)
# Move the 'cell' column into the index (dropping it from the columns) and clear
# the index name.
metadata = metadata.set_index('cell')
metadata.index.name = None

In [None]:
# Define the annotated data: start from the loaded count matrix, then attach the
# per-cell metadata table as `obs` and the gene table as `var`.
# NOTE(review): this assumes the rows of `metadata` and `genes` are in the same
# order as the rows/columns of the matrix -- confirm against the source files.
mcfaline = sc.AnnData(X=counts.X)
mcfaline.obs = metadata
mcfaline.var = genes
# Make the variable (gene) names unique.
mcfaline.var_names_make_unique()

In [None]:
# We only care about the mock treatment. Take an explicit copy so that `mcfaline`
# is a standalone AnnData object rather than a view of the full dataset: later
# cells add a layer to it and normalise it in place.
mcfaline = mcfaline[mcfaline.obs['treatment_id'] == 'Mock'].copy()

In [None]:
# Keep an untouched copy of the raw counts in a layer before X is transformed.
mcfaline.layers['counts'] = mcfaline.X.copy()

# Normalise every cell to a fixed total count, then apply log1p.
counts_per_cell = 1e4
sc.pp.normalize_total(mcfaline, target_sum=counts_per_cell)
sc.pp.log1p(mcfaline)

In [None]:
# Select the highly-variable genes with Scanpy's CellRanger flavour, passing the
# 'spatial_id' annotation as the batch key.
target_genes = 2000
sc.pp.highly_variable_genes(
    mcfaline,
    n_top_genes=target_genes,
    flavor='cell_ranger',
    batch_key='spatial_id',
)

In [None]:
# Build the k-nearest-neighbour graph used for clustering: run PCA restricted to
# the highly-variable genes, then compute the neighbour graph.
sc.pp.pca(mcfaline, use_highly_variable=True)
knn_size = 30
sc.pp.neighbors(mcfaline, n_neighbors=knn_size)

In [None]:
# Leiden clustering; the cluster labels are stored under the 'leiden' key.
leiden_resolution = 0.3
sc.tl.leiden(mcfaline, key_added='leiden', resolution=leiden_resolution)

In [None]:
# Rank genes for each Leiden cluster (Wilcoxon test) on the object that was clustered.
# NOTE(review): key_added='leiden' reuses the clustering key name for the ranking
# results -- confirm this does not clobber anything sc.tl.leiden stored under it.
sc.tl.rank_genes_groups(mcfaline, 'leiden', key_added='leiden', method='wilcoxon')

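Label the clusters with cell-state names, then plot the cell-state composition of each spatial region (and the spatial composition of each cell state) to show why we've defined the states this way.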

In [None]:
# Replace the Leiden cluster labels with cell-state names. The names are matched
# to the existing categories by position, so this list must line up with the
# clusters produced by the clustering step.
new_cluster_names = [
    'Intermediate 1',
    'Intermediate 2',
    'Epithelial',
    'Mesenchymal',
]
mcfaline.rename_categories('leiden', new_cluster_names)

In [None]:
mcfaline_df = mcfaline.obs

# Fraction of each cell state within each spatial region (each row sums to 1),
# with the regions in a fixed order.
state_by_region = pd.crosstab(
    mcfaline_df['spatial_id'],
    mcfaline_df['leiden'],
    normalize='index',
).reindex(['inner', 'outer'])

# Stacked bar chart with the legend placed outside the axes on the right.
axis = state_by_region.plot.bar(
    stacked=True,
    width=0.9,
    grid=False,
    figsize=(6, 10),
    linewidth=1.0,
)
axis.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = axis.get_figure()

In [None]:
mcfaline_df = mcfaline.obs

# Fraction of each spatial region within each cell state (each row sums to 1),
# with the cell states in a fixed order.
region_by_state = pd.crosstab(
    mcfaline_df['leiden'],
    mcfaline_df['spatial_id'],
    normalize='index',
).reindex(['Epithelial', 'Intermediate 1', 'Intermediate 2', 'Mesenchymal'])

# Stacked bar chart with the legend placed outside the axes on the right.
axis = region_by_state.plot.bar(
    stacked=True,
    width=0.9,
    grid=False,
    figsize=(6, 10),
    linewidth=1.0,
)
axis.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig = axis.get_figure()

In [None]:
# Write the processed object to a gzip-compressed .h5ad file in the data directory.
output_path = mcfaline_data_directory + 'mcfaline19_merged.h5ad'
mcfaline.write(output_path, compression='gzip')