# Figure 1

This notebook recreates a subset of the visualizations in Figure 1 of the reference article, once for the real cells and once for the noise cells.

## 0. Initializations

In [None]:
# -- imports --
import anndata as ad
import numpy as np
import scanpy as sc

from scipy import sparse

from signals_in_the_noise.preprocessing.gse161529 import GSE161529

In [None]:
def get_combined_dataset(is_noise):
    """Combine every epithelial cell-typing file into one AnnData.

    For each file in ``gse.EPI_CELL_TYPING_FILENAMES`` the cells are filtered
    on ``obs['is_noise']``, passed through ``gse.annotate_epithial_cell_typing``,
    stripped of cells whose ``predicted_type`` mentions "stromal", reduced to
    that file's highly variable genes and densified. The per-file results are
    then concatenated with an inner join.

    Relies on the notebook-level ``gse`` object, which must exist before the
    function is called.

    Parameters
    ----------
    is_noise : bool
        ``True`` keeps cells with ``is_noise == 1``; ``False`` keeps cells
        with ``is_noise == 0``.

    Returns
    -------
    anndata.AnnData
        The combined data. ``obs['specimen_id']`` is a categorical column
        holding the index of the source file as text (``'0'``, ``'1'``, ...),
        with categories in numeric order.
    """
    adatas = []
    is_noise_val = 1 if is_noise else 0
    for idx, filename in enumerate(gse.EPI_CELL_TYPING_FILENAMES):
        adata = gse.get_dataset(filename)
        # prefix with the filename so cell names stay unique after concatenation
        adata.obs_names = [f"{filename}_{i}" for i in range(adata.n_obs)]
        # keep only the requested population (real or noise cells)
        adata = adata[adata.obs['is_noise']==is_noise_val].copy()
        adata = gse.annotate_epithial_cell_typing(adata)
        # remove stromal cells - "...removed the stromal subset..."
        mask = ~adata.obs['predicted_type'].str.lower().str.contains('stromal')
        adata = adata[mask].copy()
        # re-run highly variable genes after removing stromal cells
        sc.pp.highly_variable_genes(adata, flavor='seurat')
        adata = adata[:, adata.var.highly_variable].copy()
        # arbitrarily assign a unique id to each dataset as proxy for subject id;
        # it is a label, not a quantity, so store it as text - a numeric column
        # would be plotted on a continuous color scale
        adata.obs['specimen_id'] = str(idx)
        if sparse.issparse(adata.X):
            adata.X = adata.X.toarray()
        adatas.append(adata)
    combined = ad.concat(adatas, join='inner')
    # make the label explicitly categorical and order it 0, 1, 2, ... rather
    # than lexically ('0', '1', '10', ...)
    specimen = combined.obs['specimen_id'].astype('category')
    combined.obs['specimen_id'] = specimen.cat.reorder_categories(
        sorted(specimen.cat.categories, key=int)
    )
    return combined

def figure_1_tsne(adata_all, color, recompute=False):
    """Embed ``adata_all`` with t-SNE and plot the embedding colored by ``color``.

    The first call scales the data in place and then runs PCA, the neighbor
    graph, Leiden clustering and t-SNE, storing every result on ``adata_all``.
    Later calls on the same object reuse the stored embedding, so drawing a
    second coloring neither rescales the matrix again nor repeats the pipeline.

    The seed and representation come from the notebook-level ``random_kwargs``
    dict, which must be defined before the first call. Changing that dict
    invalidates the stored embedding.

    Parameters
    ----------
    adata_all : anndata.AnnData
        Data to embed; modified in place.
    color : str or list of str
        Forwarded to ``sc.pl.tsne`` as ``color``.
    recompute : bool, default False
        Run the whole pipeline again even if a stored embedding is available.
    """
    # Marker written by this function only; checking for 'X_tsne' alone could
    # mistake an embedding that came with the data for one produced here.
    params_key = 'figure_1_tsne_params'
    current_params = dict(random_kwargs)
    is_cached = (
        adata_all.uns.get(params_key) == current_params
        and 'X_tsne' in adata_all.obsm
    )
    if recompute or not is_cached:
        sc.pp.scale(adata_all)
        sc.pp.pca(adata_all, random_state=random_kwargs['random_state'])
        sc.pp.neighbors(adata_all, **random_kwargs)
        # use of leiden and resolution specified in caption for Figure 1E
        sc.tl.leiden(adata_all, resolution=0.015)
        sc.tl.tsne(adata_all, **random_kwargs)
        adata_all.uns[params_key] = current_params
    sc.pl.tsne(adata_all, color=color)

In [None]:
# -- dataset --
gse = GSE161529()
adata_all_real = get_combined_dataset(is_noise=False)
adata_all_noise = get_combined_dataset(is_noise=True)

# each file was cut down to its own variable genes before merging;
# select variable genes once more on the merged real-cell matrix ...
sc.pp.highly_variable_genes(adata_all_real, flavor='seurat')
adata_all_real = adata_all_real[:, adata_all_real.var['highly_variable']].copy()

# ... and on the merged noise-cell matrix
sc.pp.highly_variable_genes(adata_all_noise, flavor='seurat')
adata_all_noise = adata_all_noise[:, adata_all_noise.var['highly_variable']].copy()

# -- constants --
# read by figure_1_tsne: unpacked into the neighbors and t-SNE calls,
# while PCA takes only the seed
random_kwargs = dict(use_rep='X_pca', random_state=43)

## 1. Visualization E (t-SNE colored by cell clusters)

### 1.1. Real cells

In [None]:
# t-SNE of the combined real cells, colored by the 'predicted_type' annotation.
# NOTE(review): the section heading says "cell clusters" and figure_1_tsne
# computes Leiden clusters, but the plot is colored by 'predicted_type' -
# confirm this is the intended coloring.
figure_1_tsne(adata_all_real, color=['predicted_type'])

### 1.2. Noise cells

In [None]:
# t-SNE of the combined noise cells, colored by the 'predicted_type' annotation.
figure_1_tsne(adata_all_noise, color=['predicted_type'])

## 2. Visualization C (t-SNE colored by tissue specimens)

### 2.1. Real cells

In [None]:
# adata = gse.get_dataset(gse.EPI_CELL_TYPING_FILENAMES[0])

In [None]:
# t-SNE of the combined real cells, colored by 'specimen_id' - the per-file
# index that get_combined_dataset assigns as a stand-in for the subject.
figure_1_tsne(adata_all_real, color=['specimen_id'])

### 2.2. Noise cells

In [None]:
# t-SNE of the combined noise cells, colored by 'specimen_id' - the per-file
# index that get_combined_dataset assigns as a stand-in for the subject.
figure_1_tsne(adata_all_noise, color=['specimen_id'])