# Generate "clean" B cell subset

In this notebook, we'll read the cleaned PBMC reference dataset, select the B cell L1 cell type, and generate a subset AnnData object with an updated UMAP projection.

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce

In [None]:
def read_adata_uuid(h5ad_uuid):
    """Read an .h5ad file from the local cache, caching it via hisepy first if needed.

    Parameters
    ----------
    h5ad_uuid : str
        File UUID; also used as the name of the cache sub-directory.

    Returns
    -------
    The object returned by ``sc.read_h5ad`` for the first (name-sorted)
    entry found in the cache directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or empty after the caching attempt.
    """
    h5ad_path = os.path.join('/home/jupyter/cache', h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # The return value is not used; presumably this call populates h5ad_path.
        hisepy.reader.cache_files([h5ad_uuid])
    # Sort so the chosen file is deterministic if the directory has several entries.
    cached_names = sorted(os.listdir(h5ad_path))
    if not cached_names:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = h5ad_path)
        )
    h5ad_file = os.path.join(h5ad_path, cached_names[0])
    return sc.read_h5ad(h5ad_file)

In [None]:
# Directory that receives every file this notebook writes; created on first run.
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

In [None]:
# Paths of the files written below; the full list is passed to the upload step.
out_files = []

## Read annotated dataset

In [None]:
# File UUID of the annotated dataset; also recorded as the upload's input file.
h5ad_uuid = '6e8972a5-9463-4230-84b4-a20de055b9c3'

In [None]:
# Load the dataset from the local cache (cached first if not already present).
adata = read_adata_uuid(h5ad_uuid)

In [None]:
# (n_obs, n_vars) of the full dataset, for comparison with the subset below.
adata.shape

## Generate B cell object

In [None]:
# cell_class is embedded in the output file names;
# keep_labels lists the AIFI_L1 values that define the subset.
cell_class = 'b-cells'
keep_labels = ['B cell']

In [None]:
# Boolean mask over cells whose AIFI_L1 label is one of keep_labels.
l1_keep_mask = adata.obs['AIFI_L1'].isin(keep_labels)
adata_subset = adata[l1_keep_mask]
adata_subset.shape

In [None]:
# Drop category levels that no longer occur in the subset from each label column,
# working on a copy of obs and assigning it back afterwards.
obs = adata_subset.obs.copy()
for label_col in ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']:
    obs[label_col] = obs[label_col].cat.remove_unused_categories()
adata_subset.obs = obs

### Re-project within the subset

In [None]:
# Rebuild the subset from its stored .raw data so processing restarts from there.
# NOTE(review): presumably .raw holds unnormalized counts for all genes — confirm.
adata_subset = adata_subset.raw.to_adata()

In [None]:
# Snapshot the current state into .raw before normalization and gene filtering.
adata_subset.raw = adata_subset

In [None]:
# Normalize each cell to a total of 1e4 counts.
sc.pp.normalize_total(adata_subset, target_sum = 1e4)

In [None]:
# Log-transform, then flag highly variable genes (default settings);
# the flag is read from var['highly_variable'] in the next step.
sc.pp.log1p(adata_subset)
sc.pp.highly_variable_genes(adata_subset)

In [None]:
# Keep only genes flagged as highly variable; .copy() detaches the slice from its parent.
hvg_flags = adata_subset.var['highly_variable']
hvg_names = adata_subset.var_names[hvg_flags]
adata_subset = adata_subset[:, hvg_names].copy()

### Remove Ig Genes (for B cells only)

In [None]:
# Collect genes to exclude by symbol prefix: IGL*, IGK* and IGH*.
# ighc_genes holds every IGH-prefixed symbol, not only constant-region genes.
# NOTE(review): a bare prefix match may also catch non-Ig symbols
# (e.g. IGHMBP2, IGLL1) if they are present — confirm this is intended.
var_symbols = list(adata_subset.var_names)
igl_genes = [sym for sym in var_symbols if sym.startswith("IGL")]
igk_genes = [sym for sym in var_symbols if sym.startswith("IGK")]
ighc_genes = [sym for sym in var_symbols if sym.startswith("IGH")]
exl_genes = [*igl_genes, *igk_genes, *ighc_genes]

In [None]:
# Remove the excluded Ig genes from the current gene set.
# A set gives O(1) membership tests instead of scanning the list for every gene.
exl_gene_set = set(exl_genes)
filtered_genes = [gene for gene in adata_subset.var_names if gene not in exl_gene_set]
# .copy() materializes the slice (as in the HVG step) so the in-place steps
# that follow (scaling, PCA) work on a real object rather than a view.
adata_subset = adata_subset[:, filtered_genes].copy()

In [None]:
# Scale each remaining gene with scanpy's default settings.
sc.pp.scale(adata_subset)

In [None]:
# PCA on the scaled matrix using the ARPACK solver.
sc.tl.pca(adata_subset, svd_solver='arpack')

In [None]:
# Harmony integration across the 'cohort.cohortGuid' groups, capped at 30 iterations.
# The neighbors step below reads the result from obsm['X_pca_harmony'].
sce.pp.harmony_integrate(adata_subset, 'cohort.cohortGuid', max_iter_harmony = 30)

In [None]:
# Neighbor graph built on the Harmony-corrected representation.
sc.pp.neighbors(
    adata_subset,
    use_rep = 'X_pca_harmony',
    n_pcs = 30,
    n_neighbors = 50
)

In [None]:
# Compute the UMAP embedding for the subset; it is read back from obsm['X_umap'] below.
sc.tl.umap(adata_subset, min_dist = 0.05)

In [None]:
# Subset UMAP colored by AIFI_L2 label.
sc.pl.umap(adata_subset, color = 'AIFI_L2')

In [None]:
# Same AIFI_L2 coloring, with the labels drawn on the embedding itself.
sc.pl.umap(
    adata_subset, color = 'AIFI_L2',
    legend_loc = 'on data', legend_fontsize = 6, legend_fontweight = 'normal'
)

In [None]:
# Subset UMAP colored by AIFI_L3 label.
sc.pl.umap(adata_subset, color = 'AIFI_L3')

In [None]:
# Same AIFI_L3 coloring, with the labels drawn on the embedding itself.
sc.pl.umap(
    adata_subset, color = 'AIFI_L3',
    legend_loc = 'on data', legend_fontsize = 6, legend_fontweight = 'normal'
)

### Output subset annotations

In [None]:
# Work on a re-indexed copy of the cell metadata (0..n-1) so adata_subset.obs is untouched.
# NOTE(review): drop = True discards the original obs index — confirm the cell
# identifiers are also stored in a regular column.
obs = adata_subset.obs.reset_index(drop = True)

In [None]:
# Attach the two UMAP coordinates to the metadata table.
# umap_df gets a default 0..n-1 index; assignment aligns rows on that index.
umap_mat = adata_subset.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
for umap_col in ['umap_1', 'umap_2']:
    obs[umap_col] = umap_df[umap_col]

In [None]:
# Preview the metadata table with the added umap_1 / umap_2 columns.
obs.head()

In [None]:
# Write the metadata + UMAP table as CSV (no index column) and register it for upload.
obs_out_csv = '{p}/ref_pbmc_{c}_clean_labeled_meta_umap_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
obs.to_csv(obs_out_csv, index = False)
out_files.append(obs_out_csv)

In [None]:
# Write the same table as parquet (no index column) and register it for upload.
obs_out_parquet = '{p}/ref_pbmc_{c}_clean_labeled_meta_umap_{d}.parquet'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
obs.to_parquet(obs_out_parquet, index = False)
out_files.append(obs_out_parquet)

### Output subset AnnData

In [None]:
# Write the re-projected subset (adata_subset), not the full `adata` dataset,
# so the file contents match the cell-class-specific file name.
out_h5ad = '{p}/ref_pbmc_{c}_clean_labeled_{d}.h5ad'.format(p = out_dir, c = cell_class, d = date.today())
adata_subset.write_h5ad(out_h5ad)
out_files.append(out_h5ad)

## Upload results to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [None]:
# Destination study space and a date-stamped title for the upload.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'Clean PBMC Reference B cells {d}'.format(d = date.today())

In [None]:
# Input file UUIDs passed to the upload as input_file_ids.
in_files = [h5ad_uuid]

In [None]:
# Display the input list for a visual check before uploading.
in_files

In [None]:
# Display the output paths collected above for a visual check before uploading.
out_files

In [None]:
# Send every collected output file to the study space, with the input UUIDs attached.
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

In [None]:
# Record the package versions used for this run.
import session_info
session_info.show()