# Cluster filtered cells

Now that we've filtered out possible doublets and low-quality cells, we'll re-cluster and re-project the cells within each cell class group to examine our cell type labels.

Because this process can take substantial time, we'll split this across 4 notebooks that can be executed in parallel across instances (16 inputs per instance), and parallelize processing within each instance (4 worker threads at a time).

## Load packages

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [None]:
# Directory that receives the processed .h5ad files; created on first use.
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

## Helper functions

We'll use these helper functions to read our files stored in HISE, then restore the full set of data, renormalize, and select highly variable genes. Then, we'll run PCA and perform Leiden clustering, as well as UMAP to visualize the data.

In [None]:
def cache_uuid_path(uuid):
    """Return the local cache path of the file for a HISE file UUID.

    The file is looked up under ``/home/jupyter/cache/<uuid>``. If that
    directory is missing or contains no files, the file is requested with
    ``hisepy.reader.cache_files`` before looking again.

    Parameters
    ----------
    uuid : str
        HISE file UUID.

    Returns
    -------
    str
        Path to the first file listed in the UUID's cache directory.

    Raises
    ------
    FileNotFoundError
        If no file is present in the cache directory even after requesting
        the download.
    """
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)

    def list_cached():
        # A missing directory is treated the same as an empty one
        return os.listdir(cache_path) if os.path.isdir(cache_path) else []

    filenames = list_cached()
    if not filenames:
        # Nothing usable on disk yet (never cached, or an empty directory
        # left behind by an interrupted download): fetch it now.
        hisepy.reader.cache_files([uuid])
        filenames = list_cached()

    if not filenames:
        raise FileNotFoundError(
            'No cached file found for {u} in {p}'.format(u = uuid, p = cache_path)
        )

    return '{p}/{f}'.format(p = cache_path, f = filenames[0])

In [None]:
def read_adata_uuid(uuid):
    """Read the cached .h5ad file for a HISE file UUID with scanpy and return the result."""
    return sc.read_h5ad(cache_uuid_path(uuid))

In [None]:
def process_adata(adata):
    """Re-cluster and re-project one dataset.

    Steps, in order: rebuild the object from ``adata.raw``, normalize and
    log-transform, subset to highly variable genes, scale, run PCA, build the
    neighbor graph, run Leiden clustering (stored under ``leiden_2``) and UMAP.
    Finally the object is rebuilt from ``.raw`` once more and normalized and
    log-transformed again before it is returned.

    Parameters
    ----------
    adata
        Object read from an .h5ad file; its ``.raw`` attribute must be set.

    Returns
    -------
    The object rebuilt from ``.raw`` after clustering, normalized and
    log-transformed.
    """

    def restore_from_raw(current):
        # Rebuild the object from .raw and register the rebuilt object as .raw
        restored = current.raw.to_adata()
        restored.raw = restored
        return restored

    def log_normalize(current):
        # Normalize and log transform, in place
        sc.pp.normalize_total(current)
        sc.pp.log1p(current)

    adata = restore_from_raw(adata)

    print('Normalizing')
    log_normalize(adata)

    print('Finding HVGs')
    # Flag variable genes, then keep only those columns for the steps below
    sc.pp.highly_variable_genes(adata)
    hvg_mask = adata.var['highly_variable']
    hvg_names = adata.var_names[hvg_mask]
    adata = adata[:, hvg_names].copy()
    print(adata.shape)

    print('Scaling')
    sc.pp.scale(adata)

    print('PCA')
    sc.tl.pca(adata, svd_solver = 'arpack')

    print('Neighbors')
    sc.pp.neighbors(adata, n_neighbors = 50, n_pcs = 30)

    print('Leiden')
    sc.tl.leiden(adata, resolution = 2, key_added = 'leiden_2')

    print('UMAP')
    sc.tl.umap(adata, min_dist = 0.05)

    print('Renormalizing')
    # Go back to the .raw contents (not just the variable genes) before
    # returning. NOTE(review): presumably the clustering/UMAP results are
    # carried over by raw.to_adata() -- confirm against the anndata version in use.
    adata = restore_from_raw(adata)
    log_normalize(adata)

    return adata

## Identify files in HISE

In [None]:
# Maps each cell group name to the HISE file UUID of its input .h5ad file.
# Only the uncommented entries are processed by this notebook; the
# commented-out entries are presumably handled by the other notebooks of the
# 4-way split described at the top -- confirm before re-enabling any of them.
h5ad_uuids = {
    # 'ASDC': '0df777eb-6c6a-4892-89e8-3ab6ebfade89',
    # 'BR1_Female_Negative_CD14-monocyte': 'beea5765-b3c0-4db1-a1c2-ada2ef51ee26',
    # 'BR1_Female_Negative_CD56dim-NK-cell': '426366d0-fdf7-4ab2-8339-0baefe80d096',
    # 'BR1_Female_Negative_Memory-CD4-T-cell': 'a40e7454-4637-4ad7-b222-bd29716aa027',
    # 'BR1_Female_Negative_Memory-CD8-T-cell': 'de83b600-3cc6-40ba-acb6-613c12e178ac',
    # 'BR1_Female_Negative_Naive-CD4-T-cell': 'b55301f1-2289-45a6-b14d-b1ee31a7f11c',
    # 'BR1_Female_Positive_CD14-monocyte': '8496f50f-38f6-4f0d-a50c-0835268c42b5',
    # 'BR1_Female_Positive_CD56dim-NK-cell': '8dd11b33-9065-460f-bcea-88a3092bf662',
    # 'BR1_Female_Positive_Memory-CD4-T-cell': '8f7afec7-194b-4e7c-9a27-2fa4e47b6085',
    # 'BR1_Female_Positive_Memory-CD8-T-cell': 'aa8b8b0f-164f-4fba-8af9-8397d9e67cd7',
    # 'BR1_Female_Positive_Naive-CD4-T-cell': '5a50a26e-1a56-4239-ba8e-dc0b8f3ef91d',
    # 'BR1_Male_Negative_CD14-monocyte': '6dfc083e-0392-438a-aa79-4b6c79acd55b',
    # 'BR1_Male_Negative_CD56dim-NK-cell': '6edf4d9d-f29a-4c7c-bc86-7e87b53ca9f5',
    # 'BR1_Male_Negative_Memory-CD4-T-cell': 'cdf5d83f-603d-4089-a5b9-9f04783329d8',
    # 'BR1_Male_Negative_Memory-CD8-T-cell': '8cc6a9b1-0ec7-445e-8299-556a4f95cb66',
    # 'BR1_Male_Negative_Naive-CD4-T-cell': '9997103a-4f8a-4e00-abc1-61a9674cf01b',
    
    # 'BR1_Male_Positive_CD14-monocyte': '4f0d6c22-21d5-4410-b603-8b485487f42b',
    # 'BR1_Male_Positive_CD56dim-NK-cell': '3788e5c0-5fee-4f6d-b108-77d9da289a7f',
    # 'BR1_Male_Positive_Memory-CD4-T-cell': 'f104fa2c-9054-4403-afd4-e17048d93d75',
    # 'BR1_Male_Positive_Memory-CD8-T-cell': 'df7c9a0a-3b5e-48d9-91de-2bd424daa44a',
    # 'BR1_Male_Positive_Naive-CD4-T-cell': '2aaa8f67-c64b-450c-9037-7a7cbde2c3e0',
    # 'BR2_Female_Negative_CD14-monocyte': '2906d91a-c989-4b1d-b343-aea2e23de036',
    # 'BR2_Female_Negative_CD56dim-NK-cell': '82b143e9-0dd6-4ad9-b59e-6feb135f5c0c',
    # 'BR2_Female_Negative_Memory-CD4-T-cell': '050e56f7-fe83-4196-8fd4-d55292ed5cfa',
    # 'BR2_Female_Negative_Memory-CD8-T-cell': 'b8550f9d-b4aa-4ba7-955b-cf556fabb21d',
    # 'BR2_Female_Negative_Naive-CD4-T-cell': '82a127b5-7025-41db-8d88-5347055a5268',
    # 'BR2_Female_Positive_CD14-monocyte': '42c2b3e1-f3ec-4a44-9137-3db3ead74454',
    # 'BR2_Female_Positive_CD56dim-NK-cell': 'eda7be7b-7ba6-4832-83b9-8b210319c078',
    # 'BR2_Female_Positive_Memory-CD4-T-cell': '760961f6-4707-48b1-a2f4-33efc816be28',
    # 'BR2_Female_Positive_Memory-CD8-T-cell': 'd76d8ee6-6b85-42e4-9974-36c6ef4b0538',
    # 'BR2_Female_Positive_Naive-CD4-T-cell': '56630d0b-cdd9-43b0-8da4-9f7227b35190',
    # 'BR2_Male_Negative_CD14-monocyte': '47fff457-438f-46ba-ab37-9876bbbb1f18',
    
    'BR2_Male_Negative_CD56dim-NK-cell': '6dcc8d60-7b43-40f1-90b3-2390d09e4bbc',
    'BR2_Male_Negative_Memory-CD4-T-cell': 'ab6f751f-76df-42eb-be88-0315cf2d7c10',
    'BR2_Male_Negative_Memory-CD8-T-cell': '10d6ff38-dabd-4439-90e4-28d2c932d81f',
    'BR2_Male_Negative_Naive-CD4-T-cell': '22d42b15-1ae2-4b3f-8b6d-39dce427f765',
    'BR2_Male_Positive_CD14-monocyte': '41094c89-3ab0-43b8-a683-9b3b5a5b3653',
    'BR2_Male_Positive_CD56dim-NK-cell': 'f84657c6-d7df-42f9-9a42-5d3aa2e5c4c2',
    'BR2_Male_Positive_Memory-CD4-T-cell': '35215ceb-9dcc-41bd-abaf-1973de95b3d2',
    'BR2_Male_Positive_Memory-CD8-T-cell': '37d23649-4670-4ac2-9dd0-e0de0fce573d',
    'BR2_Male_Positive_Naive-CD4-T-cell': '87c3b749-c177-4fa1-8747-c8faa4e4859e',
    'CD16-monocyte': '39cb92e1-8053-4607-8fe8-a6dd8632b32a',
    'CD56bright-NK-cell': 'd4960075-6eba-4d79-9157-5f8259bbeedf',
    'CD8aa': 'da603bcc-e5f3-4ede-889f-e055b60d054d',
    'cDC1': '8728233c-e99c-4c93-921e-18f56ba72b75',
    'cDC2': '74207276-23b2-4943-8c02-02ca4d12473b',
    'DN-T-cell': '7c528158-b0c3-473a-9256-54bdaf510d66',
    'Effector-B-cell': '6d849b19-68be-4243-807a-9ee59e6c962a',
    
    # 'Erythrocyte': '2adc98ac-79a8-4d9c-9faf-55e7d81d0adf',
    # 'gdT': 'ec9f2e39-a65c-4e1a-b7e8-80b818d06166',
    # 'ILC': '4c885520-849a-4ffb-9490-90c7cc3d25df',
    # 'Intermediate-monocyte': 'c7bb1b35-03b4-4acb-bb52-b75a0c5efcdf',
    # 'MAIT': '35912dbb-1911-45fb-8c20-5e3794b3961b',
    # 'Memory-B-cell': '38977bd9-bb58-491d-a293-c8a83585b21a',
    # 'Naive-B-cell': 'f1f95172-4746-4632-b4ce-52e36f1328ca',
    # 'Naive-CD8-T-cell': 'b9cbc1b4-6e6e-4bfe-a729-7cf5a240f764',
    # 'pDC': 'ae861863-fd93-42aa-91d0-7a66c3528831',
    # 'Plasma-cell': '982321d2-4b2a-4dc3-b2f7-3c22a8692039',
    # 'Platelet': '10bd3fda-ad31-42b3-b440-78ba92e148a9',
    # 'Progenitor-cell': '700298cc-b5a9-45c4-97f6-9c15d3a11b24',
    # 'Proliferating-NK-cell': 'cc3b4985-3b83-4645-a472-cbf823e46a46',
    # 'Proliferating-T-cell': 'd86d1487-e394-4605-bd4f-f0743cb670bf',
    # 'Transitional-B-cell': 'a512e0ff-1535-4dfc-b9b4-fd51b7938efd',
    # 'Treg': '1a59aa90-eb0e-4250-9ffd-df6b1f801cb0'
}

## Process each dataset

In [None]:
def process_file(group_name):
    """Process one cell group and return the path of its output .h5ad file.

    The output path is built from the group name and today's date. If that
    file already exists the group is skipped; otherwise the input is read via
    its UUID in ``h5ad_uuids``, run through ``process_adata`` and written out.

    The function only returns the path and does not touch any shared list, so
    it is safe to run from several worker threads and the caller decides how
    to collect the results.

    Parameters
    ----------
    group_name : str
        Key of ``h5ad_uuids`` identifying the cell group to process.

    Returns
    -------
    str
        Path of the (existing or newly written) output file.
    """
    print(group_name)
    out_file = 'output/diha_{g}_clean_{d}.h5ad'.format(g = group_name, d = date.today())

    if os.path.isfile(out_file):
        # Output from an earlier run today is reused as-is
        print('Previously filtered {g}; Skipping.'.format(g = group_name))
        return out_file

    uuid = h5ad_uuids[group_name]
    adata = read_adata_uuid(uuid)
    adata = process_adata(adata)
    adata.write_h5ad(out_file)
    return out_file

In [None]:
# Process every group listed in h5ad_uuids, 4 at a time, and collect the
# output file paths for the upload step.
group_names = list(h5ad_uuids.keys())

# out_files must exist before any worker runs; it was previously never
# initialised, which raised NameError on the first append.
out_files = []
with ThreadPoolExecutor(max_workers = 4) as executor:
    # executor.map yields results in the order of group_names
    for result in executor.map(process_file, group_names):
        # Add each path only once so the upload list never contains duplicates
        if result not in out_files:
            out_files.append(result)

## Upload Cell Type data to HISE

Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE to use for downstream analysis steps.

In [None]:
# Study space UUID and title passed to the upload call; the title ends with today's date.
study_space_uuid = 'de025812-5e73-4b3c-9c3b-6d0eac412f2a'
title = 'DIHA PBMC AIFI_L2 Clean .h5ad 3 of 4 {d}'.format_map({'d': date.today()})

In [None]:
# UUIDs of every input file listed in h5ad_uuids, passed to the upload as input_file_ids
in_files = [*h5ad_uuids.values()]
in_files

In [None]:
# Display the output file paths that will be uploaded
out_files

In [None]:
# Upload the output files to the study space, passing the input UUIDs as input_file_ids
hisepy.upload.upload_files(files = out_files, study_space_id = study_space_uuid,
                           title = title, input_file_ids = in_files)

In [None]:
# Show the session information (package versions) for this run
import session_info
session_info.show()