In [1]:
%load_ext pretty_jupyter

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

from datetime import date
import hisepy
import numpy as np
import os
import pandas as pd
import re
import scanpy as sc

## Helper functions

In [3]:
def cache_uuid_path(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        hise_res = hisepy.reader.cache_files([uuid])
    filename = os.listdir(cache_path)[0]
    cache_file = '{p}/{f}'.format(p = cache_path, f = filename)
    return cache_file

In [4]:
def read_csv_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_csv(cache_file)
    return res

In [5]:
def read_parquet_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = pd.read_parquet(cache_file)
    return res

In [6]:
def read_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file)
    return res

In [7]:
def backed_adata_uuid(uuid):
    cache_file = cache_uuid_path(uuid)
    res = sc.read_h5ad(cache_file, backed = 'r')
    return res

In [8]:
def rm_cache_uuid(uuid):
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    rm_call = 'rm -r {d}'.format(d = cache_path)
    os.system(rm_call)

In [9]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

In [10]:
def filename_cell_type(filename):
    cell_type = re.sub('.+L3_','',filename)
    cell_type = re.sub('_2024.+','',cell_type)
    cell_type = re.sub('_', ' ', cell_type)
    cell_type = re.sub('pos','+', cell_type)
    cell_type = re.sub('neg','-', cell_type)
    return cell_type

In [11]:
def add_labels(adata, labels):
    obs = adata.obs
    obs = obs.reset_index(drop = True)
    obs = obs.merge(labels, on = 'barcodes', how = 'left')
    obs = obs.set_index('barcodes', drop = False)
    adata.obs = obs
    return adata

In [12]:
def element_id(n = 3):
    import periodictable
    from random import randrange
    rand_el = []
    for i in range(n):
        el = randrange(0,118)
        rand_el.append(periodictable.elements[el].name)
    rand_str = '-'.join(rand_el)
    return rand_str

## Markers for dotplots

In [13]:
broad_markers = [
    'CD3D', # T cells
    'CD3E', # T cells/NK
    'FCN1', # Monocytes/Myeloid
    'HBB', # Erythrocytes
    'MS4A1', # B cells
    'CD79A', # B cells
    'PPBP', # Platelets
    'IFI44L' # ISG-high
]

In [14]:
class_markers = [
    'AXL',     # ASDCs
    'C1orf54',
    'CCL3',    # IL1B+ Monocytes
    'CD14',    # CD14+ Monocytes and cDC2s
    'CD1C',    # cDC2
    'CD74',
    'CDKN1C',
    'CLEC9A',
    'CST3',
    'FCER1A',  # cDC2
    'FCGR3A',  # CD16+ Monocytes
    'HLA-DRA', # CD16+ Monocytes; DCs
    'IFI44L',  # ISG+ cells
    'IL1B',
    'IRF4',    # pDC
    'IRF8',    # pDC
    'ISG15',
    'LST1',
    'MX1',
    'NFKBIA',   # IL1B+ Monocytes
    'S100A9',
    'VCAN'
]

## Read previous, non-tracked labels
We ran a preliminary analysis of our dataset that wasn't tracked for reproducibility. Here, we'll retrieve the labels for visualization and comparison to check for consistency.

In [15]:
original_uuid = '3868592c-0087-4ed8-98b2-4bf1b8676111'
original_df = read_parquet_uuid(original_uuid)
original_df = original_df[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
original_df = original_df.rename({'AIFI_L1': 'original_L1', 'AIFI_L2': 'original_L2', 'AIFI_L3': 'original_L3'}, axis = 1)

In [16]:
original_df.head()

Unnamed: 0,barcodes,original_L1,original_L2,original_L3
0,05ea9806794211eb93b836d1cb6129eb,DC,cDC1,cDC1
1,e225c914794011eb9282e2ceeb91ba52,DC,cDC1,cDC1
2,b1379eae795411eb958b0245821e6993,DC,cDC1,cDC1
3,b13d3a8a795411eb958b0245821e6993,DC,cDC1,cDC1
4,b1430d16795411eb958b0245821e6993,DC,cDC1,cDC1


## Read cell type hierarchy

In [17]:
hierarchy_uuid = '1a44252c-8cab-4c8f-92c9-d8f3af633790'
hierarchy_df = read_csv_uuid(hierarchy_uuid)

## Identify files for use in HISE

In [18]:
search_id = 'neptunium-cadmium-erbium'

Retrieve files stored in our HISE project store

In [19]:
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]

Filter for files from the previous notebook using our search_id

In [20]:
search_df = ps_df[ps_df['name'].str.contains(search_id)]
search_df = search_df.sort_values('name')
search_df['AIFI_L3'] = [filename_cell_type(f) for f in search_df['name']]

In [21]:
search_df['AIFI_L3'].tolist()

['CD8 reasons',
 'CD8aa',
 'CM CD8 T cell',
 'Core naive CD8 T cell',
 'GZMK- CD27+ EM CD8 T cell',
 'GZMK+ CD27+ EM CD8 T cell',
 'ISG+ memory CD8 T cell',
 'ISG+ naive CD8 T cell',
 'KLRF1- GZMB+ CD27- EM CD8 T cell',
 'KLRF1+ GZMB+ CD27- EM CD8 T cell',
 'SOX4+ naive CD8 T cell']

# DCs

## Review ASDC

In [22]:
cell_type = 'ASDC'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

IndexError: list index out of range

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Cluster counts

In [None]:
adata.obs['leiden_2'].value_counts()

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review cDC1

In [None]:
cell_type = 'cDC1'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Cluster counts

In [None]:
adata.obs['leiden_2'].value_counts()

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review CD14+ cDC2

In [None]:
cell_type = 'CD14+ cDC2'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Cluster counts

In [None]:
adata.obs['leiden_2'].value_counts()

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review HLA-DRhi cDC2

In [None]:
cell_type = 'HLA-DRhi cDC2'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Cluster counts

In [None]:
adata.obs['leiden_2'].value_counts()

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review ISG+ cDC2

In [None]:
cell_type = 'ISG+ cDC2'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Cluster counts

In [None]:
adata.obs['leiden_2'].value_counts()

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review pDC

In [None]:
cell_type = 'pDC'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

# Monocytes

## Review Core CD14 monocyte

In [None]:
cell_type = 'Core CD14 monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review ISG+ CD14 monocyte

In [None]:
cell_type = 'ISG+ CD14 monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review IL1B+ CD14 monocyte

In [None]:
cell_type = 'IL1B+ CD14 monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review Intermediate monocyte

In [None]:
cell_type = 'Intermediate monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review Core CD16 monocyte

In [None]:
cell_type = 'Core CD16 monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review C1Q+ CD16 monocyte

In [None]:
cell_type = 'C1Q+ CD16 monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

## Review ISG+ CD16 monocyte

In [None]:
cell_type = 'ISG+ CD16 monocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]

In [None]:
type_uuid

In [None]:
adata = read_adata_uuid(type_uuid)

### Clustering

In [None]:
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')

### Marker expression

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)

In [None]:
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)

### Comparison to original labels

In [None]:
adata = add_labels(adata, original_df)

In [None]:
sc.pl.umap(adata, color = 'original_L3')

In [None]:
adata.obs['original_L3'].value_counts(dropna = False)

In [None]:
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]

In [None]:
os.system('jupyter nbconvert --to html --template pj 13e-Python_review_filtered_L3_myeloid_data.ipynb')

In [None]:
import session_info
session_info.show()