## Adapt NK cell annotations

With slightly different clustering, we need to shift the original expert cluster annotations to new cluster numbers.

We can do this using the barcodes associated with each cluster.

In [13]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [14]:
def read_obs_uuid(h5ad_uuid):
    """Return the `.obs` table of the h5ad file cached for a HISE file UUID.

    Looks for the file under `/home/jupyter/cache/<uuid>`; when that
    directory is missing, `hisepy.reader.cache_files` is called first
    (presumably it downloads the file into the cache - confirm against hisepy).

    Parameters
    ----------
    h5ad_uuid : str
        UUID of the h5ad file.

    Returns
    -------
    An in-memory copy of `adata.obs`.

    Raises
    ------
    FileNotFoundError
        If the cache directory exists but contains no files.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # Called for its caching side effect only; the return value is not needed.
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir order is arbitrary; sort so the same file is always chosen.
    h5ad_files = sorted(os.listdir(h5ad_path))
    if not h5ad_files:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = h5ad_path)
        )
    h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_files[0])

    # Backed mode keeps the file open on disk; copy obs into memory and
    # always release the handle, even if the copy fails.
    adata = sc.read_h5ad(h5ad_file, backed = 'r')
    try:
        obs = adata.obs.copy()
    finally:
        adata.file.close()
    return obs

In [15]:
def read_parquet_uuid(parquet_uuid):
    """Read the parquet file cached for a HISE file UUID into a DataFrame.

    Looks for the file under `/home/jupyter/cache/<uuid>`; when that
    directory is missing, `hisepy.reader.cache_files` is called first
    (presumably it downloads the file into the cache - confirm against hisepy).

    Parameters
    ----------
    parquet_uuid : str
        UUID of the parquet file.

    Returns
    -------
    pandas.DataFrame
        Contents of the cached parquet file.

    Raises
    ------
    FileNotFoundError
        If the cache directory exists but contains no files.
    """
    pq_path = '/home/jupyter/cache/{u}'.format(u = parquet_uuid)
    if not os.path.isdir(pq_path):
        # Called for its caching side effect only; the return value is not needed.
        hisepy.reader.cache_files([parquet_uuid])

    # os.listdir order is arbitrary; sort so the same file is always chosen.
    pq_files = sorted(os.listdir(pq_path))
    if not pq_files:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = pq_path)
        )
    pq_file = '{p}/{f}'.format(p = pq_path, f = pq_files[0])
    return pd.read_parquet(pq_file)

In [16]:
def join_anno(og_obs, new_obs, new_col):
    """Attach original annotations to the new cluster assignments by barcode.

    Takes the `barcodes` and `new_col` columns of `new_obs`, discards its
    index, and left-joins `og_obs` on `barcodes`, so every row of `new_obs`
    is kept whether or not it has a match in `og_obs`.
    """
    keep_cols = ['barcodes', new_col]
    cluster_df = new_obs.loc[:, keep_cols].reset_index(drop = True)
    return pd.merge(cluster_df, og_obs, on = 'barcodes', how = 'left')

In [17]:
def reduce_anno(joint_df, og_anno, new_col, anno_col = 'AIFI_L3'):
    """Assign each new cluster its most frequent original label.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        One row per cell, with the new cluster column `new_col` and the
        original label column `anno_col`.
    og_anno : pandas.DataFrame
        Original annotations containing `anno_col` plus the other label
        levels. May be the per-cell table (with `barcodes`) or an already
        de-duplicated table of label levels.
    new_col : str
        Name of the new cluster column to group by.
    anno_col : str, default 'AIFI_L3'
        Label column whose most frequent value is taken per cluster.

    Returns
    -------
    pandas.DataFrame
        One row per cluster: `new_col`, the chosen `anno_col` label and the
        remaining label levels joined from `og_anno`.
    """
    def _most_frequent(labels):
        # Series.mode returns every tied value (sorted); keep the first so
        # each cluster gets exactly one scalar label. Empty/all-missing
        # groups yield None instead of an empty array.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    freq_anno = (
        joint_df
        # observed = True skips unused categories of a categorical cluster column
        .groupby(new_col, observed = True)[anno_col]
        .agg(_most_frequent)
        .to_frame()
        .reset_index()
    )

    # Reduce the annotations to unique label combinations so the merge adds
    # the other levels without multiplying rows by the number of cells.
    anno_levels = (
        og_anno
        .drop(columns = ['barcodes'], errors = 'ignore')
        .drop_duplicates()
    )

    new_anno = freq_anno.merge(anno_levels, how = 'left', on = anno_col)

    return new_anno

## Read new subclustering results from HISE

In [18]:
# Cell class label; inserted into the output annotation file name further down
cell_class = 'nk-cells'

In [19]:
# HISE UUID of the parquet file holding the new subclustering results
pq_uuid = '11dea6f0-b69c-4641-a982-9e9d79a68240'
# Load it through the cache helper (fetches the file if it is not cached yet)
new_obs = read_parquet_uuid(pq_uuid)

## Read original annotations

In [20]:
# Original annotations, read from a parquet file in the working directory
og_anno = pd.read_parquet('NK_Annotation.parquet')

In [21]:
# Preview the first rows of the original annotation table
og_anno.head()

Unnamed: 0,barcodes,batch_id,cell_name,cell_uuid,chip_id,hto_barcode,hto_category,n_genes,n_mito_umis,n_reads,...,leiden,leiden_resolution_1,leiden_resolution_1.5,leiden_resolution_2,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf72d2f048b611ea8957bafe6d70929e,B001,shrewd_chocolaty_xrayfish,cf72d2f048b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1229,51,8644,...,5,5,6,4,NK cell,Yes,CD56dim NK cell,Yes,GZMK+ CD56dim NK cell,Yes
1,cf73d29048b611ea8957bafe6d70929e,B001,subocean_bold_oriole,cf73d29048b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,2005,246,18676,...,5,8,12,14,NK cell,Yes,NK+T Doublets,Yes,NK+T Doublets,Yes
2,cf73ecda48b611ea8957bafe6d70929e,B001,observant_fathomable_barracuda,cf73ecda48b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,921,37,5216,...,5,0,1,12,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
3,cf7430b448b611ea8957bafe6d70929e,B001,omniscient_slim_hoki,cf7430b448b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,2153,117,16241,...,5,2,2,5,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
4,cf754e0448b611ea8957bafe6d70929e,B001,glossy_intimate_ladybug,cf754e0448b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1497,131,9952,...,5,3,13,15,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes


Keep only the barcodes and the AIFI label columns, dropping the original leiden cluster columns so their names don't conflict with the new clustering

In [22]:
# Keep the barcode key plus the three AIFI label levels; every other column is dropped
og_anno = og_anno.loc[:, ['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]

Join original annotations to new clusters by barcodes

In [23]:
# Column of new_obs holding the new cluster assignments to annotate.
# It was used below without ever being defined, so a fresh kernel
# raised NameError here.
join_col = 'leiden_resolution_2'

# Keep only the barcode key and the new cluster column, with a clean index
new_df = new_obs[['barcodes', join_col]].copy()
new_df = new_df.reset_index(drop = True)

In [24]:
# Left join keeps every newly clustered cell, matched to its original labels by barcode
joint_df = pd.merge(new_df, og_anno, on = 'barcodes', how = 'left')

In [25]:
# Preview the joined table: new cluster and original labels per barcode
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_2,AIFI_L1,AIFI_L2,AIFI_L3
0,cf72d2f048b611ea8957bafe6d70929e,3,NK cell,CD56dim NK cell,GZMK+ CD56dim NK cell
1,cf73d29048b611ea8957bafe6d70929e,13,NK cell,NK+T Doublets,NK+T Doublets
2,cf73ecda48b611ea8957bafe6d70929e,0,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
3,cf7430b448b611ea8957bafe6d70929e,1,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
4,cf754e0448b611ea8957bafe6d70929e,18,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell


In [27]:
# Barcodes are not needed for the per-cluster summary
freq_anno = joint_df.drop(columns = ['barcodes'])

Use `pd.Series.mode` on each group of `join_col` (leiden_resolution_2) cells to get the most frequent value of AIFI_L3

In [28]:
# Most frequent original AIFI_L3 label per new cluster.
# Series.mode returns every tied value (sorted), which would put an array
# in the cell and break the later merge on AIFI_L3; keep the first mode so
# each cluster gets a single scalar label. Clusters with no labelled cells
# get None.
freq_anno = (
    freq_anno
    # observed = True skips unused categories if the cluster column is
    # categorical, and is explicit about the groupby `observed` setting
    .groupby(join_col, observed = True)['AIFI_L3']
    .agg(lambda labels: labels.mode().iloc[0] if labels.notna().any() else None)
    .to_frame()
    .reset_index()
)

  .groupby(join_col)['AIFI_L3']


In [29]:
# Preview the per-cluster most frequent AIFI_L3 label
freq_anno.head()

Unnamed: 0,leiden_resolution_2,AIFI_L3
0,0,GZMK- CD56dim NK cell
1,1,GZMK- CD56dim NK cell
2,2,GZMK- CD56dim NK cell
3,3,GZMK+ CD56dim NK cell
4,4,GZMK- CD56dim NK cell


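Select the unique combinations of L3 labels with the other levels of the hierarchy (L1 and L2) from the original annotations. This works because each L3 label is expected to belong to exactly one L2 label and one L1 label, so joining on L3 recovers the other levels unambiguously.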

In [31]:
# Unique AIFI_L1 / AIFI_L2 / AIFI_L3 combinations from the original annotations
anno_levels = (
    og_anno
    .drop('barcodes', axis = 1)
    .drop_duplicates()
)

# The join on AIFI_L3 further down assumes each L3 label has a single
# L1/L2 parent. Fail loudly if that does not hold, since the merge would
# otherwise silently duplicate cluster rows.
assert anno_levels['AIFI_L3'].is_unique, (
    'AIFI_L3 labels map to more than one AIFI_L1/AIFI_L2 combination'
)

In [34]:
# Show the full label hierarchy table
anno_levels

Unnamed: 0,AIFI_L1,AIFI_L2,AIFI_L3
0,NK cell,CD56dim NK cell,GZMK+ CD56dim NK cell
1,NK cell,NK+T Doublets,NK+T Doublets
2,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
5,NK cell,CD56dim NK cell,Adaptive NK cell
9,NK cell,CD56bright NK cell,CD56bright NK cell
11,NK cell,CD56dim NK cell,ISG+ CD56dim NK cell
33,NK cell,Proliferating NK cell,Proliferating NK cell
51,NK cell,CD56dim NK cell,CD56dim Mito+ NK
140,NK cell,NK+Platelet Doublet,NK+Platelet Doublet
157,ILC,ILC,ILC


Join the other labels to our new annotations

In [32]:
# Add the AIFI_L1 and AIFI_L2 levels matching each cluster's AIFI_L3 label
new_anno = pd.merge(freq_anno, anno_levels, on = 'AIFI_L3', how = 'left')

In [33]:
# Show the final cluster-to-label table
new_anno

Unnamed: 0,leiden_resolution_2,AIFI_L3,AIFI_L1,AIFI_L2
0,0,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
1,1,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
2,2,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
3,3,GZMK+ CD56dim NK cell,NK cell,CD56dim NK cell
4,4,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
5,5,CD56bright NK cell,NK cell,CD56bright NK cell
6,6,Adaptive NK cell,NK cell,CD56dim NK cell
7,7,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
8,8,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
9,9,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell


In [None]:
out_dir = '../04-Annotation/annotations/'
# Create the output directory if needed so the write below cannot fail on a
# missing folder.
os.makedirs(out_dir, exist_ok = True)

# File name carries the cell class and today's date. os.path.join avoids the
# doubled slash that plain string formatting produced with the trailing '/'
# in out_dir.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
new_anno.to_csv(out_file)

New type counts

In [None]:
# Count cells per AIFI_L3 label among the cells present in the new clustering.
# NOTE(review): joint_df['AIFI_L3'] holds the original per-cell labels carried
# over by the barcode join, not the cluster-level labels in new_anno - confirm
# this is the intended "new" count.
new_counts = joint_df['AIFI_L3'].value_counts()
new_counts

Original type counts

In [None]:
# Count cells per AIFI_L3 label in the original annotations.
# The original table is named og_anno in this notebook; og_df is never
# defined and raised NameError.
old_counts = og_anno['AIFI_L3'].value_counts()
old_counts

Difference in counts

In [None]:
# Align on label and treat a label missing from one side as a count of 0;
# plain subtraction would give NaN for such labels.
new_counts.sub(old_counts, fill_value = 0)

Percent difference

In [None]:
# Percent change relative to the original counts. A label missing from one
# side counts as 0 in the difference instead of producing NaN; labels absent
# from old_counts still give NaN because there is no baseline to divide by.
new_counts.sub(old_counts, fill_value = 0).div(old_counts).mul(100)

In [None]:
# Display session information for this run via the session_info package
import session_info
session_info.show()