## Adapt NK cell annotations

With slightly different clustering, we need to shift the original expert cluster annotations to new cluster numbers.

We can do this using the barcodes associated with each cluster.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:
def read_obs_uuid(h5ad_uuid):
    """Return the `.obs` table of an h5ad file cached from HISE.

    Parameters
    ----------
    h5ad_uuid : str
        HISE file UUID. It is also used as the name of the cache
        sub-directory under /home/jupyter/cache.

    Returns
    -------
    An in-memory copy of `adata.obs` for the cached file.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    # Only ask HISE to cache the file when its directory is not present yet.
    if not os.path.isdir(h5ad_path):
        hisepy.reader.cache_files([h5ad_uuid])
    # The first directory entry is used as the h5ad file
    # (assumes one file per UUID directory -- TODO confirm).
    h5ad_filename = os.listdir(h5ad_path)[0]
    h5ad_file = os.path.join(h5ad_path, h5ad_filename)
    adata = sc.read_h5ad(h5ad_file, backed = 'r')
    try:
        # Copy so the returned table does not depend on the backed object.
        obs = adata.obs.copy()
    finally:
        # Backed mode keeps the file open on disk; release the handle
        # once the metadata has been copied out.
        adata.file.close()
    return obs

In [3]:
def read_parquet_uuid(parquet_uuid):
    """Read a parquet file cached from HISE into a DataFrame.

    Parameters
    ----------
    parquet_uuid : str
        HISE file UUID. It is also used as the name of the cache
        sub-directory under /home/jupyter/cache.

    Returns
    -------
    pandas.DataFrame
        Contents of the cached parquet file.
    """
    pq_path = '/home/jupyter/cache/{u}'.format(u = parquet_uuid)
    # Only ask HISE to cache the file when its directory is not present yet.
    if not os.path.isdir(pq_path):
        hisepy.reader.cache_files([parquet_uuid])
    # The first directory entry is used as the parquet file
    # (assumes one file per UUID directory -- TODO confirm).
    pq_filename = os.listdir(pq_path)[0]
    pq_file = os.path.join(pq_path, pq_filename)
    return pd.read_parquet(pq_file)

In [4]:
def join_anno(og_obs, new_obs, new_col):
    """Attach the original per-cell columns to the new clustering by barcode.

    Parameters
    ----------
    og_obs : pandas.DataFrame
        Original per-cell table; must contain a 'barcodes' column.
    new_obs : pandas.DataFrame
        New per-cell table; must contain 'barcodes' and `new_col`.
    new_col : str
        Name of the new cluster column to keep from `new_obs`.

    Returns
    -------
    pandas.DataFrame
        One row per row of `new_obs` (left join), holding 'barcodes',
        `new_col`, and every other column of `og_obs`.
    """
    keep_cols = ['barcodes', new_col]
    # Drop the incoming index so the result is positionally indexed.
    new_subset = new_obs.loc[:, keep_cols].reset_index(drop = True)
    return pd.merge(new_subset, og_obs, how = 'left', on = 'barcodes')

In [5]:
def reduce_anno(joint_df, og_anno, new_col, label_col = 'AIFI_L3'):
    """Assign each new cluster its most frequent original label.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        Per-cell table containing `new_col` and `label_col`.
    og_anno : pandas.DataFrame
        Original annotation table containing `label_col` and the other
        annotation levels. It may also carry a cluster column named
        `new_col`; that column is ignored.
    new_col : str
        Name of the new cluster column to group by.
    label_col : str, default 'AIFI_L3'
        Label column whose most frequent value is chosen per cluster.

    Returns
    -------
    pandas.DataFrame
        One row per observed value of `new_col`, with the chosen
        `label_col` value and the matching columns from `og_anno`.
    """
    def _top_label(labels):
        # Series.mode() returns every tied value in sorted order, and nothing
        # when all labels are missing. Keep exactly one label per cluster.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    freq_anno = (
        joint_df
        # observed = True skips categories of new_col that have no cells.
        .groupby(new_col, observed = True)[label_col]
        .agg(_top_label)
        .to_frame()
        .reset_index()
    )

    # The annotation table may list one row per original cluster. Reduce it to
    # unique label rows so the merge does not duplicate new clusters or add a
    # second, suffixed cluster column.
    anno_levels = (
        og_anno
        .drop(columns = [new_col], errors = 'ignore')
        .drop_duplicates()
    )

    new_anno = freq_anno.merge(anno_levels, how = 'left', on = label_col)

    return new_anno

## Read new subclustering results from HISE

In [6]:
# Cell class label; used at the end of the notebook to build the output file name.
cell_class = 'nk-cells'

In [7]:
# HISE UUID of the parquet file with the new subclustering results.
pq_uuid = '11dea6f0-b69c-4641-a982-9e9d79a68240'
# read_parquet_uuid caches the file first if its cache directory is absent.
new_obs = read_parquet_uuid(pq_uuid)

## Read old subclustering results from HISE

In [8]:
# HISE UUID of the h5ad file with the original subclustering results.
og_h5ad_uuid = '1bf187fb-d283-430e-9d1d-0d3ae199318e'
# Only the .obs table is kept; read_obs_uuid opens the file in backed mode.
og_obs = read_obs_uuid(og_h5ad_uuid)

  utils.warn_names_duplicates("obs")


## Read original annotations

In [9]:
# Original cluster-level annotations, read from a CSV in the working directory.
# It is joined to the cells on leiden_resolution_1.5 below.
og_anno = pd.read_csv('NK_Cells_Res1.5_Annotation.csv')

In [10]:
# Keep only the barcode and original cluster assignment of each cell.
og_df = og_obs.loc[:, ['barcodes', 'leiden_resolution_1.5']].copy()

In [11]:
# Name of the cluster column in og_anno that is recast before merging.
join_col = 'leiden_resolution_1.5'

In [12]:
# Recast the annotation's cluster column to string, then category, before it is
# used as a merge key -- presumably to match the cluster column in og_obs
# (TODO confirm). Note: this modifies og_anno in place.
og_anno[join_col] = og_anno[join_col].astype('string').astype('category')

In [13]:
# Build the annotated table directly from og_obs instead of merging og_df onto
# itself, so re-running this cell gives the same result rather than stacking
# suffixed duplicate annotation columns.
og_df = (
    og_obs[['barcodes', join_col]]
    .merge(og_anno, how = 'left', on = join_col)
)

In [14]:
# Preview the original cells with their cluster-level annotations attached.
og_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf72d2f048b611ea8957bafe6d70929e,6,NK cell,Yes,CD56dim NK cell,Yes,GZMK+ CD56dim NK cell,Yes
1,cf73d29048b611ea8957bafe6d70929e,12,NK cell,Yes,NK+T Doublets,Yes,NK+T Doublets,Yes
2,cf73ecda48b611ea8957bafe6d70929e,1,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
3,cf7430b448b611ea8957bafe6d70929e,2,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
4,cf754e0448b611ea8957bafe6d70929e,13,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes


Rename the original Leiden cluster column so that its name doesn't conflict with the new clustering.

In [15]:
# The original cluster column becomes 'og_cl'; the new clustering keeps the
# leiden_resolution_1.5 name in the join below.
og_df = og_df.rename(columns = {"leiden_resolution_1.5": "og_cl"})

Join original annotations to new clusters by barcodes

In [16]:
# Barcode and new cluster assignment per cell, with a fresh positional index.
new_df = (
    new_obs[['barcodes', 'leiden_resolution_1.5']]
    .copy()
    .reset_index(drop = True)
)

In [17]:
# Left join: every cell of the new clustering is kept. Cells whose barcode is
# absent from og_df get missing annotation values.
joint_df = pd.merge(new_df, og_df, how = 'left', on = 'barcodes')

In [18]:
# Preview: new cluster (leiden_resolution_1.5) beside the original cluster (og_cl)
# and the original labels for each barcode.
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5,og_cl,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf72d2f048b611ea8957bafe6d70929e,3,6,NK cell,Yes,CD56dim NK cell,Yes,GZMK+ CD56dim NK cell,Yes
1,cf73d29048b611ea8957bafe6d70929e,11,12,NK cell,Yes,NK+T Doublets,Yes,NK+T Doublets,Yes
2,cf73ecda48b611ea8957bafe6d70929e,1,1,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
3,cf7430b448b611ea8957bafe6d70929e,0,2,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
4,cf754e0448b611ea8957bafe6d70929e,15,13,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes


In [19]:
# Per-cell identifiers are not needed for the per-cluster label summary.
freq_anno = joint_df.drop(columns = ['barcodes', 'og_cl'])

Use `pd.Series.mode` on each group of leiden_resolution_1.5 cells to get the most frequent value of AIFI_L3

In [20]:
def _most_frequent(labels):
    # Series.mode() returns every tied value in sorted order, and nothing when
    # all labels are missing. Keep exactly one label per cluster.
    modes = labels.mode()
    return modes.iloc[0] if len(modes) > 0 else None

# Group joint_df directly rather than re-assigning freq_anno from itself, so
# this cell is safe to re-run.
freq_anno = (
    joint_df
    # observed = True only summarises clusters that actually contain cells.
    .groupby('leiden_resolution_1.5', observed = True)['AIFI_L3']
    .agg(_most_frequent)
    .to_frame()
    .reset_index()
)

  .groupby('leiden_resolution_1.5')['AIFI_L3']


In [21]:
# Preview the most frequent AIFI_L3 label chosen for each new cluster.
freq_anno.head()

Unnamed: 0,leiden_resolution_1.5,AIFI_L3
0,0,GZMK- CD56dim NK cell
1,1,GZMK- CD56dim NK cell
2,2,GZMK- CD56dim NK cell
3,3,GZMK+ CD56dim NK cell
4,4,GZMK- CD56dim NK cell


Select the unique associations between L3 labels and the other levels of the hierarchy from the original annotations. This should work because each L3 label is a subset of a single L2 label and a single L1 label.

In [22]:
# One row per distinct combination of annotation levels, without the original
# cluster numbers.
anno_levels = og_anno.drop(columns = 'leiden_resolution_1.5').drop_duplicates()

Join the other labels to our new annotations

In [23]:
# Look up the remaining annotation levels for each cluster's chosen L3 label.
new_anno = pd.merge(freq_anno, anno_levels, how = 'left', on = 'AIFI_L3')

In [24]:
# Display the full per-cluster annotation table for the new clustering.
new_anno

Unnamed: 0,leiden_resolution_1.5,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
1,1,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
2,2,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
3,3,GZMK+ CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
4,4,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
5,5,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
6,6,Adaptive NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
7,7,CD56bright NK cell,NK cell,Yes,CD56bright NK cell,Yes,Yes
8,8,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
9,9,GZMK- CD56dim NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes


In [25]:
out_dir = '../04-Annotation/annotations/'
# Create the output directory if it is missing so to_csv does not fail.
os.makedirs(out_dir, exist_ok = True)
# os.path.join avoids the doubled slash that the trailing '/' in out_dir
# produced when combined with a '{p}/...' format string.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
# The DataFrame index is still written as the first column, as before.
new_anno.to_csv(out_file)

New type counts

In [26]:
# Cells per original AIFI_L3 label among the cells present in the new clustering.
# value_counts() leaves out missing labels by default, so cells without a
# barcode match in og_df are not counted here.
new_counts = joint_df['AIFI_L3'].value_counts()
new_counts

AIFI_L3
GZMK- CD56dim NK cell     103273
CD56bright NK cell         12189
GZMK+ CD56dim NK cell      10468
Adaptive NK cell            9927
NK+Erythocytes Doublet      5547
NK+T Doublets               5239
ISG+ CD56dim NK cell        3833
CD56dim Mito+ NK            3279
NK+Platelet Doublet         3092
Proliferating NK cell        979
ILC                          876
Name: count, dtype: int64

Original type counts

In [27]:
# Cells per AIFI_L3 label in the original annotated table.
old_counts = og_df['AIFI_L3'].value_counts()
old_counts

AIFI_L3
GZMK- CD56dim NK cell     103311
CD56bright NK cell         12209
GZMK+ CD56dim NK cell      10478
Adaptive NK cell            9946
NK+Erythocytes Doublet      5549
NK+T Doublets               5412
ISG+ CD56dim NK cell        3834
NK+Platelet Doublet         3339
CD56dim Mito+ NK            3281
Proliferating NK cell       1004
ILC                          882
Name: count, dtype: int64

Difference in counts

In [28]:
# The subtraction aligns the two Series on their AIFI_L3 index labels.
# A label present in only one of them would give NaN here.
new_counts - old_counts

AIFI_L3
Adaptive NK cell          -19
CD56bright NK cell        -20
CD56dim Mito+ NK           -2
GZMK+ CD56dim NK cell     -10
GZMK- CD56dim NK cell     -38
ILC                        -6
ISG+ CD56dim NK cell       -1
NK+Erythocytes Doublet     -2
NK+Platelet Doublet      -247
NK+T Doublets            -173
Proliferating NK cell     -25
Name: count, dtype: int64

Percent difference

In [29]:
# Change in cell count per label, as a percentage of the original count.
(new_counts - old_counts) / old_counts * 100

AIFI_L3
Adaptive NK cell         -0.191032
CD56bright NK cell       -0.163814
CD56dim Mito+ NK         -0.060957
GZMK+ CD56dim NK cell    -0.095438
GZMK- CD56dim NK cell    -0.036782
ILC                      -0.680272
ISG+ CD56dim NK cell     -0.026082
NK+Erythocytes Doublet   -0.036043
NK+Platelet Doublet      -7.397424
NK+T Doublets            -3.196600
Proliferating NK cell    -2.490040
Name: count, dtype: float64

In [30]:
# Report session details at the end of the notebook
# (presumably package versions, for reproducibility -- TODO confirm).
import session_info
session_info.show()