## Adapt NK cell annotations

Because the cells were re-clustered and the clustering came out slightly different, the original expert cluster annotations need to be transferred to the new cluster numbers.

We can do this using the barcodes associated with each cluster.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:
def read_parquet_uuid(parquet_uuid, cache_dir = '/home/jupyter/cache'):
    """Read the parquet file cached for a HISE file UUID into a DataFrame.

    The file is looked up in ``<cache_dir>/<parquet_uuid>/``. If that
    directory does not exist, ``hisepy.reader.cache_files`` is called once
    for the UUID before looking again.

    Parameters
    ----------
    parquet_uuid : str
        HISE file UUID; also the name of the cache sub-directory.
    cache_dir : str, optional
        Root of the local cache. Defaults to the location that was
        previously hard-coded.
        NOTE(review): hisepy is assumed to write its downloads under this
        directory -- confirm before overriding the default.

    Returns
    -------
    pandas.DataFrame
        Contents of the first (alphabetically sorted) entry in the cache
        directory, read with ``pd.read_parquet``.

    Raises
    ------
    FileNotFoundError
        If the cache directory is still missing after the fetch attempt,
        or if it exists but contains no files.
    """
    pq_path = os.path.join(cache_dir, parquet_uuid)

    if not os.path.isdir(pq_path):
        # Not cached yet: ask HISE to fetch the file.
        hisepy.reader.cache_files([parquet_uuid])

    if not os.path.isdir(pq_path):
        raise FileNotFoundError(
            'No cache directory for {u} at {p}'.format(u = parquet_uuid, p = pq_path)
        )

    # Sort so the chosen file does not depend on filesystem listing order.
    pq_filenames = sorted(os.listdir(pq_path))
    if not pq_filenames:
        raise FileNotFoundError(
            'Cache directory for {u} is empty: {p}'.format(u = parquet_uuid, p = pq_path)
        )

    pq_file = os.path.join(pq_path, pq_filenames[0])
    return pd.read_parquet(pq_file)

In [3]:
def join_anno(og_obs, new_obs, new_col):
    """Attach the original annotations to the new per-cell cluster calls.

    Takes the 'barcodes' and `new_col` columns of `new_obs`, discards the
    incoming index, and left-joins `og_obs` on 'barcodes'. Every row of
    `new_obs` is kept; barcodes without a match in `og_obs` get missing
    values in the annotation columns.
    """
    wanted = ['barcodes', new_col]
    cluster_calls = new_obs[wanted].reset_index(drop = True)
    return pd.merge(cluster_calls, og_obs, how = 'left', on = 'barcodes')

In [4]:
def reduce_anno(joint_df, og_anno, new_col, anno_col = 'AIFI_L3'):
    """Assign each new cluster its most frequent original label.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        One row per cell, holding at least `new_col` and `anno_col`.
    og_anno : pandas.DataFrame
        Lookup table containing `anno_col` plus the other annotation
        columns to carry along; it is left-merged on `anno_col`.
    new_col : str
        Column of `joint_df` holding the new cluster assignment.
    anno_col : str, optional
        Label column used for the majority vote (default 'AIFI_L3').

    Returns
    -------
    pandas.DataFrame
        One row per observed cluster with `new_col`, the winning
        `anno_col` value, and the remaining columns of `og_anno`.

    Notes
    -----
    * Missing labels are ignored in the vote. A cluster whose labels are
      all missing gets a missing label.
    * Ties are resolved by taking the first value returned by
      ``Series.mode()``, so every cluster gets exactly one scalar label
      and the merge on `anno_col` always works.
    * ``observed = True`` restricts a categorical `new_col` to categories
      that actually occur, so unused categories produce no rows.
    """
    def _top_label(labels):
        # Reduce the (possibly multi-valued) mode to a single label.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    freq_anno = (
        joint_df
        .groupby(new_col, observed = True)[anno_col]
        .agg(_top_label)
        .to_frame()
        .reset_index()
    )

    new_anno = freq_anno.merge(og_anno, how = 'left', on = anno_col)

    return new_anno

## Read original annotations

In [5]:
og_obs = pd.read_parquet('NK_Annotation.parquet')

In [6]:
og_obs.head()

Unnamed: 0,barcodes,batch_id,cell_name,cell_uuid,chip_id,hto_barcode,hto_category,n_genes,n_mito_umis,n_reads,...,leiden,leiden_resolution_1,leiden_resolution_1.5,leiden_resolution_2,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf72d2f048b611ea8957bafe6d70929e,B001,shrewd_chocolaty_xrayfish,cf72d2f048b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1229,51,8644,...,5,5,6,4,NK cell,Yes,CD56dim NK cell,Yes,GZMK+ CD56dim NK cell,Yes
1,cf73d29048b611ea8957bafe6d70929e,B001,subocean_bold_oriole,cf73d29048b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,2005,246,18676,...,5,8,12,14,NK cell,Yes,NK+T Doublets,Yes,NK+T Doublets,Yes
2,cf73ecda48b611ea8957bafe6d70929e,B001,observant_fathomable_barracuda,cf73ecda48b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,921,37,5216,...,5,0,1,12,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
3,cf7430b448b611ea8957bafe6d70929e,B001,omniscient_slim_hoki,cf7430b448b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,2153,117,16241,...,5,2,2,5,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes
4,cf754e0448b611ea8957bafe6d70929e,B001,glossy_intimate_ladybug,cf754e0448b611ea8957bafe6d70929e,B001-P1C1,TGATGGCCTATTGGG,singlet,1497,131,9952,...,5,3,13,15,NK cell,Yes,CD56dim NK cell,Yes,GZMK- CD56dim NK cell,Yes


In [7]:
og_obs = og_obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]

In [8]:
# Lookup table of the unique (AIFI_L1, AIFI_L2, AIFI_L3) combinations; it is
# passed to reduce_anno() to fill in the other levels once a cluster's
# AIFI_L3 label has been chosen.
og_anno = og_obs.drop('barcodes', axis = 1).drop_duplicates()

In [9]:
joint_df_list = []

## Read CD56dim subclustering results from HISE

In [10]:
cell_class = 'nk-cells-dim'

In [11]:
dim_uuid = '23f26eb8-4ea7-4737-9a4d-4069254c7ff0'
dim_obs = read_parquet_uuid(dim_uuid)

In [12]:
dim_obs.shape

(105676, 56)

In [13]:
dim_col = 'leiden_resolution_1.5_nk-cells-dim'

Join and assign to new clusters

In [14]:
# Use the shared helper so the CD56dim section joins the original annotations
# exactly the same way as the NK section: keep 'barcodes' and the cluster
# column, reset the index, then left-join og_obs on 'barcodes'.
dim_df = join_anno(og_obs, dim_obs, dim_col)

In [15]:
dim_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_nk-cells-dim,AIFI_L1,AIFI_L2,AIFI_L3
0,cf73ecda48b611ea8957bafe6d70929e,2,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
1,cf7430b448b611ea8957bafe6d70929e,3,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
2,cf754e0448b611ea8957bafe6d70929e,10,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
3,cf7802de48b611ea8957bafe6d70929e,7,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell
4,cf7ab2a448b611ea8957bafe6d70929e,8,NK cell,CD56dim NK cell,GZMK- CD56dim NK cell


In [16]:
# Keep the per-cell joined annotations (without the cluster column) for the
# old-vs-new comparison. NOTE: re-running this cell appends a second copy;
# re-create joint_df_list first when re-executing.
joint_df_list.append(dim_df.drop(dim_col, axis = 1))

In [17]:
dim_label_df = reduce_anno(dim_df, og_anno, dim_col)

  .groupby(new_col)['AIFI_L3']


In [18]:
dim_label_df

Unnamed: 0,leiden_resolution_1.5_nk-cells-dim,AIFI_L3,AIFI_L1,AIFI_L2
0,0,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
1,1,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
2,2,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
3,3,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
4,4,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
5,5,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
6,6,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
7,7,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
8,8,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell
9,9,GZMK- CD56dim NK cell,NK cell,CD56dim NK cell


In [19]:
# Per-cell table: every barcode receives the labels chosen for its new cluster.
final_dim = dim_df[['barcodes', dim_col]].merge(dim_label_df, on = dim_col, how = 'left')

In [20]:
# Write the cluster -> label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
# Create the target directory if needed so the write works on a fresh checkout.
os.makedirs(out_dir, exist_ok = True)
# os.path.join avoids the doubled '/' that plain string formatting produced.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
dim_label_df.to_csv(out_file)

## Read NK subclustering results from HISE

In [21]:
cell_class = 'nk'

In [22]:
new_uuid = '11dea6f0-b69c-4641-a982-9e9d79a68240'
new_obs = read_parquet_uuid(new_uuid)

In [23]:
new_obs.shape

(160848, 55)

Drop cells that were already included in the CD56dim subclustering above

In [24]:
# Boolean mask: True for cells whose barcode is NOT in the CD56dim results.
# Vectorised negation replaces the element-by-element Python loop.
keep_obs = ~new_obs['barcodes'].isin(dim_obs['barcodes'])
new_obs = new_obs[keep_obs]
new_obs.shape

(55172, 55)

In [25]:
new_col = 'leiden_resolution_1.5'

Join and assign to new clusters

In [26]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [27]:
joint_df['AIFI_L3'].isna().sum()

1934

In [28]:
# Drop cluster categories that no longer contain any cells.
# NOTE(review): presumably needed because removing the CD56dim cells empties
# some clusters, which would otherwise show up as empty groups in
# reduce_anno() -- confirm.
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [29]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5,AIFI_L1,AIFI_L2,AIFI_L3
0,cf72d2f048b611ea8957bafe6d70929e,3,NK cell,CD56dim NK cell,GZMK+ CD56dim NK cell
1,cf73d29048b611ea8957bafe6d70929e,11,NK cell,NK+T Doublets,NK+T Doublets
2,cf7577c648b611ea8957bafe6d70929e,6,NK cell,CD56dim NK cell,Adaptive NK cell
3,cf774a9c48b611ea8957bafe6d70929e,11,NK cell,NK+T Doublets,NK+T Doublets
4,cf7a833848b611ea8957bafe6d70929e,6,NK cell,CD56dim NK cell,Adaptive NK cell


In [30]:
# Keep the per-cell joined annotations (without the cluster column) for the
# old-vs-new comparison. NOTE: re-running this cell appends a second copy;
# re-create joint_df_list first when re-executing.
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [31]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [32]:
label_df

Unnamed: 0,leiden_resolution_1.5,AIFI_L3,AIFI_L1,AIFI_L2
0,3,GZMK+ CD56dim NK cell,NK cell,CD56dim NK cell
1,6,Adaptive NK cell,NK cell,CD56dim NK cell
2,7,CD56bright NK cell,NK cell,CD56bright NK cell
3,11,NK+T Doublets,NK cell,NK+T Doublets
4,12,NK+Erythocytes Doublet,NK cell,NK+Erythocytes Doublet
5,13,NK+Platelet Doublet,NK cell,NK+Platelet Doublet
6,14,ISG+ CD56dim NK cell,NK cell,CD56dim NK cell
7,16,Proliferating NK cell,NK cell,Proliferating NK cell
8,17,ILC,ILC,ILC


In [33]:
# Per-cell table: every barcode receives the labels chosen for its new cluster.
final_nk = joint_df[['barcodes', new_col]].merge(label_df, on = new_col, how = 'left')

In [34]:
# Write the cluster -> label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
# Create the target directory if needed so the write works on a fresh checkout.
os.makedirs(out_dir, exist_ok = True)
# os.path.join avoids the doubled '/' that plain string formatting produced.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
label_df.to_csv(out_file)

## Compare old and new annotations

In [35]:
all_new = pd.concat(joint_df_list)

In [36]:
new_counts = all_new['AIFI_L3'].value_counts()
new_counts

AIFI_L3
GZMK- CD56dim NK cell     103273
CD56bright NK cell         12189
GZMK+ CD56dim NK cell      10468
Adaptive NK cell            9927
NK+Erythocytes Doublet      5547
NK+T Doublets               5239
ISG+ CD56dim NK cell        3833
CD56dim Mito+ NK            3279
NK+Platelet Doublet         3092
Proliferating NK cell        979
ILC                          876
Name: count, dtype: int64

In [37]:
sum(all_new['AIFI_L3'].isna())

2146

In [38]:
sum(new_counts)

158702

Original type counts

In [39]:
old_counts = og_obs['AIFI_L3'].value_counts()
old_counts

AIFI_L3
GZMK- CD56dim NK cell     103311
CD56bright NK cell         12209
GZMK+ CD56dim NK cell      10478
Adaptive NK cell            9946
NK+Erythocytes Doublet      5549
NK+T Doublets               5412
ISG+ CD56dim NK cell        3834
NK+Platelet Doublet         3339
CD56dim Mito+ NK            3281
Proliferating NK cell       1004
ILC                          882
Name: count, dtype: int64

In [40]:
sum(old_counts)

159245

Difference in counts

In [41]:
new_counts - old_counts

AIFI_L3
Adaptive NK cell          -19
CD56bright NK cell        -20
CD56dim Mito+ NK           -2
GZMK+ CD56dim NK cell     -10
GZMK- CD56dim NK cell     -38
ILC                        -6
ISG+ CD56dim NK cell       -1
NK+Erythocytes Doublet     -2
NK+Platelet Doublet      -247
NK+T Doublets            -173
Proliferating NK cell     -25
Name: count, dtype: int64

Percent difference

In [42]:
(new_counts - old_counts) / old_counts * 100

AIFI_L3
Adaptive NK cell         -0.191032
CD56bright NK cell       -0.163814
CD56dim Mito+ NK         -0.060957
GZMK+ CD56dim NK cell    -0.095438
GZMK- CD56dim NK cell    -0.036782
ILC                      -0.680272
ISG+ CD56dim NK cell     -0.026082
NK+Erythocytes Doublet   -0.036043
NK+Platelet Doublet      -7.397424
NK+T Doublets            -3.196600
Proliferating NK cell    -2.490040
Name: count, dtype: float64

In [43]:
# Barcode -> AIFI_L3 tables for the original and the re-joined annotations,
# with distinct label column names so they can sit side by side after a merge.
id_cols = ['barcodes', 'AIFI_L3']
old_id = og_obs[id_cols].rename(columns = {'AIFI_L3': 'old_L3'})
new_id = all_new[id_cols].rename(columns = {'AIFI_L3': 'new_L3'})

In [44]:
sum(new_id['barcodes'].isin(old_id['barcodes']))

158702

In [45]:
comp_old = old_id.merge(new_id, on = 'barcodes', how = 'left')

In [46]:
comp_old.shape

(159245, 3)

In [47]:
sum(comp_old['old_L3'] == comp_old['new_L3'])

158702

In [48]:
# Rows whose old and new labels are not equal. A missing value never compares
# equal, so originally annotated barcodes with no new_L3 are included here.
diff_old = comp_old[comp_old['old_L3'] != comp_old['new_L3']]
# To keep only cells that received a different, non-missing label, also
# filter on diff_old['new_L3'].notna().

In [49]:
diff_old.head()

Unnamed: 0,barcodes,old_L3,new_L3
37,cf9171ce48b611ea8957bafe6d70929e,GZMK- CD56dim NK cell,
54,cfa73dec48b611ea8957bafe6d70929e,NK+T Doublets,
55,cfa73edc48b611ea8957bafe6d70929e,NK+T Doublets,
647,5d0cbece48b111ea8fbed2ddb8e0a14a,NK+T Doublets,
869,46cae4b4488011eaa581ba6d6c7b9f21,GZMK- CD56dim NK cell,


In [50]:
diff_old['old_L3'].value_counts()

old_L3
NK+Platelet Doublet       247
NK+T Doublets             173
GZMK- CD56dim NK cell      38
Proliferating NK cell      25
CD56bright NK cell         20
Adaptive NK cell           19
GZMK+ CD56dim NK cell      10
ILC                         6
CD56dim Mito+ NK            2
NK+Erythocytes Doublet      2
ISG+ CD56dim NK cell        1
Name: count, dtype: int64

In [51]:
comp_new = new_id.merge(old_id, on = 'barcodes', how = 'left')

In [52]:
comp_new.shape

(160848, 3)

In [53]:
sum(comp_new['old_L3'] == comp_new['new_L3'])

158702

In [54]:
# Rows whose old and new labels are not equal. A missing value never compares
# equal, even to another missing value, so barcodes with no label on either
# side are included here.
diff_new = comp_new[comp_new['old_L3'] != comp_new['new_L3']]
# To keep only cells that received a different, non-missing label, also
# filter on diff_new['new_L3'].notna().

In [55]:
diff_new.head()

Unnamed: 0,barcodes,new_L3,old_L3
19,cf87c39a48b611ea8957bafe6d70929e,,
33,cfa7c3a248b611ea8957bafe6d70929e,,
46,cfb468dc48b611ea8957bafe6d70929e,,
338,74f681b448b611eabab7c676ab45cfca,,
489,46b30ed4488011eaa581ba6d6c7b9f21,,


In [56]:
diff_new.shape

(2146, 3)

Do these unmatched cells fall into clusters that were previously assigned to a different cell class?

In [57]:
sum(diff_new['new_L3'].isna())

2146

In [58]:
# Stack the per-cell cluster-derived labels from the CD56dim and NK
# subclusterings, dropping their (differently named) cluster columns.
final_new = pd.concat([final_dim.drop(dim_col, axis = 1),final_nk.drop(new_col, axis = 1)])

In [59]:
# Cluster-derived labels for the barcodes flagged in diff_new, i.e. the cells
# whose old and new per-cell labels did not match.
missing_new = final_new[final_new['barcodes'].isin(diff_new['barcodes'])]

In [60]:
missing_new['AIFI_L3'].value_counts()

AIFI_L3
Adaptive NK cell          1436
NK+T Doublets              242
NK+Erythocytes Doublet     156
CD56dim Mito+ NK           115
GZMK- CD56dim NK cell       97
GZMK+ CD56dim NK cell       41
ISG+ CD56dim NK cell        31
NK+Platelet Doublet         15
ILC                          6
Proliferating NK cell        5
CD56bright NK cell           2
Name: count, dtype: int64

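The originally annotated cells that are missing from the new data are mostly doublets (NK+Platelet and NK+T), while the cells without an original annotation mostly fall into the Adaptive NK cell cluster. The differences are small overall, so this is not a problem.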

In [61]:
import session_info
session_info.show()