## Adapt Myeloid cell annotations

Re-running the subclustering produced slightly different clusters, so the original expert annotations need to be transferred to the new cluster numbers.

We can do this by joining the original and new results on cell barcodes, then giving each new cluster the most frequent original label among its cells.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:
def read_parquet_uuid(parquet_uuid, cache_dir = '/home/jupyter/cache'):
    """
    Read the parquet file cached for a HISE file UUID into a DataFrame.

    If no cache directory exists yet for the UUID, `hisepy.reader.cache_files`
    is called first and the directory is then read.

    Parameters
    ----------
    parquet_uuid : str
        HISE file UUID; also the name of the per-file cache sub-directory.
    cache_dir : str, optional
        Directory holding one sub-directory per cached UUID. The default is the
        path this notebook has always used.
        NOTE(review): presumably hisepy always writes to the default location,
        so a different `cache_dir` is only useful for already-cached files --
        confirm against the hisepy documentation.

    Returns
    -------
    pandas.DataFrame
        Contents of the first file (in sorted name order) in the cache directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory for this UUID contains no files.
    """
    pq_path = os.path.join(cache_dir, parquet_uuid)
    if not os.path.isdir(pq_path):
        # Called only for its side effect; the return value is not needed.
        hisepy.reader.cache_files([parquet_uuid])

    # os.listdir returns entries in arbitrary order; sort so that the file
    # picked is the same on every run and every machine.
    pq_filenames = sorted(os.listdir(pq_path))
    if not pq_filenames:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = pq_path)
        )

    pq_file = os.path.join(pq_path, pq_filenames[0])
    return pd.read_parquet(pq_file)

In [3]:
def join_anno(og_obs, new_obs, new_col):
    """
    Attach the original annotations to the new clustering, one row per new cell.

    Parameters
    ----------
    og_obs : pandas.DataFrame
        Original per-cell table; must contain a 'barcodes' column.
    new_obs : pandas.DataFrame
        New per-cell table; must contain 'barcodes' and `new_col`.
    new_col : str
        Name of the cluster column in `new_obs` to carry over.

    Returns
    -------
    pandas.DataFrame
        'barcodes' and `new_col` from `new_obs` (index reset to 0..n-1),
        left-joined to every column of `og_obs` on 'barcodes'. Barcodes that
        are absent from `og_obs` get NaN in the original columns.
    """
    cluster_calls = new_obs.loc[:, ['barcodes', new_col]].reset_index(drop = True)
    return pd.merge(cluster_calls, og_obs, how = 'left', on = 'barcodes')

In [4]:
def reduce_anno(joint_df, og_anno, new_col):
    """
    Assign each new cluster the most frequent original AIFI_L3 label among its cells.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        Per-cell table containing `new_col` and 'AIFI_L3' (as produced by
        `join_anno`).
    og_anno : pandas.DataFrame
        Lookup table of the original annotation levels; must contain 'AIFI_L3'.
        It should have one row per AIFI_L3 value, otherwise the final merge
        repeats cluster rows.
    new_col : str
        Name of the cluster column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per observed cluster: `new_col`, the chosen 'AIFI_L3', and the
        remaining columns of `og_anno` joined on 'AIFI_L3'.
    """
    def _most_frequent(labels):
        # Series.mode() ignores NaN and returns every tied value in sorted
        # order. Take the first so each cluster gets exactly one scalar label;
        # a cluster with no labelled cells gets None.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    freq_anno = (
        joint_df
        # observed = True skips unused categories of a categorical cluster
        # column (they would otherwise form empty groups) and makes the
        # behaviour explicit, which silences the pandas FutureWarning.
        .groupby(new_col, observed = True)['AIFI_L3']
        .agg(_most_frequent)
        .to_frame()
        .reset_index()
    )

    new_anno = freq_anno.merge(og_anno, how = 'left', on = 'AIFI_L3')

    return new_anno

## Read Original subclustering results from HISE

In [5]:
og_pq_uuid = '948ff860-9ec6-4b37-8040-06a2ee235859'

In [6]:
og_obs = read_parquet_uuid(og_pq_uuid)

In [7]:
# Keep only the barcode and the three annotation levels, with a fresh 0..n-1 index
og_obs = (
    og_obs
    .loc[:, ['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
    .reset_index(drop = True)
)

In [8]:
og_obs.shape

(397526, 4)

In [9]:
# Lookup table of the distinct (AIFI_L1, AIFI_L2, AIFI_L3) combinations in the original labels
og_anno = og_obs.drop(columns = 'barcodes').drop_duplicates()

In [10]:
joint_df_list = []

## Read DC subclustering results from HISE

In [11]:
cell_class = 'myeloid-dcs'

In [12]:
dc_uuid = 'c71290d2-bc00-416f-847c-ba93654f7433'
dc_obs = read_parquet_uuid(dc_uuid)

In [13]:
dc_obs.shape

(34641, 59)

In [14]:
dc_col = 'leiden_resolution_2_myeloid-dcs'

Join and assign to new clusters

In [15]:
dc_df = join_anno(og_obs, dc_obs, dc_col)

In [16]:
dc_df['AIFI_L3'].isna().sum()

23

In [17]:
dc_df.head()

Unnamed: 0,barcodes,leiden_resolution_2_myeloid-dcs,AIFI_L1,AIFI_L2,AIFI_L3
0,cf72153648b611ea8957bafe6d70929e,1,DC,pDC,pDC
1,cf7273e648b611ea8957bafe6d70929e,19,DC,cDC2,CD14+ cDC2
2,cf764a6648b611ea8957bafe6d70929e,20,Doublet,Doublet,Doublet
3,cf7ac94c48b611ea8957bafe6d70929e,8,DC,cDC2,HLA-DRhi cDC2
4,cf7f830648b611ea8957bafe6d70929e,18,Doublet,Doublet,Doublet


In [18]:
joint_df_list.append(dc_df.drop(dc_col, axis = 1))

In [19]:
dc_label_df = reduce_anno(dc_df, og_anno, dc_col)

  .groupby(new_col)['AIFI_L3']


In [20]:
dc_label_df

Unnamed: 0,leiden_resolution_2_myeloid-dcs,AIFI_L3,AIFI_L1,AIFI_L2
0,0,pDC,DC,pDC
1,1,pDC,DC,pDC
2,2,HLA-DRhi cDC2,DC,cDC2
3,3,Core CD14 monocyte,Monocyte,CD14 monocyte
4,4,pDC,DC,pDC
5,5,Core CD14 monocyte,Monocyte,CD14 monocyte
6,6,Core CD14 monocyte,Monocyte,CD14 monocyte
7,7,Core CD14 monocyte,Monocyte,CD14 monocyte
8,8,HLA-DRhi cDC2,DC,cDC2
9,9,HLA-DRhi cDC2,DC,cDC2


In [21]:
# Give every DC barcode the label chosen for its cluster
final_dc = pd.merge(
    dc_df[['barcodes', dc_col]],
    dc_label_df,
    on = dc_col,
    how = 'left'
)

In [22]:
# Write the cluster -> label lookup table for the DC subclustering.
out_dir = '../04-Annotation/annotations/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok = True)
# os.path.join avoids the doubled '/' that '{p}/...' produced with the
# trailing slash on out_dir; the file written is the same.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
dc_label_df.to_csv(out_file)

## Read Myeloid subclustering results from HISE

In [23]:
cell_class = 'myeloid'

In [24]:
new_uuid = '8522dd82-6349-458b-aeff-d9e2a20a833b'
new_obs = read_parquet_uuid(new_uuid)

In [25]:
new_obs.shape

(397356, 57)

Drop cells that were already included in the DC subclustering

In [26]:
# Boolean mask: True for barcodes that are NOT part of the DC subclustering
keep_obs = ~new_obs['barcodes'].isin(dc_obs['barcodes'])
new_obs = new_obs[keep_obs]
new_obs.shape

(362715, 57)

In [27]:
new_col = 'leiden_resolution_3'

Join and assign to new clusters

In [28]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [29]:
joint_df['AIFI_L3'].isna().sum()

728

In [30]:
# Drop cluster categories that no longer have any cells in joint_df.
# NOTE(review): presumably needed because some clusters contained only the DC
# barcodes removed above -- confirm.
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [31]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_3,AIFI_L1,AIFI_L2,AIFI_L3
0,cf71fa1048b611ea8957bafe6d70929e,1,Monocyte,CD14 monocyte,Core CD14 monocyte
1,cf71ffba48b611ea8957bafe6d70929e,19,Monocyte,CD14 monocyte,Core CD14 monocyte
2,cf721da648b611ea8957bafe6d70929e,10,Monocyte,CD14 monocyte,Core CD14 monocyte
3,cf7221e848b611ea8957bafe6d70929e,1,Monocyte,CD14 monocyte,Core CD14 monocyte
4,cf7223aa48b611ea8957bafe6d70929e,13,Monocyte,CD14 monocyte,Core CD14 monocyte


In [32]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [33]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [34]:
label_df

Unnamed: 0,leiden_resolution_3,AIFI_L3,AIFI_L1,AIFI_L2
0,0,Core CD16 monocyte,Monocyte,CD16 monocyte
1,1,Core CD14 monocyte,Monocyte,CD14 monocyte
2,2,Core CD14 monocyte,Monocyte,CD14 monocyte
3,3,Core CD14 monocyte,Monocyte,CD14 monocyte
4,4,Core CD14 monocyte,Monocyte,CD14 monocyte
5,5,Core CD14 monocyte,Monocyte,CD14 monocyte
6,6,Core CD14 monocyte,Monocyte,CD14 monocyte
7,7,Core CD16 monocyte,Monocyte,CD16 monocyte
8,8,ISG+ CD14 monocyte,Monocyte,CD14 monocyte
9,9,Core CD14 monocyte,Monocyte,CD14 monocyte


In [35]:
# Give every remaining myeloid barcode the label chosen for its cluster
final_m = pd.merge(
    joint_df[['barcodes', new_col]],
    label_df,
    on = new_col,
    how = 'left'
)

In [36]:
# Write the cluster -> label lookup table for the myeloid subclustering.
out_dir = '../04-Annotation/annotations/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok = True)
# os.path.join avoids the doubled '/' that '{p}/...' produced with the
# trailing slash on out_dir; the file written is the same.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
label_df.to_csv(out_file)

## Compare old and new annotations

In [37]:
# Combine the DC and myeloid per-cell joins directly from their source frames.
# joint_df_list holds the same two frames, but it is appended to in separate
# cells, so re-running either append cell would silently duplicate rows here
# and inflate every count below.
all_new = pd.concat([
    dc_df.drop(dc_col, axis = 1),
    joint_df.drop(new_col, axis = 1)
])

In [38]:
new_counts = all_new['AIFI_L3'].value_counts()
new_counts

AIFI_L3
Core CD14 monocyte       203192
ISG+ CD14 monocyte        52816
Doublet                   45230
Core CD16 monocyte        34481
Intermediate monocyte     18232
IL1B+ CD14 monocyte        8651
pDC                        7656
CD14+ cDC2                 6879
HLA-DRhi cDC2              6850
ISG+ CD16 monocyte         5862
C1Q+ CD16 monocyte         3922
cDC1                        948
ISG+ cDC2                   893
Doublet                     585
ASDC                        408
Name: count, dtype: int64

In [39]:
# Number of cells in the new results that have no original label
all_new['AIFI_L3'].isna().sum()

751

In [40]:
sum(new_counts)

396605

Original type counts

In [41]:
old_counts = og_obs['AIFI_L3'].value_counts()
old_counts

AIFI_L3
Core CD14 monocyte       203205
ISG+ CD14 monocyte        52817
Doublet                   46107
Core CD16 monocyte        34482
Intermediate monocyte     18233
IL1B+ CD14 monocyte        8651
pDC                        7668
CD14+ cDC2                 6880
HLA-DRhi cDC2              6850
ISG+ CD16 monocyte         5864
C1Q+ CD16 monocyte         3922
cDC1                        952
ISG+ cDC2                   893
Doublet                     590
ASDC                        412
Name: count, dtype: int64

In [42]:
sum(old_counts)

397526

Difference in counts

In [43]:
new_counts - old_counts

AIFI_L3
Core CD14 monocyte       -13
ISG+ CD14 monocyte        -1
Doublet                 -877
Core CD16 monocyte        -1
Intermediate monocyte     -1
IL1B+ CD14 monocyte        0
pDC                      -12
CD14+ cDC2                -1
HLA-DRhi cDC2              0
ISG+ CD16 monocyte        -2
C1Q+ CD16 monocyte         0
cDC1                      -4
ISG+ cDC2                  0
Doublet                   -5
ASDC                      -4
Name: count, dtype: int64

Percent difference

In [44]:
(new_counts - old_counts) / old_counts * 100

AIFI_L3
Core CD14 monocyte      -0.006397
ISG+ CD14 monocyte      -0.001893
Doublet                 -1.902097
Core CD16 monocyte      -0.002900
Intermediate monocyte   -0.005485
IL1B+ CD14 monocyte      0.000000
pDC                     -0.156495
CD14+ cDC2              -0.014535
HLA-DRhi cDC2            0.000000
ISG+ CD16 monocyte      -0.034106
C1Q+ CD16 monocyte       0.000000
cDC1                    -0.420168
ISG+ cDC2                0.000000
Doublet                 -0.847458
ASDC                    -0.970874
Name: count, dtype: float64

In [45]:
# Per-barcode label tables with distinct column names so they can be merged side by side
old_id = og_obs[['barcodes','AIFI_L3']].rename(columns = {'AIFI_L3':'old_L3'})
new_id = all_new[['barcodes','AIFI_L3']].rename(columns = {'AIFI_L3':'new_L3'})

In [46]:
# Number of new barcodes that also exist in the original results
new_id['barcodes'].isin(old_id['barcodes']).sum()

396605

In [47]:
comp_old = old_id.merge(new_id, on = 'barcodes', how = 'left')

In [48]:
comp_old.shape

(397526, 3)

In [49]:
# Number of original barcodes whose label is unchanged in the new results
(comp_old['old_L3'] == comp_old['new_L3']).sum()

396605

In [50]:
# Rows where old and new labels disagree. NaN never compares equal, so original
# barcodes with no match in the new results (new_L3 is NaN) are kept here too.
diff_old = comp_old.loc[comp_old['old_L3'].ne(comp_old['new_L3'])]

In [51]:
diff_old.head()

Unnamed: 0,barcodes,old_L3,new_L3
26,cf74aa1248b611ea8957bafe6d70929e,Doublet,
36,cf762e0a48b611ea8957bafe6d70929e,Doublet,
41,cf76907048b611ea8957bafe6d70929e,Doublet,
85,cf7d687848b611ea8957bafe6d70929e,Doublet,
132,cf85509c48b611ea8957bafe6d70929e,Doublet,


In [52]:
diff_old['old_L3'].value_counts()

old_L3
Doublet                  877
Core CD14 monocyte        13
pDC                       12
Doublet                    5
ASDC                       4
cDC1                       4
ISG+ CD16 monocyte         2
CD14+ cDC2                 1
ISG+ CD14 monocyte         1
Core CD16 monocyte         1
Intermediate monocyte      1
Name: count, dtype: int64

In [53]:
comp_new = new_id.merge(old_id, on = 'barcodes', how = 'left')

In [54]:
comp_new.shape

(397356, 3)

In [55]:
sum(comp_new['old_L3'] == comp_new['new_L3'])

396605

In [56]:
# Rows where new and old labels disagree. NaN never compares equal, so new
# barcodes with no original label (both columns NaN) are kept here too.
diff_new = comp_new.loc[comp_new['old_L3'].ne(comp_new['new_L3'])]

In [57]:
diff_new.head()

Unnamed: 0,barcodes,new_L3,old_L3
3761,0b80893c862e11eeaf3f4ac3fe582f25,,
5094,6ccd0db601e011ec97d1be2f1bab6958,,
5111,6ce3d46a01e011ec97d1be2f1bab6958,,
7615,bcdf19627abc11ebac911e12d1777b84,,
7657,b61de4ea7abd11eb9e34d2275bdfa688,,


In [58]:
diff_new.shape

(751, 3)

Do these cells fall in clusters that were previously assigned to a different class? Check which labels the unmatched barcodes receive from their new clusters.

In [59]:
sum(diff_new['new_L3'].isna())

751

In [60]:
# Stack the final per-cell labels from both subclusterings, without their cluster-id columns
final_new = pd.concat([
    final_dc.drop(columns = dc_col),
    final_m.drop(columns = new_col)
])

In [61]:
# Final labels for the barcodes that had no matching original label
missing_new = final_new.loc[final_new['barcodes'].isin(diff_new['barcodes'])]

In [62]:
missing_new['AIFI_L3'].value_counts()

AIFI_L3
Doublet                  697
Core CD14 monocyte        29
Core CD16 monocyte         8
pDC                        7
ASDC                       5
ISG+ CD14 monocyte         4
Intermediate monocyte      1
Name: count, dtype: int64

Most of the unmatched cells (697 of 751) fall in clusters labeled as Doublet, so these differences are not a concern.

In [63]:
# Record the session's package versions so the environment behind these results can be reproduced.
# Imported here, at its only point of use, instead of in the imports cell at the top.
import session_info
session_info.show()