## Adapt B cell annotations

With slightly different clustering, we need to shift the original expert cluster annotations to new cluster numbers.

We can do this using the barcodes associated with each cluster.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:
def read_parquet_uuid(parquet_uuid):
    """
    Read the parquet file cached for a HISE file UUID into a DataFrame.

    The file is looked up under /home/jupyter/cache/<parquet_uuid>/. If that
    directory does not exist yet, hisepy.reader.cache_files() is called first
    (presumably this downloads the file into the cache -- confirm against the
    hisepy documentation).

    Parameters
    ----------
    parquet_uuid : str
        UUID of the parquet file in HISE.

    Returns
    -------
    pandas.DataFrame
        Contents of the first file (in sorted name order) found in the
        cache directory for this UUID.

    Raises
    ------
    FileNotFoundError
        If the cache directory exists but contains no files.
    """
    pq_path = '/home/jupyter/cache/{u}'.format(u = parquet_uuid)
    if not os.path.isdir(pq_path):
        # Only the caching side effect is needed; the return value is unused.
        hisepy.reader.cache_files([parquet_uuid])

    # os.listdir() returns entries in arbitrary order, so sort to make the
    # choice of file deterministic when more than one entry is present.
    pq_filenames = sorted(os.listdir(pq_path))
    if not pq_filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = pq_path)
        )

    pq_file = os.path.join(pq_path, pq_filenames[0])
    return pd.read_parquet(pq_file)

In [3]:
def join_anno(og_obs, new_obs, new_col):
    """
    Attach the original annotations to the cells of a new clustering.

    Parameters
    ----------
    og_obs : pandas.DataFrame
        Original per-cell annotations; must contain a 'barcodes' column.
    new_obs : pandas.DataFrame
        Per-cell metadata of the new clustering; must contain 'barcodes'
        and `new_col`.
    new_col : str
        Name of the cluster column in `new_obs` to carry along.

    Returns
    -------
    pandas.DataFrame
        One row per row of `new_obs` (left join on 'barcodes') with columns
        'barcodes', `new_col`, and the remaining columns of `og_obs`.
        Barcodes absent from `og_obs` get missing values in those columns.
    """
    # Keep only the key and the cluster call, on a fresh 0..n-1 index
    cluster_calls = new_obs[['barcodes', new_col]].reset_index(drop = True)

    return pd.merge(cluster_calls, og_obs, how = 'left', on = 'barcodes')

In [4]:
def reduce_anno(joint_df, og_anno, new_col, anno_col = 'AIFI_L3'):
    """
    Label each new cluster with the most frequent original annotation.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        Per-cell table containing 'barcodes', the cluster column `new_col`,
        and the annotation column `anno_col`.
    og_anno : pandas.DataFrame
        Lookup table containing `anno_col`; its other columns are added to
        the result by a left merge on `anno_col`.
    new_col : str
        Name of the cluster column to group by.
    anno_col : str, default 'AIFI_L3'
        Name of the annotation column whose most frequent value is taken.

    Returns
    -------
    pandas.DataFrame
        `new_col`, `anno_col`, and the remaining columns of `og_anno`, with
        one row per cluster when `anno_col` is unique in `og_anno`.
        Missing annotations are ignored when counting. If several labels
        tie, the first one returned by Series.mode() (sorted order) is used.
        A cluster whose annotations are all missing gets a missing label.
    """
    def most_frequent(labels):
        # Series.mode() returns *all* tied values; passing it to agg()
        # directly leaves an array in the cell for tied clusters, which
        # cannot be merged on. Reduce to a single scalar instead.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    freq_anno = (
        joint_df
        .drop('barcodes', axis = 1)
        # observed = True: when new_col is categorical, only clusters that
        # actually contain cells are reported (and no FutureWarning about
        # the changing default is raised).
        .groupby(new_col, observed = True)[anno_col]
        .agg(most_frequent)
        .to_frame()
        .reset_index()
    )

    new_anno = freq_anno.merge(og_anno, how = 'left', on = anno_col)

    return new_anno

## Read Original subclustering results from HISE

In [5]:
og_pq_uuid = 'd4d2c0f6-7d39-4461-8bc6-aa0fbb5028ce'

In [6]:
og_obs = read_parquet_uuid(og_pq_uuid)

In [7]:
# Keep only the barcode key and the three annotation levels,
# renumbering the rows from zero.
og_obs = (
    og_obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
    .reset_index(drop = True)
)

In [8]:
og_obs.shape

(178198, 4)

In [9]:
# Unique combinations of the annotation levels (lookup table for reduce_anno)
og_anno = og_obs.drop(columns = 'barcodes').drop_duplicates()

In [10]:
# Accumulates the per-cell joined tables from each subclustering; they are
# concatenated later for the old-vs-new comparison.
# NOTE: the cells that append to this list are not idempotent -- re-run this
# cell first if they are executed again, or rows will be duplicated.
joint_df_list = []

## Read Memory B subclustering results from HISE

In [11]:
cell_class = 'b-cells-mem-no-ig'

In [12]:
# HISE file UUID of the memory B cell subclustering metadata (parquet)
mem_uuid = 'ec5c5e78-65cf-4928-9e4e-d5cd20e163cc'
mem_obs = read_parquet_uuid(mem_uuid)

In [13]:
mem_obs.shape

(47886, 57)

In [14]:
mem_col = 'ms_leiden_2.5'

Join and assign to new clusters

In [15]:
mem_df = join_anno(og_obs, mem_obs, mem_col)

In [16]:
mem_df['AIFI_L3'].isna().sum()

5

In [17]:
mem_df.head()

Unnamed: 0,barcodes,ms_leiden_2.5,AIFI_L1,AIFI_L2,AIFI_L3
0,cf720af048b611ea8957bafe6d70929e,8,B cell,Memory B cell,Core memory B cell
1,cf78473048b611ea8957bafe6d70929e,5,B cell,Memory B cell,Core memory B cell
2,cf79279a48b611ea8957bafe6d70929e,8,B cell,Memory B cell,Core memory B cell
3,cf80e44e48b611ea8957bafe6d70929e,8,B cell,Memory B cell,Core memory B cell
4,cf81a58c48b611ea8957bafe6d70929e,9,B cell,Memory B cell,Early memory B cell


In [18]:
joint_df_list.append(mem_df.drop(mem_col, axis = 1))

In [19]:
mem_label_df = reduce_anno(mem_df, og_anno, mem_col)

  .groupby(new_col)['AIFI_L3']


In [20]:
mem_label_df

Unnamed: 0,ms_leiden_2.5,AIFI_L3,AIFI_L1,AIFI_L2
0,0,Core memory B cell,B cell,Memory B cell
1,1,Core memory B cell,B cell,Memory B cell
2,2,Core memory B cell,B cell,Memory B cell
3,3,Core memory B cell,B cell,Memory B cell
4,4,Core memory B cell,B cell,Memory B cell
5,5,Core memory B cell,B cell,Memory B cell
6,6,Core memory B cell,B cell,Memory B cell
7,7,Core memory B cell,B cell,Memory B cell
8,8,Core memory B cell,B cell,Memory B cell
9,9,Early memory B cell,B cell,Memory B cell


In [21]:
# Give every memory B cell the labels assigned to its new cluster
final_mem = pd.merge(
    mem_df[['barcodes', mem_col]],
    mem_label_df,
    how = 'left',
    on = mem_col
)

In [22]:
# Directory that receives the per-cluster annotation tables
out_dir = '../04-Annotation/annotations/'
# Create it if needed so that to_csv() does not fail on a fresh checkout
os.makedirs(out_dir, exist_ok = True)

# os.path.join avoids the doubled '/' that '{p}/...' produced together with
# the trailing slash of out_dir. The file name carries the cell class and
# today's date (YYYY-MM-DD).
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
mem_label_df.to_csv(out_file)

## Read B cell subclustering results from HISE

In [23]:
cell_class = 'b-cells-no-ig'

In [24]:
# HISE file UUID of the 'b-cells-no-ig' subclustering metadata (parquet)
new_uuid = '26a27d04-7858-423e-a9aa-bd1f56ca0c12'
new_obs = read_parquet_uuid(new_uuid)

In [25]:
new_obs.shape

(177994, 56)

Drop cells that were already handled in the Memory B cell subclustering above

In [26]:
# Boolean mask: True for cells whose barcode is absent from the memory B cell table
keep_obs = ~new_obs['barcodes'].isin(mem_obs['barcodes'])
new_obs = new_obs[keep_obs]
new_obs.shape

(130108, 56)

In [27]:
new_col = 'ms_leiden_2'

Join and assign to new clusters

In [28]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [29]:
joint_df['AIFI_L3'].isna().sum()

213

In [30]:
# Barcodes found in mem_obs were filtered out of new_obs earlier, so some
# cluster categories may no longer contain any cells. Drop those categories
# (presumably so the groupby in reduce_anno does not report empty clusters).
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [31]:
joint_df.head()

Unnamed: 0,barcodes,ms_leiden_2,AIFI_L1,AIFI_L2,AIFI_L3
0,cf72a14a48b611ea8957bafe6d70929e,21,T_B doublet,T_B doublet,T_B doublet
1,cf73500e48b611ea8957bafe6d70929e,0,B cell,Naive B cell,Core naive B cell
2,cf73694a48b611ea8957bafe6d70929e,11,B cell,Effector B cell,CD27+ effector B cell
3,cf73fbee48b611ea8957bafe6d70929e,1,B cell,Transitional B cell,Transitional B cell
4,cf74ae2c48b611ea8957bafe6d70929e,0,B cell,Naive B cell,Core naive B cell


In [32]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [33]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [34]:
label_df

Unnamed: 0,ms_leiden_2,AIFI_L3,AIFI_L1,AIFI_L2
0,0,Core naive B cell,B cell,Naive B cell
1,1,Transitional B cell,B cell,Transitional B cell
2,3,Core naive B cell,B cell,Naive B cell
3,6,Core naive B cell,B cell,Naive B cell
4,7,Core naive B cell,B cell,Naive B cell
5,8,Core naive B cell,B cell,Naive B cell
6,9,Core naive B cell,B cell,Naive B cell
7,10,Core naive B cell,B cell,Naive B cell
8,11,CD27+ effector B cell,B cell,Effector B cell
9,12,Core naive B cell,B cell,Naive B cell


In [35]:
# Give every remaining B cell the labels assigned to its new cluster
final_m = pd.merge(
    joint_df[['barcodes', new_col]],
    label_df,
    how = 'left',
    on = new_col
)

In [36]:
# Directory that receives the per-cluster annotation tables
out_dir = '../04-Annotation/annotations/'
# Create it if needed so that to_csv() does not fail on a fresh checkout
os.makedirs(out_dir, exist_ok = True)

# os.path.join avoids the doubled '/' that '{p}/...' produced together with
# the trailing slash of out_dir. The file name carries the cell class and
# today's date (YYYY-MM-DD).
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
label_df.to_csv(out_file)

## Compare old and new annotations

In [37]:
# Stack the per-cell joined tables collected from both subclusterings.
# The index is not reset, so index values repeat across the stacked tables.
all_new = pd.concat(joint_df_list)

In [38]:
new_counts = all_new['AIFI_L3'].value_counts()
new_counts

AIFI_L3
Core naive B cell                 86004
Core memory B cell                38281
Transitional B cell                9586
CD27+ effector B cell              7012
Erythrocyte contamination          6005
CD27- effector B cell              4355
ISG+ naive B cell                  4115
B_platelet doublet                 4049
Early memory B cell                3706
T_B doublet                        2760
Type 2 polarized memory B cell     2734
CD95 memory B cell                 2461
Plasma cell                        2166
NK_Mono doublet                    2093
B_mono doublet                     1893
Activated memory B cell             416
Dying cells                         140
Name: count, dtype: int64

In [39]:
# Number of cells in the new data with no original annotation
int(all_new['AIFI_L3'].isna().sum())

218

In [40]:
sum(new_counts)

177776

Original type counts

In [41]:
old_counts = og_obs['AIFI_L3'].value_counts()
old_counts

AIFI_L3
Core naive B cell                 86008
Core memory B cell                38284
Transitional B cell                9587
CD27+ effector B cell              7013
Erythrocyte contamination          6008
CD27- effector B cell              4355
ISG+ naive B cell                  4116
B_platelet doublet                 4055
Early memory B cell                3706
T_B doublet                        2968
Type 2 polarized memory B cell     2734
CD95 memory B cell                 2461
NK_Mono doublet                    2264
Plasma cell                        2169
B_mono doublet                     1908
Activated memory B cell             416
Dying cells                         146
Name: count, dtype: int64

In [42]:
sum(old_counts)

178198

Difference in counts

In [43]:
new_counts - old_counts

AIFI_L3
Activated memory B cell             0
B_mono doublet                    -15
B_platelet doublet                 -6
CD27+ effector B cell              -1
CD27- effector B cell               0
CD95 memory B cell                  0
Core memory B cell                 -3
Core naive B cell                  -4
Dying cells                        -6
Early memory B cell                 0
Erythrocyte contamination          -3
ISG+ naive B cell                  -1
NK_Mono doublet                  -171
Plasma cell                        -3
T_B doublet                      -208
Transitional B cell                -1
Type 2 polarized memory B cell      0
Name: count, dtype: int64

Percent difference

In [44]:
(new_counts - old_counts) / old_counts * 100

AIFI_L3
Activated memory B cell           0.000000
B_mono doublet                   -0.786164
B_platelet doublet               -0.147965
CD27+ effector B cell            -0.014259
CD27- effector B cell             0.000000
CD95 memory B cell                0.000000
Core memory B cell               -0.007836
Core naive B cell                -0.004651
Dying cells                      -4.109589
Early memory B cell               0.000000
Erythrocyte contamination        -0.049933
ISG+ naive B cell                -0.024295
NK_Mono doublet                  -7.553004
Plasma cell                      -0.138313
T_B doublet                      -7.008086
Transitional B cell              -0.010431
Type 2 polarized memory B cell    0.000000
Name: count, dtype: float64

In [45]:
# Barcode -> label tables with distinct column names so that they can be
# merged side by side.
old_id = og_obs[['barcodes', 'AIFI_L3']].rename(columns = {'AIFI_L3': 'old_L3'})
new_id = all_new[['barcodes', 'AIFI_L3']].rename(columns = {'AIFI_L3': 'new_L3'})

In [46]:
# Number of new barcodes that are also present in the original data
int(new_id['barcodes'].isin(old_id['barcodes']).sum())

177776

In [47]:
comp_old = old_id.merge(new_id, on = 'barcodes', how = 'left')

In [48]:
comp_old.shape

(178198, 3)

In [49]:
# Number of original cells whose label is unchanged
int((comp_old['old_L3'] == comp_old['new_L3']).sum())

177776

In [50]:
# Original cells whose label differs in the new data. A missing value never
# compares equal, so original barcodes with no match in the new data
# (new_L3 is NaN after the left merge) are included here as well.
diff_old = comp_old[comp_old['old_L3'] != comp_old['new_L3']]

In [51]:
diff_old.head()

Unnamed: 0,barcodes,old_L3,new_L3
15,cf79337048b611ea8957bafe6d70929e,NK_Mono doublet,
25,cf7c9a1a48b611ea8957bafe6d70929e,NK_Mono doublet,
33,cf831d7c48b611ea8957bafe6d70929e,T_B doublet,
34,cf83775e48b611ea8957bafe6d70929e,T_B doublet,
35,cf839cac48b611ea8957bafe6d70929e,NK_Mono doublet,


In [52]:
diff_old['old_L3'].value_counts()

old_L3
T_B doublet                  208
NK_Mono doublet              171
B_mono doublet                15
B_platelet doublet             6
Dying cells                    6
Core naive B cell              4
Plasma cell                    3
Erythrocyte contamination      3
Core memory B cell             3
ISG+ naive B cell              1
CD27+ effector B cell          1
Transitional B cell            1
Name: count, dtype: int64

In [53]:
comp_new = new_id.merge(old_id, on = 'barcodes', how = 'left')

In [54]:
comp_new.shape

(177994, 3)

In [55]:
# Number of new cells whose label matches the original
int((comp_new['old_L3'] == comp_new['new_L3']).sum())

177776

In [56]:
# New cells whose label differs from the original. A missing value never
# compares equal, so rows where either label is NaN (e.g. new barcodes that
# have no original annotation) are included here.
diff_new = comp_new[comp_new['old_L3'] != comp_new['new_L3']]

In [57]:
diff_new.head()

Unnamed: 0,barcodes,new_L3,old_L3
1440,b4de4300862b11ee9a02eed9ef1e2438,,
7653,f7f333c0424a11ecb1157ed7241a30b0,,
15184,a2c0eff48cfc11ebb4592ee9d1c560f8,,
34290,4b21ce26f7e511ec9d95e21fd042e627,,
46745,7a13ab6460d111ed987552e10431fc03,,


In [58]:
diff_new.shape

(218, 3)

Do any of these cells fall in a cluster that was previously assigned to a different class?

In [59]:
# Number of differing rows that have no joined annotation at all
int(diff_new['new_L3'].isna().sum())

218

In [60]:
# Combine the cluster-derived labels of both subclusterings, leaving out
# the cluster-number columns (which differ between the two tables).
final_new = pd.concat([
    final_mem.drop(columns = mem_col),
    final_m.drop(columns = new_col)
])

In [61]:
missing_new = final_new[final_new['barcodes'].isin(diff_new['barcodes'])]

In [62]:
missing_new['AIFI_L3'].value_counts()

AIFI_L3
T_B doublet                  86
NK_Mono doublet              78
B_mono doublet               21
B_platelet doublet           11
Erythrocyte contamination     9
Core naive B cell             4
Core memory B cell            3
Plasma cell                   2
Early memory B cell           1
CD95 memory B cell            1
Transitional B cell           1
ISG+ naive B cell             1
Name: count, dtype: int64

The differences are mostly due to doublets, so this is not a problem.

In [63]:
# Report the package versions used for this run, for reproducibility.
# session_info is imported here because it is only needed for this final report.
import session_info
session_info.show()