# Adapt T cell annotations

With slightly different clustering, we need to shift the original expert cluster annotations to new cluster numbers.

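We can do this using cell barcodes: each cell in a new cluster is joined to its original annotation by barcode, and the cluster is assigned the most frequent original AIFI_L3 label among its cells.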

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:
def read_adata_uuid(h5ad_uuid):
    """Read the cached .h5ad file for a file UUID.

    Looks for a directory named after the UUID under /home/jupyter/cache.
    If it does not exist, hisepy.reader.cache_files() is called first; that
    call is expected to create and populate the directory.

    Parameters
    ----------
    h5ad_uuid : str
        UUID of the file to read.

    Returns
    -------
    The object returned by sc.read_h5ad() for the cached file.

    Raises
    ------
    FileNotFoundError
        If the cache directory contains no (non-hidden) entries.
    """
    h5ad_path = os.path.join('/home/jupyter/cache', h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # Called only for its caching side effect; the return value is not needed.
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir() returns entries in arbitrary order, so sort and skip hidden
    # entries (e.g. .ipynb_checkpoints) to make the choice deterministic.
    h5ad_filenames = sorted(
        f for f in os.listdir(h5ad_path) if not f.startswith('.')
    )
    if not h5ad_filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = h5ad_path)
        )

    h5ad_file = os.path.join(h5ad_path, h5ad_filenames[0])
    adata = sc.read_h5ad(h5ad_file)
    return adata

In [3]:
def read_parquet_uuid(parquet_uuid):
    """Read the cached parquet file for a file UUID into a DataFrame.

    Looks for a directory named after the UUID under /home/jupyter/cache.
    If it does not exist, hisepy.reader.cache_files() is called first; that
    call is expected to create and populate the directory.

    Parameters
    ----------
    parquet_uuid : str
        UUID of the file to read.

    Returns
    -------
    pandas.DataFrame
        The result of pd.read_parquet() on the cached file.

    Raises
    ------
    FileNotFoundError
        If the cache directory contains no (non-hidden) entries.
    """
    pq_path = os.path.join('/home/jupyter/cache', parquet_uuid)
    if not os.path.isdir(pq_path):
        # Called only for its caching side effect; the return value is not needed.
        hisepy.reader.cache_files([parquet_uuid])

    # os.listdir() returns entries in arbitrary order, so sort and skip hidden
    # entries (e.g. .ipynb_checkpoints) to make the choice deterministic.
    pq_filenames = sorted(
        f for f in os.listdir(pq_path) if not f.startswith('.')
    )
    if not pq_filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = pq_path)
        )

    pq_file = os.path.join(pq_path, pq_filenames[0])
    df = pd.read_parquet(pq_file)
    return df

In [4]:
def join_anno(og_obs, new_obs, new_col):
    """Attach original annotations to newly clustered cells by barcode.

    Parameters
    ----------
    og_obs : pandas.DataFrame
        Original annotations; must contain a 'barcodes' column.
    new_obs : pandas.DataFrame
        New clustering results; must contain 'barcodes' and `new_col`.
    new_col : str
        Name of the new cluster column to carry over from `new_obs`.

    Returns
    -------
    pandas.DataFrame
        One row per row of `new_obs` (given unique barcodes in `og_obs`), with
        'barcodes', `new_col`, and the remaining columns of `og_obs`. Cells
        without a matching original barcode get missing values in those columns.
    """
    # Keep only the join key and the cluster call, with a fresh 0..n-1 index
    cluster_calls = new_obs.loc[:, ['barcodes', new_col]].reset_index(drop = True)

    # Left join so every newly clustered cell is retained
    return cluster_calls.merge(og_obs, on = 'barcodes', how = 'left')

In [5]:
def reduce_anno(joint_df, og_anno, new_col):
    """Assign each new cluster its most frequent original AIFI_L3 label.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        Per-cell table containing `new_col` and 'AIFI_L3' (e.g. the output of
        join_anno()).
    og_anno : pandas.DataFrame
        Lookup table with an 'AIFI_L3' column plus the other annotation
        columns to attach to each cluster.
    new_col : str
        Name of the cluster column in `joint_df`.

    Returns
    -------
    pandas.DataFrame
        One row per cluster that has at least one cell, with `new_col`, the
        selected 'AIFI_L3' label, and the matching columns from `og_anno`.
        NOTE(review): if `og_anno` holds more than one row for an AIFI_L3
        value, the left merge emits more than one row for that cluster.
    """
    def _top_label(labels):
        # Series.mode() ignores missing values and returns every tied value in
        # sorted order; take the first so the result is always one scalar.
        modes = labels.mode()
        if len(modes) == 0:
            # Cluster whose cells all lack an original label
            return None
        return modes.iloc[0]

    freq_anno = (
        joint_df
        # observed = True skips categorical cluster levels with no cells, which
        # would otherwise produce empty groups (and a pandas FutureWarning).
        .groupby(new_col, observed = True)['AIFI_L3']
        .agg(_top_label)
        .to_frame()
        .reset_index()
    )

    new_anno = freq_anno.merge(og_anno, how = 'left', on = 'AIFI_L3')

    return new_anno

Original Annotations

In [6]:
# Read the ten original annotation parts (Annotation_Part_1 ... Annotation_Part_10) in order
og_obs_list = [
    pd.read_parquet('Annotation_Part_{i}.parquet'.format(i = part))
    for part in range(1, 11)
]

In [7]:
og_obs = pd.concat(og_obs_list)

In [8]:
og_obs = og_obs[['barcodes',
                 'AIFI_L1', 'AIFI_L1_Final',
                 'AIFI_L2', 'AIFI_L2_Final',
                 'AIFI_L3', 'AIFI_L3_Final']]
og_obs = og_obs.reset_index(drop = True)

In [9]:
og_obs.shape

(1193859, 7)

In [10]:
og_anno = og_obs.drop('barcodes', axis = 1).drop_duplicates()

In [11]:
new_anno_dict = {}

## gdT cells

The gdT barcodes are used to filter the MAIT, CD8 CM, and CD8 EM cells: some TRDC+ cells from those subsets were pooled into the gdT clustering to assign gdT identities, so they must be removed from their original subsets. If we don't remove them, they will be counted twice.

New clustering

In [12]:
cell_class = 't-gd'

In [13]:
pq_uuid = 'ebedbccf-bb5f-4e9e-be6f-df6cc5fd6607'

In [14]:
new_obs = read_parquet_uuid(pq_uuid)

In [15]:
new_obs.shape

(54170, 56)

In [16]:
new_col = 'leiden_resolution_1.5_t-gd'

Join and assign to new clusters

In [17]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [18]:
joint_df['AIFI_L3'].isna().sum()

70

In [19]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-gd,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf7365ee48b611ea8957bafe6d70929e,7,NK cell,Yes,CD56dim NK cell,Yes,Adaptive NK cell,Yes
1,cf73841648b611ea8957bafe6d70929e,7,NK cell,Yes,CD56dim NK cell,Yes,Adaptive NK cell,Yes
2,cf74a06c48b611ea8957bafe6d70929e,1,T cell,Yes,gdT,Yes,GZMK+ Vd2 gdT,Yes
3,cf74f51248b611ea8957bafe6d70929e,3,T cell,Yes,gdT,Yes,GZMB+ Vd2 gdT,Yes
4,cf765bbe48b611ea8957bafe6d70929e,3,T cell,Yes,gdT,Yes,GZMB+ Vd2 gdT,Yes


In [20]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [21]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-gd,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
1,1,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
2,2,GZMB+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
3,3,GZMB+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
4,4,KLRF1+ effector Vd1 gdT,T cell,Yes,gdT,Yes,Yes
5,5,Naive Vd1 gdT,T cell,Yes,gdT,Yes,Yes
6,6,KLRF1- effector Vd1 gdT,T cell,Yes,gdT,Yes,Yes
7,7,Adaptive NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
8,8,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
9,9,GZMB+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes


In [22]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [23]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


For filtering

In [24]:
gdt_bc = joint_df['barcodes'].to_list()

In [25]:
gdt_bc[0:5]

['cf7365ee48b611ea8957bafe6d70929e',
 'cf73841648b611ea8957bafe6d70929e',
 'cf74a06c48b611ea8957bafe6d70929e',
 'cf74f51248b611ea8957bafe6d70929e',
 'cf765bbe48b611ea8957bafe6d70929e']

## Naive CD4 T cells

New clustering

In [26]:
cell_class = 't-cd4-naive'

In [27]:
pq_uuid = '979c9314-5c4f-4da2-a674-cc099cd06412'

In [28]:
new_obs = read_parquet_uuid(pq_uuid)

In [29]:
new_col = 'leiden_resolution_1.5_t-cd4-naive'

Join and assign to new clusters

In [30]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [31]:
joint_df.shape

(389874, 8)

In [32]:
joint_df['AIFI_L3'].isna().sum()

29

In [33]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [34]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-cd4-naive,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
1,1,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
2,2,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
3,3,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
4,4,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
5,5,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
6,6,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
7,7,SOX4+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
8,8,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
9,9,Naive CD8 T cell Platelet Doublets,T cell,Yes,Naive CD8 T cell Platelet Doublets,Yes,Yes


In [35]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [36]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


## MAIT cells

New clustering

In [37]:
cell_class = 't-mait'

In [38]:
pq_uuid = 'a1ecdac3-1332-4411-a135-41b94977b6c0'

In [39]:
new_obs = read_parquet_uuid(pq_uuid)

Remove gdT cells

In [40]:
# Boolean mask (as a list): True for cells whose barcode is not in the gdT barcode list
keep_bc = (~new_obs['barcodes'].isin(gdt_bc)).tolist()

In [41]:
sum(keep_bc)

48027

In [42]:
new_obs = new_obs[keep_bc]

In [43]:
new_col = 'leiden_resolution_1.5_t-mait'

Join and assign to new clusters

In [44]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [45]:
joint_df['AIFI_L3'].isna().sum()

21

In [46]:
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [47]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-mait,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf71f47048b611ea8957bafe6d70929e,0,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
1,cf7236ec48b611ea8957bafe6d70929e,2,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
2,cf75435048b611ea8957bafe6d70929e,1,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
3,cf75b11e48b611ea8957bafe6d70929e,2,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
4,cf78181e48b611ea8957bafe6d70929e,0,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes


In [48]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [49]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-mait,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
1,1,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
2,2,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
3,3,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
4,4,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
5,5,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
6,6,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
7,7,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
8,8,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
9,10,HBB+ MAIT,T cell,Yes,MAIT,Yes,Yes


In [50]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [51]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

## CD8 CM cells

New clustering

In [52]:
cell_class = 't-cd8-cm'

In [53]:
pq_uuid = 'f46939aa-424e-433d-8b0d-45ea77d7308d'

In [54]:
new_obs = read_parquet_uuid(pq_uuid)

Remove gdT cells

In [55]:
# Boolean mask (as a list): True for cells whose barcode is not in the gdT barcode list
keep_bc = (~new_obs['barcodes'].isin(gdt_bc)).tolist()

In [56]:
sum(keep_bc)

37568

In [57]:
new_obs = new_obs[keep_bc]

In [58]:
new_obs.shape

(37568, 56)

In [59]:
new_col = 'leiden_resolution_1.5_t-cd8-cm'

Join and assign to new clusters

In [60]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [61]:
joint_df['AIFI_L3'].isna().sum()

90

In [62]:
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [63]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-cd8-cm,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf73d01048b611ea8957bafe6d70929e,1,T cell,Yes,Memory CD8 T cell,Yes,GZMK+ CD27+ EM CD8 T cell,Yes
1,cf74475248b611ea8957bafe6d70929e,1,T cell,Yes,Memory CD8 T cell,Yes,CM CD8 T cell,Yes
2,cf75d27048b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD8 T cell,Yes,GZMK- CD27+ EM CD8 T cell,Yes
3,cf77109048b611ea8957bafe6d70929e,5,T cell,Yes,Memory CD8 T cell,Yes,GZMK+ CD27+ EM CD8 T cell,Yes
4,cf771b0848b611ea8957bafe6d70929e,1,T cell,Yes,Memory CD8 T cell,Yes,GZMK+ CD27+ EM CD8 T cell,Yes


In [64]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [65]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-cd8-cm,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,CD8aa,T cell,Yes,CD8aa,Yes,Yes
1,1,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
2,2,GZMK- CD27+ EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
3,3,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
4,5,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
5,6,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
6,7,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
7,8,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
8,9,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
9,10,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes


In [66]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [67]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

## CD8 EM cells

New clustering

In [68]:
cell_class = 't-cd8-em'

In [69]:
pq_uuid = '39f48474-671d-4584-a10e-e1dd47b33dac'

In [70]:
new_obs = read_parquet_uuid(pq_uuid)

Remove gdT cells

In [71]:
# Boolean mask (as a list): True for cells whose barcode is not in the gdT barcode list
keep_bc = (~new_obs['barcodes'].isin(gdt_bc)).tolist()

In [72]:
sum(keep_bc)

105726

In [73]:
new_obs = new_obs[keep_bc]

In [74]:
new_obs.shape

(105726, 56)

In [75]:
new_col = 'leiden_resolution_3_t-cd8-em'

Join and assign to new clusters

In [76]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [77]:
joint_df['AIFI_L3'].isna().sum()

61

In [78]:
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [79]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_3_t-cd8-em,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf72a88448b611ea8957bafe6d70929e,27,Contamination,Yes,C5_C12_Contamination,Yes,C5_C12_Contamination,Yes
1,cf7380e248b611ea8957bafe6d70929e,31,Contamination,Yes,C5_C12_Contamination,Yes,C5_C12_Contamination,Yes
2,cf74919e48b611ea8957bafe6d70929e,9,T cell,Yes,Memory CD8 T cell,Yes,KLRF1- GZMB+ CD27- EM CD8 T cell,Yes
3,cf76a8e448b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD8 T cell,Yes,KLRF1- GZMB+ CD27- EM CD8 T cell,Yes
4,cf7734ee48b611ea8957bafe6d70929e,31,T cell,Yes,Memory CD4 T cell,Yes,KLRF1- GZMB+ CD27- memory CD4 T cell,Yes


In [80]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [81]:
label_df

Unnamed: 0,leiden_resolution_3_t-cd8-em,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
1,1,KLRF1+ GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
2,2,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
3,3,KLRF1+ GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
4,4,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
5,5,KLRF1+ GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
6,7,KLRF1- GZMB+ CD27- memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
7,8,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
8,9,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
9,10,KLRF1- GZMB+ CD27- memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes


In [82]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [83]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

## Treg cells

New clustering

In [84]:
cell_class = 'treg'

In [85]:
pq_uuid = '8f76dce0-81ec-45c9-9612-68bddcecdd8b'

In [86]:
new_obs = read_parquet_uuid(pq_uuid)

In [87]:
new_obs.shape

(39087, 56)

In [88]:
new_col = 'leiden_resolution_1.5_treg'

Join and assign to new clusters

In [89]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [90]:
joint_df['AIFI_L3'].isna().sum()

8

In [91]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_treg,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf76fa6048b611ea8957bafe6d70929e,1,T cell,Yes,Treg,Yes,Naive CD4 Treg,Yes
1,cf7b9c8248b611ea8957bafe6d70929e,5,T cell,Yes,Treg,Yes,Memory CD4 Treg,Yes
2,cf7d735448b611ea8957bafe6d70929e,6,T cell,Yes,Treg,Yes,KLRB1+ memory CD4 Treg,Yes
3,cf84101048b611ea8957bafe6d70929e,0,T cell,Yes,Treg,Yes,Naive CD4 Treg,Yes
4,cf8b1d1a48b611ea8957bafe6d70929e,8,T cell,Yes,Treg,Yes,Naive CD4 Treg,Yes


In [92]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [93]:
label_df

Unnamed: 0,leiden_resolution_1.5_treg,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
1,1,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
2,2,Memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
3,3,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
4,4,Memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
5,5,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
6,6,KLRB1+ memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
7,7,Memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
8,8,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
9,9,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes


In [94]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [95]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


## CD8 Naive cells

New clustering

In [96]:
cell_class = 't-cd8-naive'

In [97]:
pq_uuid = '9fb15399-7867-4ea9-b4a1-bcbf2d29e0a1'

In [98]:
new_obs = read_parquet_uuid(pq_uuid)

In [99]:
new_obs.shape

(121643, 56)

In [100]:
new_col = 'leiden_resolution_1.5_t-cd8-naive'

Join and assign to new clusters

In [101]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [102]:
joint_df['AIFI_L3'].isna().sum()

22

In [103]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-cd8-naive,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf7224c248b611ea8957bafe6d70929e,4,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
1,cf7228a048b611ea8957bafe6d70929e,10,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
2,cf7232e648b611ea8957bafe6d70929e,8,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
3,cf72937648b611ea8957bafe6d70929e,2,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
4,cf72bd2448b611ea8957bafe6d70929e,1,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes


In [104]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [105]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-cd8-naive,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
1,1,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
2,2,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
3,3,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
4,4,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
5,5,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
6,6,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
7,7,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
8,8,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
9,9,SOX4+ naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes


In [106]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [107]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


## Proliferating cells

New clustering

In [108]:
cell_class = 't-proliferating'

In [109]:
pq_uuid = '5043bfef-7430-413b-af8d-411e3578dab6'

In [110]:
new_obs = read_parquet_uuid(pq_uuid)

In [111]:
new_obs.shape

(4330, 56)

In [112]:
new_col = 'leiden_resolution_1.5_t-proliferating'

Join and assign to new clusters

In [113]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [114]:
joint_df['AIFI_L3'].isna().sum()

55

In [115]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-proliferating,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf721a4048b611ea8957bafe6d70929e,14,NK cell,Yes,Proliferating NK cell,Yes,Proliferating NK cell,Yes
1,cf73ebd648b611ea8957bafe6d70929e,11,T cell,Yes,Proliferating T cell,Yes,Proliferating T cell,Yes
2,cf742e8448b611ea8957bafe6d70929e,13,Doublets,Yes,Proliferating Cells+Monocytes Doublets,Yes,Proliferating Cells+Monocytes Doublets,Yes
3,cf74d2da48b611ea8957bafe6d70929e,15,T cell,Yes,Proliferating T cell,Yes,Proliferating T cell,Yes
4,cf78947448b611ea8957bafe6d70929e,6,T cell,Yes,Proliferating T cell,Yes,Proliferating T cell,Yes


In [116]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [117]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-proliferating,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
1,1,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
2,2,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
3,3,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
4,4,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
5,5,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
6,6,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
7,7,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
8,8,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
9,9,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes


In [118]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [119]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


## ISG-high cells

New clustering

In [120]:
cell_class = 't-isg-high'

In [121]:
pq_uuid = 'c6be4029-b822-4558-9629-f33b96e227c5'

In [122]:
new_obs = read_parquet_uuid(pq_uuid)

In [123]:
new_obs.shape

(14140, 56)

In [124]:
new_col = 'leiden_resolution_2_t-isg-high'

Join and assign to new clusters

In [125]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [126]:
joint_df['AIFI_L3'].isna().sum()

1

In [127]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_2_t-isg-high,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf76c3ba48b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD4 T cell,Yes,ISG+ memory CD4 T cell,Yes
1,cf78976c48b611ea8957bafe6d70929e,1,T cell,Yes,Naive CD4 T cell,Yes,ISG+ naive CD4 T cell,Yes
2,cf7c68a648b611ea8957bafe6d70929e,15,T cell,Yes,Memory CD4 T cell,Yes,ISG+ memory CD4 T cell,Yes
3,cf7e220448b611ea8957bafe6d70929e,13,T cell,Yes,Naive CD4 T cell,Yes,ISG+ naive CD4 T cell,Yes
4,cf84c32048b611ea8957bafe6d70929e,7,T cell,Yes,Naive CD8 T cell,Yes,ISG+ naive CD8 T cell,Yes


In [128]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [129]:
label_df

Unnamed: 0,leiden_resolution_2_t-isg-high,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
1,1,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
2,2,ISG+ memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
3,3,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
4,4,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
5,5,ISG+ memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
6,6,ISG+ memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
7,7,ISG+ naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
8,8,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
9,9,ISG+ memory CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes


In [130]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [131]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


## Other T cells

New clustering

In [132]:
cell_class = 't-other'

In [133]:
pq_uuid = 'a04b6a89-81ff-43ca-b0f8-0b1939f407f1'

In [134]:
new_obs = read_parquet_uuid(pq_uuid)

In [135]:
new_obs.shape

(376762, 56)

In [136]:
new_col = 'leiden_resolution_1.5_t-other'

Join and assign to new clusters

In [137]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [138]:
joint_df['AIFI_L3'].isna().sum()

226

In [139]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-other,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf71fb7848b611ea8957bafe6d70929e,0,T cell,Yes,Memory CD4 T cell,Yes,CM CD4 T cell,Yes
1,cf7216a848b611ea8957bafe6d70929e,5,T cell,Yes,Memory CD4 T cell,Yes,GZMB- CD27+ EM CD4 T cell,Yes
2,cf7226c048b611ea8957bafe6d70929e,13,T cell,Yes,Memory CD4 T cell,Yes,CM CD4 T cell,Yes
3,cf7238b848b611ea8957bafe6d70929e,8,T cell,Yes,Memory CD4 T cell,Yes,CM CD4 T cell,Yes
4,cf72802048b611ea8957bafe6d70929e,9,T cell,Yes,Memory CD4 T cell,Yes,GZMB- CD27- EM CD4 T cell,Yes


In [140]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [141]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-other,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
1,1,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
2,2,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
3,3,GZMK+ CD27+ EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
4,4,GZMK+ CD27+ EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
5,5,GZMB- CD27+ EM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
6,6,GZMB- CD27+ EM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
7,7,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
8,8,GZMB- CD27- EM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
9,9,GZMB- CD27- EM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes


In [142]:
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

In [143]:
new_anno = new_obs[['barcodes', new_col]]
new_anno = new_anno.merge(label_df, on = new_col, how = 'left')
new_anno = new_anno.rename({new_col: 'label_cl'}, axis = 1)
new_anno['label_cl'] = new_anno['label_cl'].astype(str)
new_anno['label_cl'] = [new_col + '_cl' + x for x in new_anno['label_cl']]
new_anno_dict[cell_class] = new_anno

  np.array(self.categories._na_value).astype(dtype)


## Compare annotations

In [144]:
all_new = pd.concat(new_anno_dict)

In [145]:
new_counts = all_new['AIFI_L3'].value_counts()
new_counts

AIFI_L3
Core naive CD4 T cell                     341521
CM CD4 T cell                             161769
Core naive CD8 T cell                     115126
GZMB- CD27+ EM CD4 T cell                  70587
GZMB- CD27- EM CD4 T cell                  67332
KLRF1- GZMB+ CD27- EM CD8 T cell           65156
GZMK+ CD27+ EM CD8 T cell                  62099
CD8 MAIT                                   44119
SOX4+ naive CD4 T cell                     29411
CM CD8 T cell                              28453
Naive CD4 Treg                             21919
KLRF1+ GZMB+ CD27- EM CD8 T cell           21471
GZMK+ Vd2 gdT                              18382
GZMB+ Vd2 gdT                              17371
KLRF1- GZMB+ CD27- memory CD4 T cell       17162
Naive CD8 T cell Platelet Doublets         14199
Memory CD4 Treg                            13368
Contamination                              10233
ISG+ naive CD4 T cell                       7139
CD4 naive Platelet Doublets                 5769
CD8aa       

In [146]:
sum(new_counts)

1191327

Original type counts

In [147]:
old_counts = og_obs['AIFI_L3'].value_counts()
old_counts

AIFI_L3
Core naive CD4 T cell                     352699
CM CD4 T cell                             173578
Core naive CD8 T cell                     117237
KLRF1- GZMB+ CD27- EM CD8 T cell           70293
GZMB- CD27+ EM CD4 T cell                  62544
GZMB- CD27- EM CD4 T cell                  62119
GZMK+ CD27+ EM CD8 T cell                  60917
CD8 MAIT                                   41850
CM CD8 T cell                              25405
SOX4+ naive CD4 T cell                     22485
Naive CD4 Treg                             19578
KLRF1+ GZMB+ CD27- EM CD8 T cell           18310
GZMB+ Vd2 gdT                              18294
GZMK+ Vd2 gdT                              17826
KLRF1- GZMB+ CD27- memory CD4 T cell       17574
Memory CD4 Treg                            15661
Contamination                               8564
Naive CD8 T cell Platelet Doublets          7293
ISG+ naive CD4 T cell                       7149
GZMK- CD27+ EM CD8 T cell                   6703
SOX4+ naive 

In [148]:
sum(old_counts)

1193859

Difference in counts

In [149]:
new_counts - old_counts

AIFI_L3
Adaptive NK cell                          -2045.0
C5_C12_Contamination                       -298.0
CD4 MAIT                                  -2034.0
CD4 naive CD16 Monocytes Doublets         -2155.0
CD4 naive Erythrocytes Doublets               NaN
CD4 naive Platelet Doublets                1063.0
CD8 MAIT                                   2269.0
CD8aa                                       146.0
CM CD4 T cell                            -11809.0
CM CD8 T cell                              3048.0
Contamination                              1669.0
Core naive CD4 T cell                    -11178.0
Core naive CD8 T cell                     -2111.0
DN T cell                                   123.0
GZMB+ Vd2 gdT                              -923.0
GZMB- CD27+ EM CD4 T cell                  8043.0
GZMB- CD27- EM CD4 T cell                  5213.0
GZMK+ CD27+ EM CD8 T cell                  1182.0
GZMK+ Vd2 gdT                               556.0
GZMK+ memory CD4 Treg                     

Percent difference

In [150]:
(new_counts - old_counts) / old_counts * 100

AIFI_L3
Adaptive NK cell                         -36.060659
C5_C12_Contamination                     -13.333333
CD4 MAIT                                 -63.582369
CD4 naive CD16 Monocytes Doublets        -61.186826
CD4 naive Erythrocytes Doublets                 NaN
CD4 naive Platelet Doublets               22.588185
CD8 MAIT                                   5.421744
CD8aa                                      2.611340
CM CD4 T cell                             -6.803282
CM CD8 T cell                             11.997638
Contamination                             19.488557
Core naive CD4 T cell                     -3.169275
Core naive CD8 T cell                     -1.800626
DN T cell                                  5.525606
GZMB+ Vd2 gdT                             -5.045370
GZMB- CD27+ EM CD4 T cell                 12.859747
GZMB- CD27- EM CD4 T cell                  8.391957
GZMK+ CD27+ EM CD8 T cell                  1.940345
GZMK+ Vd2 gdT                              3.119040
GZMK

In [151]:
old_id = og_obs[['barcodes','AIFI_L3']].rename({'AIFI_L3':'old_L3'}, axis = 1)
new_id = all_new[['barcodes','AIFI_L3']].rename({'AIFI_L3':'new_L3'}, axis = 1)

In [152]:
sum(new_id['barcodes'].isin(old_id['barcodes']))

1190744

In [153]:
comp_old = old_id.merge(new_id, on = 'barcodes', how = 'left')

In [154]:
comp_old.shape

(1193859, 3)

In [155]:
sum(comp_old['old_L3'] == comp_old['new_L3'])

1047724

In [156]:
# Rows where the original and new L3 labels disagree. NaN never compares equal,
# so original cells with no new label (new_L3 is NaN after the left merge) are kept too.
diff_old = comp_old[comp_old['old_L3'] != comp_old['new_L3']]

In [157]:
diff_old.head()

Unnamed: 0,barcodes,old_L3,new_L3
2,cf7226c048b611ea8957bafe6d70929e,CM CD4 T cell,Contamination
3,cf7238b848b611ea8957bafe6d70929e,CM CD4 T cell,GZMB- CD27- EM CD4 T cell
6,cf72e17848b611ea8957bafe6d70929e,CM CD4 T cell,Core naive CD4 T cell
7,cf73232c48b611ea8957bafe6d70929e,CM CD4 T cell,Core naive CD4 T cell
13,cf7366ca48b611ea8957bafe6d70929e,GZMB- CD27- EM CD4 T cell,GZMB- CD27+ EM CD4 T cell


In [158]:
diff_old[['old_L3','new_L3']].value_counts().head(50)

old_L3                                new_L3                              
CM CD4 T cell                         GZMB- CD27- EM CD4 T cell               19418
GZMB- CD27- EM CD4 T cell             GZMB- CD27+ EM CD4 T cell                9688
Core naive CD4 T cell                 SOX4+ naive CD4 T cell                   8905
GZMB- CD27- EM CD4 T cell             CM CD4 T cell                            7329
Core naive CD4 T cell                 CM CD4 T cell                            5949
CM CD4 T cell                         GZMB- CD27+ EM CD4 T cell                5664
KLRF1- GZMB+ CD27- EM CD8 T cell      KLRF1+ GZMB+ CD27- EM CD8 T cell         5181
CM CD4 T cell                         Core naive CD4 T cell                    4158
Core naive CD8 T cell                 Naive CD8 T cell Platelet Doublets       4006
GZMB- CD27+ EM CD4 T cell             CM CD4 T cell                            3737
Core naive CD4 T cell                 Naive CD8 T cell Platelet Doublets       3401
G

In [159]:
comp_new = new_id.merge(old_id, on = 'barcodes', how = 'left')

In [160]:
comp_new.shape

(1191327, 3)

In [161]:
sum(comp_new['old_L3'] == comp_new['new_L3'])

1047724

In [162]:
# Rows where the new and original L3 labels disagree. NaN never compares equal,
# so new cells with no original annotation (old_L3 is NaN after the left merge) are kept too.
diff_new = comp_new[comp_new['old_L3'] != comp_new['new_L3']]

In [163]:
diff_new.shape

(143603, 3)

In [164]:
sum(diff_new['new_L3'].isna())

0

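Are the cells originally labeled Adaptive NK that no longer carry that label (relabeled, or absent from the new T cell annotations) present in the NK cell clustering?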

In [165]:
nk_uuid = '86d536fc-7975-4ff6-9701-ce29a5df19b5'

In [166]:
nk_obs = read_parquet_uuid(nk_uuid)

In [167]:
nk_obs.shape

(1952128, 52)

In [168]:
sum(diff_old['old_L3'] == 'Adaptive NK cell')

2444

In [169]:
missing_anks = diff_old.loc[diff_old['old_L3'] == 'Adaptive NK cell','barcodes'].tolist()

In [170]:
sum(nk_obs['barcodes'].isin(missing_anks))

2436

Yes, almost all of them are: 2,436 of the 2,444 barcodes are present in the NK cell object.

In [171]:
import session_info
session_info.show()