# Adapt T cell annotations

With slightly different clustering, we need to shift the original expert cluster annotations to new cluster numbers.

We do this by joining the original and new results on cell barcodes and giving each new cluster the most frequent original AIFI_L3 label among its cells.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:
def read_adata_uuid(h5ad_uuid):
    """Read an .h5ad file from the local HISE cache, downloading it if absent.

    Parameters
    ----------
    h5ad_uuid : str
        HISE file UUID. It is also the name of the cache sub-directory
        under /home/jupyter/cache/.

    Returns
    -------
    The object returned by sc.read_h5ad() for the cached file.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or empty after the download attempt.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # Only contact HISE when the file has not been cached yet.
        hisepy.reader.cache_files([h5ad_uuid])
    # os.listdir() order is arbitrary; sort so the selected file is deterministic.
    h5ad_files = sorted(os.listdir(h5ad_path))
    if not h5ad_files:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = h5ad_path)
        )
    h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_files[0])
    return sc.read_h5ad(h5ad_file)

In [3]:
def read_parquet_uuid(parquet_uuid):
    """Read a parquet file from the local HISE cache, downloading it if absent.

    Parameters
    ----------
    parquet_uuid : str
        HISE file UUID. It is also the name of the cache sub-directory
        under /home/jupyter/cache/.

    Returns
    -------
    pandas.DataFrame
        Contents of the cached parquet file.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or empty after the download attempt.
    """
    pq_path = '/home/jupyter/cache/{u}'.format(u = parquet_uuid)
    if not os.path.isdir(pq_path):
        # Only contact HISE when the file has not been cached yet.
        hisepy.reader.cache_files([parquet_uuid])
    # os.listdir() order is arbitrary; sort so the selected file is deterministic.
    pq_files = sorted(os.listdir(pq_path))
    if not pq_files:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = pq_path)
        )
    pq_file = '{p}/{f}'.format(p = pq_path, f = pq_files[0])
    return pd.read_parquet(pq_file)

In [4]:
def join_anno(og_obs, new_obs, new_col):
    """Attach the original annotations to the new cluster assignments.

    Parameters
    ----------
    og_obs : pandas.DataFrame
        Original per-cell annotations; must contain a 'barcodes' column.
    new_obs : pandas.DataFrame
        New per-cell metadata; must contain 'barcodes' and `new_col`.
    new_col : str
        Name of the new cluster column in `new_obs`.

    Returns
    -------
    pandas.DataFrame
        One row per row of `new_obs` (left join on 'barcodes') with a fresh
        0..n-1 index; cells absent from `og_obs` get NaN annotations.
    """
    keep_cols = ['barcodes', new_col]
    cluster_df = new_obs[keep_cols].reset_index(drop = True)
    return cluster_df.merge(og_obs, on = 'barcodes', how = 'left')

In [5]:
def reduce_anno(joint_df, og_anno, new_col):
    """Assign each new cluster its most frequent original AIFI_L3 label.

    Parameters
    ----------
    joint_df : pandas.DataFrame
        Per-cell table containing `new_col` and 'AIFI_L3'.
    og_anno : pandas.DataFrame
        Lookup table with an 'AIFI_L3' column plus any other annotation
        columns to carry over.
    new_col : str
        Name of the new cluster column in `joint_df`.

    Returns
    -------
    pandas.DataFrame
        One row per observed cluster (more if `og_anno` holds several rows
        for the chosen label), with `new_col`, 'AIFI_L3' and the remaining
        `og_anno` columns.
    """
    def _top_label(labels):
        # Series.mode() ignores NaN and returns every tied value in sorted
        # order; keep exactly one so the merge key below is always a scalar.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    freq_anno = (
        joint_df
        # observed = True skips categorical levels with no cells left and
        # avoids the pandas FutureWarning about the changing default.
        .groupby(new_col, observed = True)['AIFI_L3']
        .agg(_top_label)
        .to_frame()
        .reset_index()
    )

    new_anno = freq_anno.merge(og_anno, how = 'left', on = 'AIFI_L3')

    return new_anno

Original Annotations

In [6]:
# Read the ten original annotation parts (Annotation_Part_1 .. _10), in order.
og_obs_list = [
    pd.read_parquet('Annotation_Part_{i}.parquet'.format(i = part))
    for part in range(1, 11)
]

In [7]:
og_obs = pd.concat(og_obs_list)

In [8]:
# Keep only the barcode key and the AIFI_L1-L3 labels with their *_Final
# columns, then renumber rows 0..n-1.
anno_cols = [
    'barcodes',
    'AIFI_L1', 'AIFI_L1_Final',
    'AIFI_L2', 'AIFI_L2_Final',
    'AIFI_L3', 'AIFI_L3_Final',
]
og_obs = og_obs.loc[:, anno_cols].reset_index(drop = True)

In [9]:
og_obs.shape

(1193859, 7)

In [10]:
og_anno = og_obs.drop('barcodes', axis = 1).drop_duplicates()

In [11]:
joint_df_list = []

## gdT cells

gdT cells need to be used to filter MAIT, CD8 CM, and CD8 EM cells because we pool some TRDC+ cells from these subsets to assign identities to the gdTs. If we don't remove them, we'll double-count them.

New clustering

In [12]:
cell_class = 't-gd'

In [13]:
pq_uuid = '1577b86b-f2cd-46fa-b859-27317fafd74f'

In [14]:
new_obs = read_parquet_uuid(pq_uuid)

In [15]:
new_obs.shape

(54075, 56)

In [16]:
new_col = 'leiden_resolution_1.5_t-gd'

Join and assign to new clusters

In [17]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [18]:
joint_df['AIFI_L3'].isna().sum()

70

In [19]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-gd,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf7365ee48b611ea8957bafe6d70929e,8,NK cell,Yes,CD56dim NK cell,Yes,Adaptive NK cell,Yes
1,cf73841648b611ea8957bafe6d70929e,8,NK cell,Yes,CD56dim NK cell,Yes,Adaptive NK cell,Yes
2,cf74a06c48b611ea8957bafe6d70929e,1,T cell,Yes,gdT,Yes,GZMK+ Vd2 gdT,Yes
3,cf74f51248b611ea8957bafe6d70929e,3,T cell,Yes,gdT,Yes,GZMB+ Vd2 gdT,Yes
4,cf765bbe48b611ea8957bafe6d70929e,3,T cell,Yes,gdT,Yes,GZMB+ Vd2 gdT,Yes


In [20]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [21]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [22]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-gd,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
1,1,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
2,2,GZMB+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
3,3,GZMB+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
4,4,KLRF1+ effector Vd1 gdT,T cell,Yes,gdT,Yes,Yes
5,5,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
6,6,KLRF1- effector Vd1 gdT,T cell,Yes,gdT,Yes,Yes
7,7,Naive Vd1 gdT,T cell,Yes,gdT,Yes,Yes
8,8,Adaptive NK cell,NK cell,Yes,CD56dim NK cell,Yes,Yes
9,9,GZMB+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes


In [23]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
# The prefix is 'pbmc_ref_' to match every other section of this notebook
# (this cell previously wrote 'ref_pbmc_').
# NOTE(review): confirm no downstream step still expects the old 'ref_pbmc_' name.
out_dir = '../04-Annotation/annotations/'
out_file = '{p}/pbmc_ref_{c}_annotations_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
label_df.to_csv(out_file)

For filtering

In [24]:
gdt_bc = joint_df['barcodes'].to_list()

In [25]:
gdt_bc[0:5]

['cf7365ee48b611ea8957bafe6d70929e',
 'cf73841648b611ea8957bafe6d70929e',
 'cf74a06c48b611ea8957bafe6d70929e',
 'cf74f51248b611ea8957bafe6d70929e',
 'cf765bbe48b611ea8957bafe6d70929e']

## Naive CD4 T cells

New clustering

In [26]:
cell_class = 't-cd4-naive'

In [27]:
pq_uuid = '979c9314-5c4f-4da2-a674-cc099cd06412'

In [28]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: 979c9314-5c4f-4da2-a674-cc099cd06412
Files have been successfully downloaded!


In [29]:
new_col = 'leiden_resolution_1.5_t-cd4-naive'

Join and assign to new clusters

In [30]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [31]:
joint_df.shape

(389874, 8)

In [32]:
joint_df['AIFI_L3'].isna().sum()

29

In [33]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [34]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [35]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-cd4-naive,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
1,1,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
2,2,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
3,3,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
4,4,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
5,5,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
6,6,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
7,7,SOX4+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
8,8,Core naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
9,9,Naive CD8 T cell Platelet Doublets,T cell,Yes,Naive CD8 T cell Platelet Doublets,Yes,Yes


In [36]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## MAIT cells

New clustering

In [37]:
cell_class = 't-mait'

In [38]:
pq_uuid = 'a1ecdac3-1332-4411-a135-41b94977b6c0'

In [39]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: a1ecdac3-1332-4411-a135-41b94977b6c0
Files have been successfully downloaded!


Remove gdT cells

In [40]:
# True for cells that were NOT already taken by the gdT clustering.
keep_bc = (~new_obs['barcodes'].isin(gdt_bc)).tolist()

In [41]:
sum(keep_bc)

48027

In [42]:
new_obs = new_obs[keep_bc]

In [43]:
new_col = 'leiden_resolution_1.5_t-mait'

Join and assign to new clusters

In [44]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [45]:
joint_df['AIFI_L3'].isna().sum()

21

In [46]:
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [47]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-mait,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf71f47048b611ea8957bafe6d70929e,0,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
1,cf7236ec48b611ea8957bafe6d70929e,2,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
2,cf75435048b611ea8957bafe6d70929e,1,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
3,cf75b11e48b611ea8957bafe6d70929e,2,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes
4,cf78181e48b611ea8957bafe6d70929e,0,T cell,Yes,MAIT,Yes,CD8 MAIT,Yes


In [48]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [49]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [50]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-mait,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
1,1,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
2,2,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
3,3,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
4,4,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
5,5,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
6,6,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
7,7,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
8,8,CD8 MAIT,T cell,Yes,MAIT,Yes,Yes
9,10,HBB+ MAIT,T cell,Yes,MAIT,Yes,Yes


In [51]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## CD8 CM cells

New clustering

In [52]:
cell_class = 't-cd8-cm'

In [53]:
pq_uuid = 'f46939aa-424e-433d-8b0d-45ea77d7308d'

In [54]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: f46939aa-424e-433d-8b0d-45ea77d7308d
Files have been successfully downloaded!


Remove gdT cells

In [55]:
# True for cells that were NOT already taken by the gdT clustering.
keep_bc = (~new_obs['barcodes'].isin(gdt_bc)).tolist()

In [56]:
sum(keep_bc)

37568

In [57]:
new_obs = new_obs[keep_bc]

In [58]:
new_obs.shape

(37568, 56)

In [59]:
new_col = 'leiden_resolution_1.5_t-cd8-cm'

Join and assign to new clusters

In [60]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [61]:
joint_df['AIFI_L3'].isna().sum()

90

In [62]:
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [63]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-cd8-cm,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf73d01048b611ea8957bafe6d70929e,1,T cell,Yes,Memory CD8 T cell,Yes,GZMK+ CD27+ EM CD8 T cell,Yes
1,cf74475248b611ea8957bafe6d70929e,1,T cell,Yes,Memory CD8 T cell,Yes,CM CD8 T cell,Yes
2,cf75d27048b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD8 T cell,Yes,GZMK- CD27+ EM CD8 T cell,Yes
3,cf77109048b611ea8957bafe6d70929e,5,T cell,Yes,Memory CD8 T cell,Yes,GZMK+ CD27+ EM CD8 T cell,Yes
4,cf771b0848b611ea8957bafe6d70929e,1,T cell,Yes,Memory CD8 T cell,Yes,GZMK+ CD27+ EM CD8 T cell,Yes


In [64]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [65]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [66]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-cd8-cm,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,CD8aa,T cell,Yes,CD8aa,Yes,Yes
1,1,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
2,2,GZMK- CD27+ EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
3,3,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
4,5,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
5,6,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
6,7,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
7,8,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
8,9,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
9,10,CM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes


In [67]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## CD8 EM cells

New clustering

In [68]:
cell_class = 't-cd8-em'

In [69]:
pq_uuid = '39f48474-671d-4584-a10e-e1dd47b33dac'

In [70]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: 39f48474-671d-4584-a10e-e1dd47b33dac
Files have been successfully downloaded!


Remove gdT cells

In [71]:
# True for cells that were NOT already taken by the gdT clustering.
keep_bc = (~new_obs['barcodes'].isin(gdt_bc)).tolist()

In [72]:
sum(keep_bc)

105716

In [73]:
new_obs = new_obs[keep_bc]

In [74]:
new_obs.shape

(105716, 56)

In [75]:
new_col = 'leiden_resolution_3_t-cd8-em'

Join and assign to new clusters

In [76]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [77]:
joint_df['AIFI_L3'].isna().sum()

61

In [78]:
joint_df[new_col] = joint_df[new_col].cat.remove_unused_categories()

In [79]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_3_t-cd8-em,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf72a88448b611ea8957bafe6d70929e,27,Contamination,Yes,C5_C12_Contamination,Yes,C5_C12_Contamination,Yes
1,cf7380e248b611ea8957bafe6d70929e,31,Contamination,Yes,C5_C12_Contamination,Yes,C5_C12_Contamination,Yes
2,cf74919e48b611ea8957bafe6d70929e,9,T cell,Yes,Memory CD8 T cell,Yes,KLRF1- GZMB+ CD27- EM CD8 T cell,Yes
3,cf76a8e448b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD8 T cell,Yes,KLRF1- GZMB+ CD27- EM CD8 T cell,Yes
4,cf7734ee48b611ea8957bafe6d70929e,31,T cell,Yes,Memory CD4 T cell,Yes,KLRF1- GZMB+ CD27- memory CD4 T cell,Yes


In [80]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [81]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [82]:
label_df

Unnamed: 0,leiden_resolution_3_t-cd8-em,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
1,1,KLRF1+ GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
2,2,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
3,3,KLRF1+ GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
4,4,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
5,5,KLRF1+ GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
6,7,KLRF1- GZMB+ CD27- memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
7,8,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
8,9,KLRF1- GZMB+ CD27- EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
9,10,KLRF1- GZMB+ CD27- memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes


In [83]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## Treg cells

New clustering

In [84]:
cell_class = 'treg'

In [85]:
pq_uuid = '8f76dce0-81ec-45c9-9612-68bddcecdd8b'

In [86]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: 8f76dce0-81ec-45c9-9612-68bddcecdd8b
Files have been successfully downloaded!


In [87]:
new_obs.shape

(39087, 56)

In [88]:
new_col = 'leiden_resolution_1.5_treg'

Join and assign to new clusters

In [89]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [90]:
joint_df['AIFI_L3'].isna().sum()

8

In [91]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_treg,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf76fa6048b611ea8957bafe6d70929e,1,T cell,Yes,Treg,Yes,Naive CD4 Treg,Yes
1,cf7b9c8248b611ea8957bafe6d70929e,5,T cell,Yes,Treg,Yes,Memory CD4 Treg,Yes
2,cf7d735448b611ea8957bafe6d70929e,6,T cell,Yes,Treg,Yes,KLRB1+ memory CD4 Treg,Yes
3,cf84101048b611ea8957bafe6d70929e,0,T cell,Yes,Treg,Yes,Naive CD4 Treg,Yes
4,cf8b1d1a48b611ea8957bafe6d70929e,8,T cell,Yes,Treg,Yes,Naive CD4 Treg,Yes


In [92]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [93]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [94]:
label_df

Unnamed: 0,leiden_resolution_1.5_treg,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
1,1,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
2,2,Memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
3,3,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
4,4,Memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
5,5,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
6,6,KLRB1+ memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
7,7,Memory CD4 Treg,T cell,Yes,Treg,Yes,Yes
8,8,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes
9,9,Naive CD4 Treg,T cell,Yes,Treg,Yes,Yes


In [95]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## CD8 Naive cells

New clustering

In [96]:
cell_class = 't-cd8-naive'

In [97]:
pq_uuid = '9fb15399-7867-4ea9-b4a1-bcbf2d29e0a1'

In [98]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: 9fb15399-7867-4ea9-b4a1-bcbf2d29e0a1
Files have been successfully downloaded!


In [99]:
new_obs.shape

(121643, 56)

In [100]:
new_col = 'leiden_resolution_1.5_t-cd8-naive'

Join and assign to new clusters

In [101]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [102]:
joint_df['AIFI_L3'].isna().sum()

22

In [103]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-cd8-naive,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf7224c248b611ea8957bafe6d70929e,4,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
1,cf7228a048b611ea8957bafe6d70929e,10,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
2,cf7232e648b611ea8957bafe6d70929e,8,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
3,cf72937648b611ea8957bafe6d70929e,2,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes
4,cf72bd2448b611ea8957bafe6d70929e,1,T cell,Yes,Naive CD8 T cell,Yes,Core naive CD8 T cell,Yes


In [104]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [105]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [106]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-cd8-naive,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
1,1,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
2,2,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
3,3,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
4,4,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
5,5,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
6,6,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
7,7,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
8,8,Core naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
9,9,SOX4+ naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes


In [107]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## Proliferating cells

New clustering

In [108]:
cell_class = 't-proliferating'

In [111]:
pq_uuid = '5043bfef-7430-413b-af8d-411e3578dab6'

In [112]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: 5043bfef-7430-413b-af8d-411e3578dab6
Files have been successfully downloaded!


In [113]:
new_obs.shape

(4330, 56)

In [114]:
new_col = 'leiden_resolution_1.5_t-proliferating'

Join and assign to new clusters

In [115]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [116]:
joint_df['AIFI_L3'].isna().sum()

55

In [117]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5_t-proliferating,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf721a4048b611ea8957bafe6d70929e,14,NK cell,Yes,Proliferating NK cell,Yes,Proliferating NK cell,Yes
1,cf73ebd648b611ea8957bafe6d70929e,11,T cell,Yes,Proliferating T cell,Yes,Proliferating T cell,Yes
2,cf742e8448b611ea8957bafe6d70929e,13,Doublets,Yes,Proliferating Cells+Monocytes Doublets,Yes,Proliferating Cells+Monocytes Doublets,Yes
3,cf74d2da48b611ea8957bafe6d70929e,15,T cell,Yes,Proliferating T cell,Yes,Proliferating T cell,Yes
4,cf78947448b611ea8957bafe6d70929e,6,T cell,Yes,Proliferating T cell,Yes,Proliferating T cell,Yes


In [118]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [119]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [120]:
label_df

Unnamed: 0,leiden_resolution_1.5_t-proliferating,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
1,1,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
2,2,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
3,3,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
4,4,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes
5,5,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
6,6,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
7,7,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
8,8,Proliferating T cell,T cell,Yes,Proliferating T cell,Yes,Yes
9,9,Proliferating NK cell,NK cell,Yes,Proliferating NK cell,Yes,Yes


In [121]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## ISG-high cells

New clustering

In [122]:
cell_class = 't-isg-high'

In [123]:
pq_uuid = 'c6be4029-b822-4558-9629-f33b96e227c5'

In [124]:
new_obs = read_parquet_uuid(pq_uuid)

downloading fileID: c6be4029-b822-4558-9629-f33b96e227c5
Files have been successfully downloaded!


In [125]:
new_obs.shape

(14140, 56)

In [126]:
new_col = 'leiden_resolution_2_t-isg-high'

Join and assign to new clusters

In [127]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [128]:
joint_df['AIFI_L3'].isna().sum()

1

In [129]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_2_t-isg-high,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf76c3ba48b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD4 T cell,Yes,ISG+ memory CD4 T cell,Yes
1,cf78976c48b611ea8957bafe6d70929e,1,T cell,Yes,Naive CD4 T cell,Yes,ISG+ naive CD4 T cell,Yes
2,cf7c68a648b611ea8957bafe6d70929e,15,T cell,Yes,Memory CD4 T cell,Yes,ISG+ memory CD4 T cell,Yes
3,cf7e220448b611ea8957bafe6d70929e,13,T cell,Yes,Naive CD4 T cell,Yes,ISG+ naive CD4 T cell,Yes
4,cf84c32048b611ea8957bafe6d70929e,7,T cell,Yes,Naive CD8 T cell,Yes,ISG+ naive CD8 T cell,Yes


In [130]:
joint_df_list.append(joint_df.drop(new_col, axis = 1))

In [131]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [132]:
label_df

Unnamed: 0,leiden_resolution_2_t-isg-high,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
1,1,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
2,2,ISG+ memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
3,3,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
4,4,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
5,5,ISG+ memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
6,6,ISG+ memory CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
7,7,ISG+ naive CD8 T cell,T cell,Yes,Naive CD8 T cell,Yes,Yes
8,8,ISG+ naive CD4 T cell,T cell,Yes,Naive CD4 T cell,Yes,Yes
9,9,ISG+ memory CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes


In [133]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

## Other T cells

New clustering

In [134]:
cell_class = 't-cell'

In [137]:
pq_uuid = '4f858a12-d709-485b-b833-5c7248b61ebb'

In [138]:
# Load with the UUID assigned to `pq_uuid` just above; `h5ad_uuid` is never
# defined in this notebook, so the old call fails on a fresh kernel.
# NOTE(review): confirm this UUID yields the (1191327, 55) table shown below.
new_obs = read_parquet_uuid(pq_uuid)

In [139]:
new_obs.shape

(1191327, 55)

In [140]:
new_col = 'leiden_resolution_1.5'

Remove previously assigned cells

In [141]:
prev = pd.concat(joint_df_list)

In [142]:
prev.shape

(814460, 7)

In [143]:
# True for cells that were NOT assigned in any of the earlier sections.
keep_barcodes = (~new_obs['barcodes'].isin(prev['barcodes'])).tolist()

In [144]:
sum(keep_barcodes)

376867

In [145]:
new_obs = new_obs[keep_barcodes]

In [146]:
# `new_obs` is a row-filtered slice at this point; build a new frame with
# assign() rather than setting a column on the slice, which triggers
# SettingWithCopyWarning. The dict form is needed because the column name
# contains a '.'.
new_obs = new_obs.assign(
    **{new_col: new_obs[new_col].cat.remove_unused_categories()}
)

In [147]:
new_obs.shape

(376867, 55)

Join and assign to new clusters

In [148]:
joint_df = join_anno(og_obs, new_obs, new_col)

In [149]:
joint_df['AIFI_L3'].isna().sum()

226

In [150]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1.5,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf71fb7848b611ea8957bafe6d70929e,6,T cell,Yes,Memory CD4 T cell,Yes,CM CD4 T cell,Yes
1,cf7216a848b611ea8957bafe6d70929e,8,T cell,Yes,Memory CD4 T cell,Yes,GZMB- CD27+ EM CD4 T cell,Yes
2,cf7226c048b611ea8957bafe6d70929e,3,T cell,Yes,Memory CD4 T cell,Yes,CM CD4 T cell,Yes
3,cf7238b848b611ea8957bafe6d70929e,3,T cell,Yes,Memory CD4 T cell,Yes,CM CD4 T cell,Yes
4,cf72802048b611ea8957bafe6d70929e,2,T cell,Yes,Memory CD4 T cell,Yes,GZMB- CD27- EM CD4 T cell,Yes


In [151]:
label_df = reduce_anno(joint_df, og_anno, new_col)

  .groupby(new_col)['AIFI_L3']


In [152]:
label_df

Unnamed: 0,leiden_resolution_1.5,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,2,GZMB- CD27- EM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
1,3,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
2,6,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
3,8,GZMB- CD27+ EM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
4,9,GZMK+ CD27+ EM CD8 T cell,T cell,Yes,Memory CD8 T cell,Yes,Yes
5,15,GZMK+ Vd2 gdT,T cell,Yes,gdT,Yes,Yes
6,20,CM CD4 T cell,T cell,Yes,Memory CD4 T cell,Yes,Yes
7,21,Contamination,Contamination,Yes,Contamination,Yes,Yes


In [153]:
# Save the cluster-to-label table for this cell class, stamped with today's date.
out_dir = '../04-Annotation/annotations/'
out_file = f'{out_dir}/pbmc_ref_{cell_class}_annotations_{date.today()}.csv'
label_df.to_csv(out_file)

In [154]:
all_new = pd.concat([prev, joint_df.drop(new_col, axis = 1)])

In [155]:
new_counts = all_new['AIFI_L3'].value_counts()
new_counts

AIFI_L3
Core naive CD4 T cell                     352448
CM CD4 T cell                             173515
Core naive CD8 T cell                     117205
KLRF1- GZMB+ CD27- EM CD8 T cell           70221
GZMB- CD27+ EM CD4 T cell                  62476
GZMB- CD27- EM CD4 T cell                  62105
GZMK+ CD27+ EM CD8 T cell                  60871
CD8 MAIT                                   41831
CM CD8 T cell                              25337
SOX4+ naive CD4 T cell                     22484
Naive CD4 Treg                             19576
KLRF1+ GZMB+ CD27- EM CD8 T cell           18269
GZMB+ Vd2 gdT                              18267
GZMK+ Vd2 gdT                              17795
KLRF1- GZMB+ CD27- memory CD4 T cell       17558
Memory CD4 Treg                            15659
Contamination                               8450
Naive CD8 T cell Platelet Doublets          7293
ISG+ naive CD4 T cell                       7149
GZMK- CD27+ EM CD8 T cell                   6691
SOX4+ naive 

In [156]:
sum(new_counts)

1190744

Original type counts

In [157]:
old_counts = og_obs['AIFI_L3'].value_counts()
old_counts

AIFI_L3
Core naive CD4 T cell                     352699
CM CD4 T cell                             173578
Core naive CD8 T cell                     117237
KLRF1- GZMB+ CD27- EM CD8 T cell           70293
GZMB- CD27+ EM CD4 T cell                  62544
GZMB- CD27- EM CD4 T cell                  62119
GZMK+ CD27+ EM CD8 T cell                  60917
CD8 MAIT                                   41850
CM CD8 T cell                              25405
SOX4+ naive CD4 T cell                     22485
Naive CD4 Treg                             19578
KLRF1+ GZMB+ CD27- EM CD8 T cell           18310
GZMB+ Vd2 gdT                              18294
GZMK+ Vd2 gdT                              17826
KLRF1- GZMB+ CD27- memory CD4 T cell       17574
Memory CD4 Treg                            15661
Contamination                               8564
Naive CD8 T cell Platelet Doublets          7293
ISG+ naive CD4 T cell                       7149
GZMK- CD27+ EM CD8 T cell                   6703
SOX4+ naive 

In [158]:
sum(old_counts)

1193859

Difference in counts

In [159]:
new_counts - old_counts

AIFI_L3
Adaptive NK cell                         -1605
C5_C12_Contamination                      -131
CD4 MAIT                                   -30
CD4 naive CD16 Monocytes Doublets         -204
CD4 naive Erythrocytes Doublets             -6
CD4 naive Platelet Doublets                -77
CD8 MAIT                                   -19
CD8aa                                       -8
CM CD4 T cell                              -63
CM CD8 T cell                              -68
Contamination                             -114
Core naive CD4 T cell                     -251
Core naive CD8 T cell                      -32
DN T cell                                    0
GZMB+ Vd2 gdT                              -27
GZMB- CD27+ EM CD4 T cell                  -68
GZMB- CD27- EM CD4 T cell                  -14
GZMK+ CD27+ EM CD8 T cell                  -46
GZMK+ Vd2 gdT                              -31
GZMK+ memory CD4 Treg                        0
GZMK- CD27+ EM CD8 T cell                  -12
HBB+ 

Percent difference

In [160]:
(new_counts - old_counts) / old_counts * 100

AIFI_L3
Adaptive NK cell                         -28.301887
C5_C12_Contamination                      -5.861298
CD4 MAIT                                  -0.937793
CD4 naive CD16 Monocytes Doublets         -5.792164
CD4 naive Erythrocytes Doublets           -0.684932
CD4 naive Platelet Doublets               -1.636209
CD8 MAIT                                  -0.045400
CD8aa                                     -0.143087
CM CD4 T cell                             -0.036295
CM CD8 T cell                             -0.267664
Contamination                             -1.331154
Core naive CD4 T cell                     -0.071165
Core naive CD8 T cell                     -0.027295
DN T cell                                  0.000000
GZMB+ Vd2 gdT                             -0.147589
GZMB- CD27+ EM CD4 T cell                 -0.108723
GZMB- CD27- EM CD4 T cell                 -0.022537
GZMK+ CD27+ EM CD8 T cell                 -0.075513
GZMK+ Vd2 gdT                             -0.173903
GZMK

In [161]:
old_id = og_obs[['barcodes','AIFI_L3']].rename({'AIFI_L3':'old_L3'}, axis = 1)
new_id = all_new[['barcodes','AIFI_L3']].rename({'AIFI_L3':'new_L3'}, axis = 1)

In [162]:
sum(new_id['barcodes'].isin(old_id['barcodes']))

1190744

In [163]:
comp_old = old_id.merge(new_id, on = 'barcodes', how = 'left')

In [164]:
comp_old.shape

(1193859, 3)

In [165]:
sum(comp_old['old_L3'] == comp_old['new_L3'])

1190744

In [166]:
# Rows of the original annotations whose label is not reproduced in all_new.
# A missing new_L3 (NaN) never compares equal, so barcodes that were not
# matched in the left join above are kept here as well.
diff_old = comp_old[comp_old['old_L3'] != comp_old['new_L3']]

In [167]:
diff_old.head()

Unnamed: 0,barcodes,old_L3,new_L3
26,cf748be048b611ea8957bafe6d70929e,GZMK+ CD27+ EM CD8 T cell,
192,cf94d71048b611ea8957bafe6d70929e,GZMB- CD27+ EM CD4 T cell,
202,cf96f8ec48b611ea8957bafe6d70929e,GZMB- CD27+ EM CD4 T cell,
1272,a77f2b22489811ea92388a6a0c683ffb,GZMK+ CD27+ EM CD8 T cell,
1636,c6abb0d648bd11eab2ee7a6edf799b61,GZMB- CD27+ EM CD4 T cell,


In [168]:
diff_old['old_L3'].value_counts()

old_L3
Adaptive NK cell                          1605
Core naive CD4 T cell                      251
CD4 naive CD16 Monocytes Doublets          204
C5_C12_Contamination                       131
Contamination                              114
CD4 naive Platelet Doublets                 77
Proliferating Cells+Monocytes Doublets      76
KLRF1- GZMB+ CD27- EM CD8 T cell            72
CM CD8 T cell                               68
GZMB- CD27+ EM CD4 T cell                   68
CM CD4 T cell                               63
KLRF1+ effector Vd1 gdT                     47
GZMK+ CD27+ EM CD8 T cell                   46
KLRF1+ GZMB+ CD27- EM CD8 T cell            41
Core naive CD8 T cell                       32
GZMK+ Vd2 gdT                               31
CD4 MAIT                                    30
GZMB+ Vd2 gdT                               27
KLRF1- effector Vd1 gdT                     22
CD8 MAIT                                    19
KLRF1- GZMB+ CD27- memory CD4 T cell        16
GZMB- 

In [169]:
comp_new = new_id.merge(old_id, on = 'barcodes', how = 'left')

In [170]:
comp_new.shape

(1191327, 3)

In [171]:
sum(comp_new['old_L3'] == comp_new['new_L3'])

1190744

In [172]:
# Rows of all_new whose label differs from the original annotation.
# A missing label (NaN) on either side never compares equal, so such rows
# are kept here as well.
diff_new = comp_new[comp_new['old_L3'] != comp_new['new_L3']]

In [173]:
diff_new.shape

(583, 3)

In [174]:
sum(diff_new['new_L3'].isna())

583

Are missing Adaptive NKs now assigned to NK clusters?

In [175]:
nk_uuid = '86d536fc-7975-4ff6-9701-ce29a5df19b5'

In [176]:
nk_obs = read_parquet_uuid(nk_uuid)

In [177]:
nk_obs.shape

(1952128, 52)

In [178]:
sum(diff_old['old_L3'] == 'Adaptive NK cell')

1605

In [179]:
missing_anks = diff_old.loc[diff_old['old_L3'] == 'Adaptive NK cell','barcodes'].tolist()

In [180]:
sum(nk_obs['barcodes'].isin(missing_anks))

1597

Yes, they're (almost) all in there.

Most other differences are due to doublets and contamination clusters, which is fine

In [181]:
import session_info
session_info.show()