## Adapt Other cell annotations

With slightly different clustering, we need to shift the original expert cluster annotations to new cluster numbers.

We can do this using the barcodes associated with each cluster.

In [1]:
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

## Read new subclustering results from HISE

In [2]:
# Label for the cell class handled in this notebook; it is inserted into the
# name of the annotation file written at the end.
cell_class = "others"

In [3]:
# UUID of the subclustering result file, and the cache directory
# that holds the file for that UUID.
h5ad_uuid = '1eb6ca8c-b8ed-4968-b515-c954497441dc'
h5ad_path = f'/home/jupyter/cache/{h5ad_uuid}'

In [4]:
# Ask hisepy to cache the file only when its cache directory does not exist yet,
# so re-running the notebook does not request it again.
h5ad_is_cached = os.path.isdir(h5ad_path)
if not h5ad_is_cached:
    hise_res = hisepy.reader.cache_files([h5ad_uuid])

In [5]:
# Locate the cached file inside the UUID's cache directory.
# The listing is sorted so the choice is deterministic if the directory ever
# holds more than one entry, and an empty directory (e.g. an interrupted
# download) fails with a clear message instead of a bare IndexError.
h5ad_listing = sorted(os.listdir(h5ad_path))
if not h5ad_listing:
    raise FileNotFoundError(
        'No cached file found in {p}; remove the directory and re-run the caching cell.'.format(
            p = h5ad_path
        )
    )
h5ad_filename = h5ad_listing[0]
h5ad_file = os.path.join(h5ad_path, h5ad_filename)

In [6]:
# Load the new subclustering result from the cached .h5ad file as an AnnData object.
adata = sc.read_h5ad(h5ad_file)

In [7]:
adata

AnnData object with n_obs × n_vars = 24603 × 4305
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'file.id', 'subject.cmv', 'subject.bmi', 'celltypist.low', 'seurat.l1', 'seurat.l1.score', 'seurat.l2', 'seurat.l2.score', 'seurat.l2.5', 'seurat.l2.5.score', 'seurat.l3', 'seurat.l3.score', 'predicted_doublet', 'doublet_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden', 'leiden_resolution_1', 'leiden_resol

## Read old results

In [8]:
# Parquet file holding the results of the original analysis,
# given relative to the notebook's working directory.
og_pq_file = "Other_Annotation.parquet"

In [9]:
# Load the table from the original analysis; its 'barcodes' and
# 'leiden_resolution_1' columns are what the label transfer below relies on.
og_obs = pd.read_parquet(og_pq_file)

In [10]:
og_obs.shape

(23944, 63)

## Read original annotations

In [11]:
# Expert annotations keyed by original cluster id ('leiden_resolution_1'),
# carrying the AIFI_L1/L2/L3 labels and their *_Final flags.
# NOTE(review): presumably one row per original cluster — confirm against the CSV.
og_anno = pd.read_csv('Other_Res1_Annotation.csv')

In [12]:
# Keep only the cell barcode and its original cluster assignment, as an
# independent copy so later edits never touch og_obs.
og_df = og_obs.loc[:, ['barcodes', 'leiden_resolution_1']].copy()

In [13]:
# Column that holds the original cluster id in both the cell table and the
# annotation table.
join_col = "leiden_resolution_1"

In [14]:
# Cast the cluster ids read from the CSV to string-valued categoricals before
# joining (presumably to match the dtype of the cluster column in og_obs — verify).
cluster_ids = og_anno[join_col].astype('string')
og_anno[join_col] = cluster_ids.astype('category')

In [15]:
# Attach the expert labels to every original cell through its cluster id.
# The key comes from join_col (the same column that was just cast), and
# validate = 'many_to_one' makes the merge fail loudly if the annotation table
# lists a cluster more than once, which would otherwise silently duplicate cells.
og_df = og_df.merge(
    og_anno,
    how = 'left',
    on = join_col,
    validate = 'many_to_one'
)

In [16]:
og_df.head()

Unnamed: 0,barcodes,leiden_resolution_1,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf7341b848b611ea8957bafe6d70929e,7,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes
1,cf7400bc48b611ea8957bafe6d70929e,4,Monocytes+Erythocytes doublet,Yes,Monocytes+Erythocytes doublet,Yes,Monocytes+Erythocytes doublet,Yes
2,cf763b8448b611ea8957bafe6d70929e,1,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes
3,cf76501a48b611ea8957bafe6d70929e,9,Progenitor cell,Yes,Progenitor cell,Yes,CMP cell,Yes
4,cf83c0ba48b611ea8957bafe6d70929e,1,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes


Rename the original Leiden cluster column so its name doesn't conflict with the new clustering

In [17]:
# Give the original cluster column its own name ('og_cl') so it can sit next
# to the new 'leiden_resolution_1' column after the barcode join.
og_df = og_df.rename(columns = {"leiden_resolution_1": "og_cl"})

Join original annotations to new clusters by barcodes

In [18]:
# Barcode and new cluster assignment for every cell in the new object, with
# the obs index replaced by a plain 0..n-1 range.
new_df = (
    adata.obs
    .loc[:, ['barcodes', 'leiden_resolution_1']]
    .copy()
    .reset_index(drop = True)
)

In [19]:
# Carry the original labels over to the new cells by barcode. Cells that were
# not part of the original analysis keep NaN annotations (left join).
# validate = 'one_to_one' guards against a duplicated barcode on either side,
# which would inflate the row count and skew the per-cluster label frequencies.
joint_df = new_df.merge(
    og_df,
    how = 'left',
    on = 'barcodes',
    validate = 'one_to_one'
)

In [20]:
joint_df.head()

Unnamed: 0,barcodes,leiden_resolution_1,og_cl,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3,AIFI_L3_Final
0,cf7341b848b611ea8957bafe6d70929e,3,7,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes
1,cf7400bc48b611ea8957bafe6d70929e,4,4,Monocytes+Erythocytes doublet,Yes,Monocytes+Erythocytes doublet,Yes,Monocytes+Erythocytes doublet,Yes
2,cf763b8448b611ea8957bafe6d70929e,1,1,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes
3,cf76501a48b611ea8957bafe6d70929e,9,9,Progenitor cell,Yes,Progenitor cell,Yes,CMP cell,Yes
4,cf83c0ba48b611ea8957bafe6d70929e,1,1,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes


In [21]:
# Drop the per-cell identifier and the original cluster id; only the new
# cluster and the transferred labels are needed to tally label frequencies.
freq_anno = joint_df.drop(columns = ['barcodes', 'og_cl'])

Use `pd.Series.mode` on each group of `leiden_resolution_1` cells to get the most frequent value of `AIFI_L3`

In [22]:
def _first_mode(labels):
    """Return the most frequent non-null label of a group.

    Series.mode() drops NaN and returns every tied value in sorted order, so
    taking the first entry gives a single, deterministic label per cluster.
    A group with no annotated cells yields NaN rather than an empty result.
    """
    modes = labels.mode()
    return modes.iloc[0] if len(modes) > 0 else float('nan')

# One row per new cluster with its most frequent original L3 label.
# observed = True restricts the result to clusters that actually contain cells
# and makes the grouping behavior explicit for categorical cluster columns.
freq_anno = (
    freq_anno
    .groupby('leiden_resolution_1', observed = True)['AIFI_L3']
    .agg(_first_mode)
    .to_frame()
    .reset_index()
)

  .groupby('leiden_resolution_1')['AIFI_L3']


In [23]:
freq_anno.head()

Unnamed: 0,leiden_resolution_1,AIFI_L3
0,0,Platelet
1,1,T+Erythocytes doublet
2,2,Platelet
3,3,T+Erythocytes doublet
4,4,Monocytes+Erythocytes doublet


Select the unique associations between L3 labels and the other levels of the hierarchy from the original annotations. This should work because each L3 label is a subset of a single L2 and L1 label.

In [24]:
# Lookup table of label combinations: remove the cluster id, then collapse
# clusters that share identical labels and flags into a single row.
label_levels = og_anno.drop(columns = ['leiden_resolution_1'])
anno_levels = label_levels.drop_duplicates()

Join the other labels to our new annotations

In [25]:
# Add the L1/L2 labels and *_Final flags that belong to each cluster's L3 label.
# validate = 'many_to_one' enforces the assumption that every L3 label maps to
# exactly one row of higher-level labels; if it does not, the merge raises
# instead of silently emitting several rows for the same cluster.
new_anno = freq_anno.merge(
    anno_levels,
    how = 'left',
    on = 'AIFI_L3',
    validate = 'many_to_one'
)

In [26]:
new_anno

Unnamed: 0,leiden_resolution_1,AIFI_L3,AIFI_L1,AIFI_L1_Final,AIFI_L2,AIFI_L2_Final,AIFI_L3_Final
0,0,Platelet,Platelet,Yes,Platelet,Yes,Yes
1,1,T+Erythocytes doublet,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,Yes
2,2,Platelet,Platelet,Yes,Platelet,Yes,Yes
3,3,T+Erythocytes doublet,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,Yes
4,4,Monocytes+Erythocytes doublet,Monocytes+Erythocytes doublet,Yes,Monocytes+Erythocytes doublet,Yes,Yes
5,5,T+Erythocytes doublet,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,Yes
6,6,Platelet,Platelet,Yes,Platelet,Yes,Yes
7,7,T+Erythocytes doublet,T+Erythocytes doublet,Yes,T+Erythocytes doublet,Yes,Yes
8,8,Erythrocyte,Erythrocyte,Yes,Erythrocyte,Yes,Yes
9,9,CMP cell,Progenitor cell,Yes,Progenitor cell,Yes,Yes


In [27]:
# Write the cluster-level annotations to a dated CSV in the annotation folder.
out_dir = '../04-Annotation/annotations/'
# Create the folder if needed so the write cannot fail on a fresh checkout.
os.makedirs(out_dir, exist_ok = True)
# os.path.join avoids the doubled slash that '{p}/{f}' produced with the
# trailing '/' in out_dir; the target file is the same.
out_file = os.path.join(
    out_dir,
    'pbmc_ref_{c}_annotations_{d}.csv'.format(
        c = cell_class,
        d = date.today()
    )
)
# The row index is written as the first, unnamed column (pandas default).
new_anno.to_csv(out_file)

In [28]:
# Number of cells per transferred L3 label in the new object; cells without
# a transferred label (NaN) are not counted.
new_counts = joint_df.loc[:, 'AIFI_L3'].value_counts()
new_counts

AIFI_L3
T+Erythocytes doublet            8647
Platelet                         8037
Monocytes+Erythocytes doublet    2476
Erythrocyte                      1336
CMP cell                         1082
B+Erythocytes doublet             928
NK+Erythocytes doublet            883
CLP cell                          367
BaEoMaP cell                       78
Name: count, dtype: int64

Original type counts

In [29]:
# Number of cells per L3 label in the original annotated table, for comparison.
old_counts = og_df.loc[:, 'AIFI_L3'].value_counts()
old_counts

AIFI_L3
T+Erythocytes doublet            8679
Platelet                         8037
Monocytes+Erythocytes doublet    2515
Erythrocyte                      1337
CMP cell                         1082
B+Erythocytes doublet             953
NK+Erythocytes doublet            887
CLP cell                          376
BaEoMaP cell                       78
Name: count, dtype: int64

Difference in counts

In [30]:
# Align the new counts on the original labels before subtracting, so a label
# that no longer appears at all shows up as a negative count instead of NaN.
new_counts.reindex(old_counts.index, fill_value = 0) - old_counts

AIFI_L3
T+Erythocytes doublet           -32
Platelet                          0
Monocytes+Erythocytes doublet   -39
Erythrocyte                      -1
CMP cell                          0
B+Erythocytes doublet           -25
NK+Erythocytes doublet           -4
CLP cell                         -9
BaEoMaP cell                      0
Name: count, dtype: int64

Percent difference

In [31]:
# Percent change per label relative to the original counts. The new counts are
# aligned on the original labels first, so a label that disappeared entirely
# reads as -100% instead of NaN.
aligned_new_counts = new_counts.reindex(old_counts.index, fill_value = 0)
(aligned_new_counts - old_counts) / old_counts * 100

AIFI_L3
T+Erythocytes doublet           -0.368706
Platelet                         0.000000
Monocytes+Erythocytes doublet   -1.550696
Erythrocyte                     -0.074794
CMP cell                         0.000000
B+Erythocytes doublet           -2.623295
NK+Erythocytes doublet          -0.450958
CLP cell                        -2.393617
BaEoMaP cell                     0.000000
Name: count, dtype: float64

In [32]:
# Display session information at the end of the notebook
# (presumably the versions of the packages used, for reproducibility — verify).
import session_info
session_info.show()