# Annotate the dataset once and for all

So we can just load it and use it.


Stephen Fleming

2022.12.01

In [1]:
import anndata
import numpy as np
import pandas as pd
import os
import gc

In [2]:
DATA_DIR = 'data'

# Load dataset

In [3]:
adata = anndata.read_h5ad(os.path.join(DATA_DIR, 'scLevyAll.h5ad'))
adata

AnnData object with n_obs × n_vars = 1276568 × 37701

# Annotate with metadata

In [4]:
# metadata
metadata = pd.read_csv(
    os.path.join(DATA_DIR, 'scLevyAll_metadata.txt'),
    sep='\t',
    header=0,
    index_col=False,
)

# rename columns for aesthetic purposes
metadata = metadata.rename(columns={'PREFIX': 'sample', 
                                    'CELL_BARCODE': 'barcode',
                                    'predClass': 'pred_class',
                                    'DONOR': 'donor',
                                    'NUM_GENIC_READS': 'num_genic_reads',
                                    'NUM_TRANSCRIPTS': 'num_transcripts',
                                    'NUM_GENES': 'num_genes',
                                    'bestSample': 'best_sample',
                                    'bestLikelihood': 'best_likelihood'})

# index will be sample_barcode
metadata.index = metadata['sample'].astype(str) + '_' + metadata['barcode'].astype(str)

# limit metadata to cells we actually have in the dataset
metadata = metadata.loc[adata.obs.index]

# add metadata to adata
adata.obs = metadata

# add a few more annotations
def _get_condition(sample):
    s = sample.split('_')
    if len(s) == 2:
        return 'control'
    else:
        return '_'.join(s[1:-1])
    
def _get_celltype(sample):
    return sample.split('_')[0]

adata.obs['cell_type'] = adata.obs['sample'].astype(str).apply(_get_celltype)
adata.obs['condition'] = adata.obs['sample'].astype(str).apply(_get_condition)
adata.obs['perturbation'] = adata.obs['condition'].apply(lambda s: s.split('_')[0])  # no time info

# add in some donor info: diagnosis, age, sex, genetics
donor_df = pd.read_csv(
    os.path.join(DATA_DIR, 'McleanLevy_Dropulation_Cohort.csv'),
    header=0,
    index_col=False,
)
donor_df = donor_df.rename(columns={'Linking Donor ID': 'donor',
                                    'Clinical Diagnosis': 'clinical_diagnosis',
                                    'Age': 'age',
                                    'Sex': 'sex',
                                    'Genetics': 'genetics'})
adata.obs = adata.obs.reset_index().merge(
    right=donor_df[['donor', 'clinical_diagnosis', 'age', 'sex', 'genetics']],
    how='left',
    on='donor',
).set_index('index')

# add in some drug info
drug_df = pd.read_csv(
    os.path.join(DATA_DIR, 'LevyDrug_class.csv'),
    header=0,
    index_col=False,
)
drug_df = drug_df.rename(columns={'Perturbation': 'perturbation',
                                  'Category': 'perturbation_category',
                                  'Vehicle': 'perturbation_vehicle'})
adata.obs = adata.obs.reset_index().merge(
    right=drug_df,
    how='left',
    on='perturbation',
).set_index('index')

# make stuff categorical
for c in ['sample', 'donor', 'best_sample', 'cell_type', 'condition', 
          'sex', 'genetics', 'clinical_diagnosis',
          'perturbation', 'perturbation_category', 'perturbation_vehicle']:
    adata.obs[c] = adata.obs[c].astype('category')

adata

AnnData object with n_obs × n_vars = 1276568 × 37701
    obs: 'sample', 'barcode', 'pred_class', 'doublet', 'donor', 'num_genic_reads', 'num_transcripts', 'num_genes', 'best_sample', 'best_likelihood', 'pvalue', 'num_retained_transcripts', 'pct_coding', 'pct_utr', 'pct_intergenic', 'pct_genic', 'pct_intronic', 'pct_mt', 'pct_ribosomal', 'frac_contamination', 'cell_type', 'condition', 'perturbation', 'clinical_diagnosis', 'age', 'sex', 'genetics', 'perturbation_category', 'perturbation_vehicle'

In [5]:
adata.obs.head()

Unnamed: 0_level_0,sample,barcode,pred_class,doublet,donor,num_genic_reads,num_transcripts,num_genes,best_sample,best_likelihood,...,frac_contamination,cell_type,condition,perturbation,clinical_diagnosis,age,sex,genetics,perturbation_category,perturbation_vehicle
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Astrocyte_A1CYTO_a_TGCTCCATCTGCGGGT,Astrocyte_A1CYTO_a,TGCTCCATCTGCGGGT,,,ML902-5848,25479.0,20008.0,5922.0,ML902-5848,-197.33,...,0.013364,Astrocyte,A1CYTO,A1CYTO,Bipolar | Psychosis,29.0,Male,-,Inflammatory response,PBS
Astrocyte_A1CYTO_a_ATGCCTCAGTCACTGT,Astrocyte_A1CYTO_a,ATGCCTCAGTCACTGT,,,ML902-5848,24976.0,19621.0,5944.0,ML902-5848,-227.35,...,0.016097,Astrocyte,A1CYTO,A1CYTO,Bipolar | Psychosis,29.0,Male,-,Inflammatory response,PBS
Astrocyte_A1CYTO_a_AGCCAATAGGCCTTGC,Astrocyte_A1CYTO_a,AGCCAATAGGCCTTGC,,,ML909-1385,24190.0,18722.0,5658.0,ML909-1385,-205.34,...,0.016857,Astrocyte,A1CYTO,A1CYTO,"Psychosis - Chronic, NOS | Multiple Substance ...",27.0,Male,-,Inflammatory response,PBS
Astrocyte_A1CYTO_a_TTCCACGCAGCCCAGT,Astrocyte_A1CYTO_a,TTCCACGCAGCCCAGT,,,ML904-8146,23625.0,18270.0,5312.0,ML904-8146,-172.97,...,0.013659,Astrocyte,A1CYTO,A1CYTO,Schizoaffective,60.0,Female,-,Inflammatory response,PBS
Astrocyte_A1CYTO_a_GCCAGCACACAAGGTG,Astrocyte_A1CYTO_a,GCCAGCACACAAGGTG,,,ML902-5848,23675.0,18245.0,5767.0,ML902-5848,-199.52,...,0.016442,Astrocyte,A1CYTO,A1CYTO,Bipolar | Psychosis,29.0,Male,-,Inflammatory response,PBS


# Checks

In [6]:
adata.obs['cell_type'].value_counts()

Astrocyte    673130
Neuron       441795
NPC          103147
iPSC          58496
Name: cell_type, dtype: int64

In [7]:
adata.obs['condition'].value_counts()

control      380816
DMSO_24hr     96509
DMSO_72hr     89632
GLUT          61104
TNFa          57998
CLOZ          57955
HALO          55249
AZT           50400
ATOR          47330
EFA           44772
SIM           42997
C1Q           40087
PBS_24hr      36245
H2O2          33514
A1CYTO        32905
IL1a          31863
INFy_24hr     23867
ISRD          20455
GluN2a        19526
INFy_7hr      18346
PBS_7hr       17531
INFa_7hr      17467
Name: condition, dtype: int64

In [81]:
pd.crosstab(adata.obs['condition'], adata.obs['perturbation'])

condition,ATOR,AZT,CLOZ,DMSO,EFA,GLUT,GluN2a,H2O2,HALO,INFa,INFy,ISRD,PBS,SIM,TNFa,control
full_condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ATOR,13371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AZT,0,16751,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CLOZ,0,0,19735,0,0,0,0,0,0,0,0,0,0,0,0,0
DMSO_24hr,0,0,0,42256,0,0,0,0,0,0,0,0,0,0,0,0
DMSO_72hr,0,0,0,49773,0,0,0,0,0,0,0,0,0,0,0,0
EFA,0,0,0,0,11768,0,0,0,0,0,0,0,0,0,0,0
GLUT,0,0,0,0,0,24377,0,0,0,0,0,0,0,0,0,0
GluN2a,0,0,0,0,0,0,19526,0,0,0,0,0,0,0,0,0
H2O2,0,0,0,0,0,0,0,5346,0,0,0,0,0,0,0,0
HALO,0,0,0,0,0,0,0,0,18116,0,0,0,0,0,0,0


In [9]:
pd.crosstab(adata.obs['perturbation'], adata.obs['cell_type'])

cell_type,Astrocyte,NPC,Neuron,iPSC
perturbation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1CYTO,32905,0,0,0
ATOR,33959,0,13371,0
AZT,33649,0,16751,0
C1Q,40087,0,0,0
CLOZ,38220,0,19735,0
DMSO,94112,0,92029,0
EFA,33004,0,11768,0
GLUT,36727,0,24377,0
GluN2a,0,0,19526,0
H2O2,28168,0,5346,0


In [10]:
pd.crosstab(adata.obs['donor'], adata.obs['cell_type'])

cell_type,Astrocyte,NPC,Neuron,iPSC
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CW20183,13215,3276,15588,1461
ML611-2911,34,2673,6427,1256
ML611-3363,0,2196,4482,1439
ML611-5459,0,2435,11690,1136
ML730-5535,39,2461,5371,1705
ML730-7078,24030,4372,21638,1356
ML730-8735,127,1516,11776,790
ML787-6234,42868,2617,9308,2062
ML787-7283,587,3002,4070,2000
ML830-2683,0,1895,6375,1291


In [11]:
pd.crosstab(adata.obs['donor'], adata.obs['perturbation'])

perturbation,A1CYTO,ATOR,AZT,C1Q,CLOZ,DMSO,EFA,GLUT,GluN2a,H2O2,HALO,IL1a,INFa,INFy,ISRD,PBS,SIM,TNFa,control
donor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
CW20183,532,1180,1355,792,1527,4681,1090,1510,591,718,1396,554,645,1403,760,1666,1243,1784,10113
ML611-2911,0,177,224,0,300,1328,151,381,297,87,282,0,238,658,314,388,90,260,5215
ML611-3363,0,104,175,0,201,942,111,269,180,51,216,0,175,425,213,325,44,199,4487
ML611-5459,0,325,386,0,512,2661,303,740,513,98,471,0,463,1201,560,602,153,380,5893
ML730-5535,0,155,219,0,225,1111,131,315,226,71,214,0,223,500,236,350,130,225,5245
ML730-7078,1317,1653,2118,1578,2244,7747,1853,2406,965,1226,2184,1067,886,2077,956,2219,1236,2480,15184
ML730-8735,0,288,508,0,489,2538,302,650,518,145,459,0,433,1165,562,802,169,549,4632
ML787-6234,1830,2585,3070,2212,3044,7538,2200,3069,406,1598,2759,1773,357,898,401,2214,3288,2339,15274
ML787-7283,0,197,219,0,257,898,126,208,169,54,154,0,163,382,172,330,173,282,5875
ML830-2683,0,169,223,0,314,1430,178,401,283,67,259,0,266,581,271,410,66,222,4421


# Write annotated dataset

In [13]:
adata.write(os.path.join(DATA_DIR, 'sc_levy_annotated.h5ad'))

In [14]:
!ls -lh {DATA_DIR}

total 41G
-rw-r--r-- 1 sfleming sfleming  596 Dec  1 17:44 LevyDrug_class.csv
-rw-r--r-- 1 sfleming sfleming 7.1K Dec  1 17:44 McleanLevy_Dropulation_Cohort.csv
-rw-rw-r-- 1 sfleming sfleming 6.1G Dec  1 17:46 scLevyAll.h5ad
-rw-rw-r-- 1 sfleming sfleming 502M Dec  1 17:47 scLevyAll_metadata.txt
-rw-rw-r-- 1 sfleming sfleming  35G Dec  1 20:17 sc_levy_annotated.h5ad
