In [1]:
# This script merges three h5ad files and generates
# 1. The absolute and relative cell frequencies
# 2. Pseudo bulk RNA counts per samples averaged across the cells.

In [2]:
import anndata
import pandas as pd
import numpy as np

In [3]:
# The HTAN CRC scRNA exp data is split into epithelial and non-epithelial at: https://cellxgene.cziscience.com/collections/a48f5033-3438-4550-8574-cdff3263fdfd
# Download the files,
# Discovery (DIS) set of human colorectal tumor: Epithelial
# Validation (Val) set of human colorectal tumor: Epithelial
# VAL and DIS datasets: Non-Epithelial
dis_epi_h5ad = anndata.read_h5ad("dis_epi.h5ad")
val_epi_h5ad = anndata.read_h5ad("val_epi.h5ad")
non_epi_h5ad = anndata.read_h5ad("dis_val_non_epi.h5ad")

In [4]:
# For discovery epithelial
dis_epi_exp_df = pd.DataFrame(dis_epi_h5ad.X, columns=dis_epi_h5ad.var_names, index=dis_epi_h5ad.obs_names)
dis_epi_cell_meta_df = pd.DataFrame(dis_epi_h5ad.obs)
dis_epi_gene_meta_df = pd.DataFrame(dis_epi_h5ad.var)

In [5]:
# For validation epithelial
val_epi_exp_df = pd.DataFrame(val_epi_h5ad.X, columns=val_epi_h5ad.var_names, index=val_epi_h5ad.obs_names)
val_epi_cell_meta_df = pd.DataFrame(val_epi_h5ad.obs)
val_epi_gene_meta_df = pd.DataFrame(val_epi_h5ad.var)

In [6]:
# For non-epithelial
non_epi_exp_df = pd.DataFrame(non_epi_h5ad.X, columns=non_epi_h5ad.var_names, index=non_epi_h5ad.obs_names)
non_epi_cell_meta_df = pd.DataFrame(non_epi_h5ad.obs)
non_epi_gene_meta_df = pd.DataFrame(non_epi_h5ad.var)

In [7]:
# Merge the cell meta 
merged_cell_meta = pd.concat([dis_epi_cell_meta_df, val_epi_cell_meta_df, non_epi_cell_meta_df])
merged_cell_meta = merged_cell_meta[~merged_cell_meta.index.duplicated(keep='first')]
merged_cell_meta

Unnamed: 0,HTAN Parent Data File ID,HTAN Specimen ID,Cell_Type,Polyp_Type,Sample_Classification,development_stage_ontology_term_id,donor_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id,...,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid,Tumor_Type
AAATCAGAAGTGATGC-0,HTA11_3410_200000101113111,HTA11_3410_2000001011,GOB,TA,AD,HsapDv:0000164,HTA11_3410,HANCESTRO:0005,PATO:0000384,UBERON:0001157,...,goblet cell,TruDrop,tubular adenoma,Homo sapiens,male,transverse colon,European,70-year-old human stage,02J9@!!fe6,
GACCCGAATGAGGCAACG-0,HTA11_3410_200000101113111,HTA11_3410_2000001011,TAC,TA,AD,HsapDv:0000164,HTA11_3410,HANCESTRO:0005,PATO:0000384,UBERON:0001157,...,transit amplifying cell,TruDrop,tubular adenoma,Homo sapiens,male,transverse colon,European,70-year-old human stage,={#ZrNd~6s,
GACTTCTTCGATATGCAT-0,HTA11_3410_200000101113111,HTA11_3410_2000001011,ASC,TA,AD,HsapDv:0000164,HTA11_3410,HANCESTRO:0005,PATO:0000384,UBERON:0001157,...,neoplastic cell,TruDrop,tubular adenoma,Homo sapiens,male,transverse colon,European,70-year-old human stage,Xq(x&#DXEv,
GAACCACGCTACCTTGCC-0,HTA11_3410_200000101113111,HTA11_3410_2000001011,ASC,TA,AD,HsapDv:0000164,HTA11_3410,HANCESTRO:0005,PATO:0000384,UBERON:0001157,...,neoplastic cell,TruDrop,tubular adenoma,Homo sapiens,male,transverse colon,European,70-year-old human stage,sW^nV;R~|`,
TGCCTCACGTGGAGCT-0,HTA11_3410_200000101113111,HTA11_3410_2000001011,ASC,TA,AD,HsapDv:0000164,HTA11_3410,HANCESTRO:0005,PATO:0000384,UBERON:0001157,...,neoplastic cell,TruDrop,tubular adenoma,Homo sapiens,male,transverse colon,European,70-year-old human stage,lD3fvh9JSx,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATCTTTGTACTGAGTT-19-5898,HTA11_6801_200000201113111,HTA11_6801_2000002011,T,,NL,HsapDv:0000158,HTA11_6801,HANCESTRO:0005,PATO:0000384,UBERON:0001159,...,T cell,TruDrop,normal,Homo sapiens,male,sigmoid colon,European,64-year-old human stage,-#M;T*$+N;,NL
ATCCGCTAAAGCTTCT-19-5898,HTA11_6801_200000201113111,HTA11_6801_2000002011,T,,NL,HsapDv:0000158,HTA11_6801,HANCESTRO:0005,PATO:0000384,UBERON:0001159,...,T cell,TruDrop,normal,Homo sapiens,male,sigmoid colon,European,64-year-old human stage,pUJHFo~IZZ,NL
TGATCCCTGATCGGGTAGT-20-5898,HTA11_6182_200000101113111,HTA11_6182_2000001011,PLA,,AD,HsapDv:0000146,HTA11_6182,HANCESTRO:0005,PATO:0000384,UBERON:0001158,...,plasma cell,TruDrop,tubulovillous adenoma,Homo sapiens,male,descending colon,European,52-year-old human stage,_!-?h_JTtm,TVA
TGAATGACTTTGGGAAGGT-22-5898,HTA11_8504_200000201113111,HTA11_8504_2000002011,MAS,,NL,HsapDv:0000140,HTA11_8504,HANCESTRO:0005,PATO:0000383,UBERON:0001156,...,mast cell,TruDrop,normal,Homo sapiens,female,ascending colon,European,46-year-old human stage,wCDpR!fFn+,NL


In [8]:
# Merge the gene meta data
merged_gene_meta = pd.concat([dis_epi_gene_meta_df, val_epi_gene_meta_df, non_epi_gene_meta_df])
merged_gene_meta = merged_gene_meta[~merged_gene_meta.index.duplicated(keep='first')]
merged_gene_meta

Unnamed: 0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length
ENSG00000121410,False,A1BG,NCBITaxon:9606,gene,3999
ENSG00000268895,False,A1BG-AS1,NCBITaxon:9606,gene,3374
ENSG00000148584,False,A1CF,NCBITaxon:9606,gene,9603
ENSG00000175899,False,A2M,NCBITaxon:9606,gene,6318
ENSG00000245105,False,A2M-AS1,NCBITaxon:9606,gene,2948
...,...,...,...,...,...
ENSG00000275923,False,ENSG00000275923.1,NCBITaxon:9606,gene,310
ENSG00000239608,False,RUVBL1-AS1,NCBITaxon:9606,gene,588
ENSG00000230005,False,SNAP47-AS1,NCBITaxon:9606,gene,1708
ENSG00000198573,False,SPANXC,NCBITaxon:9606,gene,408


In [9]:
# Merge the expression data
merged_exp_df = pd.concat([dis_epi_exp_df, val_epi_exp_df, non_epi_exp_df])
merged_exp_df

Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000148584,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000256904,ENSG00000128274,ENSG00000118017,...,ENSG00000278177,ENSG00000267078,ENSG00000249196,ENSG00000257534,ENSG00000228559,ENSG00000275923,ENSG00000239608,ENSG00000230005,ENSG00000198573,ENSG00000275743
AAATCAGAAGTGATGC-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,
GACCCGAATGAGGCAACG-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,
GACTTCTTCGATATGCAT-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,
GAACCACGCTACCTTGCC-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,
TGCCTCACGTGGAGCT-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATCTTTGTACTGAGTT-19-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,,0.0,0.0,,,0.0,
ATCCGCTAAAGCTTCT-19-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,,0.0,0.0,,,0.0,
TGATCCCTGATCGGGTAGT-20-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,,0.0,0.0,,,0.0,
TGAATGACTTTGGGAAGGT-22-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,,0.0,0.0,,,0.0,


In [10]:
# Calculate Absolute and relative frequencies of cells
absolute_freq = merged_cell_meta.groupby(['HTAN Specimen ID', 'Cell_Type']).size().reset_index(name='Absolute_Frequency')
total_freq = merged_cell_meta.groupby('HTAN Specimen ID').size().reset_index(name='Total_freq')
result = pd.merge(absolute_freq, total_freq, on='HTAN Specimen ID')
result['Relative_freq'] = result['Absolute_Frequency'] / result['Total_freq']
result['Relative_freq'] = np.ceil(result['Relative_freq'] * 1000) / 1000
result

Unnamed: 0,HTAN Specimen ID,Cell_Type,Absolute_Frequency,Total_freq,Relative_freq
0,HTA11_10167_2000001011,ABS,382,592,0.646
1,HTA11_10167_2000001011,ASC,1,592,0.002
2,HTA11_10167_2000001011,B,6,592,0.011
3,HTA11_10167_2000001011,CT,1,592,0.002
4,HTA11_10167_2000001011,EE,4,592,0.007
...,...,...,...,...,...
1497,HTA11_99999974143_84620,END,45,117,0.385
1498,HTA11_99999974143_84620,FIB,26,117,0.223
1499,HTA11_99999974143_84620,MYE,17,117,0.146
1500,HTA11_99999974143_84620,PLA,9,117,0.077


In [11]:
absolute_freq_pivot = result.pivot(index='Cell_Type', columns='HTAN Specimen ID', values='Absolute_Frequency').fillna(0).astype(int)
absolute_freq_pivot.to_csv('abs_counts.txt', sep='\t')
absolute_freq_pivot

HTAN Specimen ID,HTA11_10167_2000001011,HTA11_10167_2000002021,HTA11_104_2000001011,HTA11_104_2000002011,HTA11_104_2000002021,HTA11_10623_2000001011,HTA11_10711_2000001011,HTA11_11156_2000001011,HTA11_11156_2000001021,HTA11_11167_2000001011,...,HTA11_9408_2000002021,HTA11_99999965062_69753,HTA11_99999965062_69756,HTA11_99999965104_69815,HTA11_99999970781_79442,HTA11_99999970781_79443,HTA11_99999971662_82457,HTA11_99999973458_83798,HTA11_99999973899_84307,HTA11_99999974143_84620
Cell_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABS,382,451,46,547,296,574,139,73,34,313,...,265,0,0,0,0,0,0,0,0,0
ASC,1,1,4,6,2,49,219,91,12,103,...,3,0,0,0,0,0,0,0,0,0
B,6,1,0,93,0,0,2,0,1,0,...,0,93,1,0,0,0,3,1,17,0
CT,1,11,0,32,4,12,0,1,1,1,...,40,0,0,0,0,0,0,0,0,0
EE,4,5,0,13,0,5,0,2,0,1,...,12,0,0,0,0,0,0,0,0,0
END,0,1,0,1,0,1,2,0,0,0,...,4,0,0,0,0,0,134,28,44,45
FIB,2,3,0,1,0,21,1,0,2,2,...,1,18,0,0,0,0,297,6,32,26
GOB,54,90,2,332,44,149,54,76,12,100,...,100,0,0,0,0,0,0,0,0,0
MAS,0,0,1,1,0,2,0,2,0,0,...,2,8,0,3,0,0,0,0,0,0
MYE,1,2,2,4,0,1,4,4,1,5,...,5,99,2,0,9,0,214,301,58,17


In [12]:
relative_freq_pivot = result.pivot(index='Cell_Type', columns='HTAN Specimen ID', values='Relative_freq').fillna(0)
relative_freq_pivot.to_csv('relative_freq.txt', sep='\t')
relative_freq_pivot

HTAN Specimen ID,HTA11_10167_2000001011,HTA11_10167_2000002021,HTA11_104_2000001011,HTA11_104_2000002011,HTA11_104_2000002021,HTA11_10623_2000001011,HTA11_10711_2000001011,HTA11_11156_2000001011,HTA11_11156_2000001021,HTA11_11167_2000001011,...,HTA11_9408_2000002021,HTA11_99999965062_69753,HTA11_99999965062_69756,HTA11_99999965104_69815,HTA11_99999970781_79442,HTA11_99999970781_79443,HTA11_99999971662_82457,HTA11_99999973458_83798,HTA11_99999973899_84307,HTA11_99999974143_84620
Cell_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABS,0.646,0.502,0.517,0.437,0.79,0.539,0.259,0.245,0.41,0.468,...,0.499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ASC,0.002,0.002,0.045,0.005,0.006,0.046,0.408,0.306,0.145,0.154,...,0.006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B,0.011,0.002,0.0,0.075,0.0,0.0,0.004,0.0,0.013,0.0,...,0.0,0.163,0.013,0.0,0.0,0.0,0.005,0.003,0.045,0.0
CT,0.002,0.013,0.0,0.026,0.011,0.012,0.0,0.004,0.013,0.002,...,0.076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EE,0.007,0.006,0.0,0.011,0.0,0.005,0.0,0.007,0.0,0.002,...,0.023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
END,0.0,0.002,0.0,0.001,0.0,0.001,0.004,0.0,0.0,0.0,...,0.008,0.0,0.0,0.0,0.0,0.0,0.188,0.059,0.115,0.385
FIB,0.004,0.004,0.0,0.001,0.0,0.02,0.002,0.0,0.025,0.003,...,0.002,0.032,0.0,0.0,0.0,0.0,0.416,0.013,0.084,0.223
GOB,0.092,0.1,0.023,0.265,0.118,0.14,0.101,0.256,0.145,0.15,...,0.188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MAS,0.0,0.0,0.012,0.001,0.0,0.002,0.0,0.007,0.0,0.0,...,0.004,0.014,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0
MYE,0.002,0.003,0.023,0.004,0.0,0.001,0.008,0.014,0.013,0.008,...,0.01,0.173,0.026,0.0,0.22,0.0,0.3,0.634,0.152,0.146


In [13]:
# Merge the cell metadata to the exp df
cell_meta = merged_cell_meta[['HTAN Specimen ID','Cell_Type']]
gene_exp = pd.merge(merged_exp_df, merged_cell_meta[['HTAN Specimen ID']],left_index=True, right_index=True)

In [14]:
gene_exp

Unnamed: 0,ENSG00000121410,ENSG00000268895,ENSG00000148584,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000256904,ENSG00000128274,ENSG00000118017,...,ENSG00000267078,ENSG00000249196,ENSG00000257534,ENSG00000228559,ENSG00000275923,ENSG00000239608,ENSG00000230005,ENSG00000198573,ENSG00000275743,HTAN Specimen ID
AAATCAGAAGTGATGC-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,HTA11_3410_2000001011
GACCCGAATGAGGCAACG-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,HTA11_3410_2000001011
GACTTCTTCGATATGCAT-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,HTA11_3410_2000001011
GAACCACGCTACCTTGCC-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,HTA11_3410_2000001011
TGCCTCACGTGGAGCT-0,-0.019469,-0.005354,-0.679289,-0.069293,-0.015500,-0.053891,-0.057669,-0.005543,-0.064006,-0.019828,...,,,,,,,,,,HTA11_3410_2000001011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ATCTTTGTACTGAGTT-19-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,0.0,0.0,,,0.0,,HTA11_6801_2000002011
ATCCGCTAAAGCTTCT-19-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,0.0,0.0,,,0.0,,HTA11_6801_2000002011
TGATCCCTGATCGGGTAGT-20-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,0.0,0.0,,,0.0,,HTA11_6182_2000001011
TGAATGACTTTGGGAAGGT-22-5898,-0.028682,,-0.091783,-0.282198,-0.023624,-0.042998,-0.051998,0.000000,-0.109127,-0.029677,...,,,,0.0,0.0,,,0.0,,HTA11_8504_2000002011


In [15]:
# Calculate pseudo bulk exp by averaging the values across the cells from the same sample
pseudo_bulk_rna = gene_exp.groupby('HTAN Specimen ID').mean()
pseudo_bulk_rna = np.ceil(pseudo_bulk_rna * 1000) / 1000
pseudo_bulk_rna

Unnamed: 0_level_0,ENSG00000121410,ENSG00000268895,ENSG00000148584,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000256904,ENSG00000128274,ENSG00000118017,...,ENSG00000278177,ENSG00000267078,ENSG00000249196,ENSG00000257534,ENSG00000228559,ENSG00000275923,ENSG00000239608,ENSG00000230005,ENSG00000198573,ENSG00000275743
HTAN Specimen ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HTA11_10167_2000001011,0.015,,0.127,-0.035,-0.012,-0.005,-0.026,-0.006,-0.028,-0.017,...,0.0,-0.004,-0.008,-0.004,-0.003,0.0,-0.007,-0.006,0.0,0.0
HTA11_10167_2000002021,-0.018,,0.170,0.005,-0.011,-0.044,-0.030,-0.007,-0.037,-0.016,...,0.0,-0.004,-0.008,-0.004,-0.004,0.0,-0.007,-0.006,0.0,0.0
HTA11_104_2000001011,-0.020,-0.005,0.401,0.080,-0.016,0.289,-0.057,-0.004,0.168,-0.020,...,,,,,0.000,0.0,,,0.0,
HTA11_104_2000002011,-0.020,-0.005,0.261,0.003,-0.016,0.021,0.008,-0.004,0.085,0.066,...,,,,,0.000,0.0,,,0.0,
HTA11_104_2000002021,-0.019,-0.005,0.048,0.071,-0.015,-0.053,-0.010,-0.005,0.142,-0.019,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HTA11_99999970781_79443,-0.028,,0.490,-0.282,-0.023,-0.042,-0.051,0.000,-0.109,-0.029,...,,,,,0.000,0.0,,,0.0,
HTA11_99999971662_82457,0.065,,-0.005,0.099,0.030,0.032,-0.051,0.000,0.349,0.024,...,,,,,0.000,0.0,,,0.0,
HTA11_99999973458_83798,-0.028,,-0.066,-0.105,-0.023,0.028,0.008,0.000,0.037,-0.029,...,,,,,0.000,0.0,,,0.0,
HTA11_99999973899_84307,-0.028,,-0.091,0.002,0.061,-0.042,-0.051,0.000,0.012,-0.029,...,,,,,0.000,0.0,,,0.0,


In [16]:
pseudo_bulk_rna_pivot = pseudo_bulk_rna.transpose()
pseudo_bulk_rna_pivot

HTAN Specimen ID,HTA11_10167_2000001011,HTA11_10167_2000002021,HTA11_104_2000001011,HTA11_104_2000002011,HTA11_104_2000002021,HTA11_10623_2000001011,HTA11_10711_2000001011,HTA11_11156_2000001011,HTA11_11156_2000001021,HTA11_11167_2000001011,...,HTA11_9408_2000002021,HTA11_99999965062_69753,HTA11_99999965062_69756,HTA11_99999965104_69815,HTA11_99999970781_79442,HTA11_99999970781_79443,HTA11_99999971662_82457,HTA11_99999973458_83798,HTA11_99999973899_84307,HTA11_99999974143_84620
ENSG00000121410,0.015,-0.018,-0.020,-0.020,-0.019,-0.019,-0.019,-0.018,-0.019,-0.018,...,-0.018,0.020,-0.028,-0.028,-0.028,-0.028,0.065,-0.028,-0.028,-0.028
ENSG00000268895,,,-0.005,-0.005,-0.005,,,,,,...,,,,,,,,,,
ENSG00000148584,0.127,0.170,0.401,0.261,0.048,0.155,-0.277,-0.312,-0.372,-0.182,...,-0.103,0.005,0.164,-0.091,0.112,0.490,-0.005,-0.066,-0.091,0.022
ENSG00000175899,-0.035,0.005,0.080,0.003,0.071,-0.022,0.008,0.051,0.353,0.001,...,-0.031,0.099,-0.266,1.596,-0.230,-0.282,0.099,-0.105,0.002,0.273
ENSG00000245105,-0.012,-0.011,-0.016,-0.016,-0.015,-0.012,-0.012,-0.011,-0.012,-0.011,...,-0.011,-0.005,-0.023,-0.023,-0.023,-0.023,0.030,-0.023,0.061,-0.023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000275923,0.000,0.000,0.000,0.000,,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
ENSG00000239608,-0.007,-0.007,,,,-0.007,-0.007,-0.007,-0.007,-0.007,...,-0.007,,,,,,,,,
ENSG00000230005,-0.006,-0.006,,,,-0.006,-0.006,-0.006,-0.006,-0.006,...,-0.006,,,,,,,,,
ENSG00000198573,0.000,0.000,0.000,0.000,,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [17]:
# Merge the gene metadata to the exp df
bulk_rna_exp = pd.merge(pseudo_bulk_rna_pivot, merged_gene_meta[['feature_name']], left_index=True, right_index=True)
bulk_rna_exp.set_index('feature_name',inplace=True)
bulk_rna_exp = bulk_rna_exp.applymap(lambda x: np.ceil(x * 1000) / 1000).fillna('NA')
bulk_rna_exp

Unnamed: 0_level_0,HTA11_10167_2000001011,HTA11_10167_2000002021,HTA11_104_2000001011,HTA11_104_2000002011,HTA11_104_2000002021,HTA11_10623_2000001011,HTA11_10711_2000001011,HTA11_11156_2000001011,HTA11_11156_2000001021,HTA11_11167_2000001011,...,HTA11_9408_2000002021,HTA11_99999965062_69753,HTA11_99999965062_69756,HTA11_99999965104_69815,HTA11_99999970781_79442,HTA11_99999970781_79443,HTA11_99999971662_82457,HTA11_99999973458_83798,HTA11_99999973899_84307,HTA11_99999974143_84620
feature_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.015,-0.017,-0.019,-0.019,-0.018,-0.018,-0.018,-0.017,-0.018,-0.017,...,-0.017,0.02,-0.028,-0.028,-0.028,-0.028,0.065,-0.028,-0.028,-0.028
A1BG-AS1,,,-0.004,-0.004,-0.004,,,,,,...,,,,,,,,,,
A1CF,0.128,0.171,0.401,0.262,0.049,0.156,-0.277,-0.312,-0.372,-0.181,...,-0.103,0.005,0.165,-0.09,0.113,0.491,-0.004,-0.065,-0.09,0.022
A2M,-0.035,0.005,0.08,0.004,0.072,-0.021,0.009,0.051,0.353,0.002,...,-0.03,0.099,-0.266,1.596,-0.23,-0.282,0.099,-0.104,0.003,0.274
A2M-AS1,-0.012,-0.01,-0.016,-0.016,-0.014,-0.012,-0.012,-0.01,-0.012,-0.01,...,-0.01,-0.004,-0.023,-0.023,-0.023,-0.023,0.03,-0.023,0.062,-0.023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000275923.1,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RUVBL1-AS1,-0.007,-0.007,,,,-0.007,-0.007,-0.007,-0.007,-0.007,...,-0.007,,,,,,,,,
SNAP47-AS1,-0.006,-0.006,,,,-0.006,-0.006,-0.006,-0.006,-0.006,...,-0.006,,,,,,,,,
SPANXC,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Run this if needed
# Subset the output file to the required samples - the h5ad files have samples not in the cohort.
# Provide a .txt file with the list of samples to subset
sample_list = pd.read_csv('samples.txt', delimiter='\t', header=None)
row_labels = sample_list.T.values.tolist()[0]
available_labels = [label for label in row_labels if label in absolute_freq_pivot.columns]
subset_absolute_freq_df = absolute_freq_pivot.loc[:, available_labels]
subset_absolute_freq_df.to_csv('data_absolute_counts.txt', sep='\t')

subset_relative_freq_df = relative_freq_pivot.loc[:, available_labels]
subset_relative_freq_df.to_csv('data_relative_fraction.txt', sep='\t')

subset_bulk_rna_exp_df = bulk_rna_exp.loc[:, available_labels]
subset_bulk_rna_exp_df.to_csv('data_mrna_seq_expression.txt', sep='\t')