In [1]:
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.cluster.hierarchy import linkage,leaves_list, dendrogram
from scipy.spatial.distance import pdist, euclidean
from scipy.special import comb
import re

In [46]:
def read_file(df_type, path='./output/condensed_lung_atlas_in_cpm.h5'):
    """Load one averaged gene-expression table from the condensed atlas HDF5 file.

    Parameters
    ----------
    df_type : str
        Name of the top-level group to read. The notebook uses
        'celltype', 'celltype_dataset' or 'celltype_dataset_timepoint'.
    path : str or path-like, optional
        Location of the HDF5 file. Defaults to the path this notebook
        originally hard-coded, so existing ``read_file(df_type)`` calls
        behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        float32 table built from ``<df_type>/gene_expression_average`` and
        then transposed: rows are labelled by the stored ``axis0`` entries
        and columns by the stored ``axis1`` entries.

    Raises
    ------
    KeyError
        If ``df_type`` is not a group in the file (the message lists the
        groups that are available), or if the expected datasets are missing
        inside it.
    """
    with h5py.File(path, "r") as h5_data:
        if df_type not in h5_data:
            raise KeyError(
                f"{df_type!r} not found in {path!r}; "
                f"available groups: {sorted(h5_data.keys())}"
            )

        # Look the group up once instead of re-walking the hierarchy for
        # every dataset.
        average_group = h5_data[df_type]['gene_expression_average']

        # Everything is materialised into NumPy arrays inside the `with`
        # block, so the returned frame does not reference the closed file.
        df = pd.DataFrame(
            data=np.array(average_group['block0_values']).astype(np.float32),
            index=np.array(average_group['axis1'].asstr()),
            columns=np.array(average_group['axis0'].asstr()),
        ).T
    return df

In [47]:
# Load the per-(celltype, dataset, timepoint) averages and keep only the row
# for the gene of interest.
df = read_file('celltype_dataset_timepoint')
gene = 'Car4'
# `.loc[[gene]]` (list indexer) keeps the result a one-row DataFrame and raises
# KeyError immediately when the gene is absent. `.filter(items=[gene], axis=0)`
# would instead return an empty frame and defer the failure to later cells.
filtered_df = df.loc[[gene]]
filtered_df

Unnamed: 0,Adventitial fibroblast_ACZ_P21,Adventitial fibroblast_ACZ_P7,Adventitial fibroblast_Hurskainen2021_P14,Adventitial fibroblast_Hurskainen2021_P3,Adventitial fibroblast_Hurskainen2021_P7,Adventitial fibroblast_TMS_18m,Adventitial fibroblast_TMS_24m,Adventitial fibroblast_TMS_3m,Airway smooth muscle_ACZ_P21,Airway smooth muscle_Hurskainen2021_P14,...,neutrophil_ACZ_E18.5,neutrophil_ACZ_P1,neutrophil_ACZ_P21,neutrophil_ACZ_P7,neutrophil_Hurskainen2021_P14,neutrophil_Hurskainen2021_P3,neutrophil_Hurskainen2021_P7,neutrophil_TMS_18m,neutrophil_TMS_24m,neutrophil_TMS_3m
Car4,0.015945,0.0,7.019531,6.179688,9.023438,2.792969,13.703125,4.105469,0.0,14.140625,...,0.0,6.242188,0.056,26.359375,39.53125,25.859375,5.808594,8.78125,0.0,27.359375


In [7]:
# Split every column label of the form "<celltype>_<dataset>_<timepoint>" at
# its FIRST underscore. Splitting on the "<celltype>_" prefix instead would
# return the wrong piece whenever that prefix occurs again later in the label.
# Dict keys give first-seen-order de-duplication without the quadratic
# `x not in list` scans.
celltypes_seen = {}
dt_seen = {}  # dataset and timepoint combinations

for column_name in filtered_df.columns:
    celltype, separator, dataset_timepoint = column_name.partition("_")
    if not separator:
        # No underscore at all: the label does not follow the expected pattern.
        raise ValueError(
            f"Column {column_name!r} does not match '<celltype>_<dataset>_<timepoint>'"
        )
    celltypes_seen.setdefault(celltype)
    dt_seen.setdefault(dataset_timepoint)

all_celltypes = list(celltypes_seen)
dt_combinations = list(dt_seen)

In [8]:
# Inspect the parsed labels. Only the last bare expression of a cell is
# rendered, so the output shows `dt_combinations`; `all_celltypes` is evaluated
# here but not displayed.
all_celltypes
dt_combinations

['ACZ_P21',
 'ACZ_P7',
 'Hurskainen2021_P14',
 'Hurskainen2021_P3',
 'Hurskainen2021_P7',
 'TMS_18m',
 'TMS_24m',
 'TMS_3m',
 'ACZ_E18.5',
 'ACZ_P1']

In [58]:
# Build expression[dataset_timepoint][celltype] -> averaged value of the
# selected gene. Combinations with no matching column get the sentinel -1.
expression = {}
for dt in dt_combinations:
    per_celltype = {}
    for ct in all_celltypes:
        column = "_".join((ct, dt))
        per_celltype[ct] = (
            filtered_df[column].values[0]
            if column in filtered_df.columns
            else -1
        )
    expression[dt] = per_celltype
        

In [59]:
# Display the nested {dataset_timepoint: {celltype: value}} mapping; a value of
# -1 marks a celltype/dataset/timepoint combination with no column in the data.
expression

{'ACZ_P21': {'Adventitial fibroblast': 0.015945435,
  'Airway smooth muscle': 0.0,
  'Alveolar fibroblast': 26.765625,
  'Alveolar type II': 0.0,
  'Alveolar type I': 0.0,
  'Arterial EC II': 0.0,
  'Arterial EC I': 0.0,
  'B cell': 13.0625,
  'Car4+ capillaries': 7268.0,
  'DC III': 0.0,
  'DC II': 0.0,
  'DC I': 120.1875,
  'Early Car4- capillaries': 310.0,
  'Early adventitial fibroblast': 0.0,
  'Early airway smooth muscle': 0.0,
  'Early alveolar fibroblast': 0.0,
  'Fibroblast precursor': -1,
  'IL cell': 0.0,
  'Late Car4- capillaries': 585.5,
  'Lymphatic EC': 0.0,
  'Mac III': 1047.0,
  'Mac II': 0.0,
  'Mac IV': 8.203125,
  'Mac I': -1,
  'Mac V': 0.0019779205,
  'Myofibroblast and smooth muscle precursor': -1,
  'Myofibroblast': -1,
  'NK cell': 12.9140625,
  'Nonproliferative embryonic EC': -1,
  'Pericyte': 0.0034923553,
  'Proliferating fibroblast': -1,
  'Proliferating myofibroblast': -1,
  'Proliferating pericyte': -1,
  'Proliferative EC': 550.5,
  'Striated muscle': 0