In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import scanpy as sc
import pickle as pkl
import sys
sys.path.append("..")

In [3]:
# Load the .h5ad file from an absolute, machine-specific path.
# NOTE(review): a later cell calls data.write_h5ad() on this exact path, so the
# input file is overwritten in place — confirm that is intended.
data = sc.read_h5ad("/lfs/local/0/ayushag/uce_data/coarse_uce_10000_per_dataset_33l_8ep.h5ad")
# Bare expression: display the loaded object's summary.
data

In [28]:
def make_labels(adata, column):
    """
    Make integer labels for one column of the adata object.

    Deterministic process since we are using LabelEncoder, which sorts
    the unique values before assigning integer codes.

    Args:
        adata: object exposing an ``obs`` table (an AnnData in this notebook).
        column: name of the ``obs`` column to encode.

    Returns:
        Array of integer codes, one entry per row of ``adata.obs``.
    """
    # Imported locally so that merely defining the function does not need sklearn.
    from sklearn.preprocessing import LabelEncoder

    return LabelEncoder().fit_transform(adata.obs[column].values)

In [32]:
# Encode every row's coarse tissue as an integer label.
tissue_type = make_labels(data, 'coarse_tissue')
# Show which tissue name is mapped to which label; repeated names just
# overwrite their own entry, leaving one entry per tissue.
{tissue_name: label for tissue_name, label in zip(data.obs['coarse_tissue'], tissue_type)}

{'brain': 4,
 'lung': 12,
 'female reproductive': 7,
 'male reproductive': 13,
 'blood': 2,
 'eye': 6,
 'pancreas': 16,
 'tongue': 21,
 'bone': 3,
 'kidney': 10,
 'adipose': 0,
 'heart': 8,
 'intestines': 9,
 'liver': 11,
 'stomach': 19,
 'esophagus': 5,
 'thymus': 20,
 'skin': 17,
 'spleen': 18,
 'muscle': 14,
 'bladder': 1,
 'nose': 15}

In [24]:
import os
# Point EMB_PATH at the directory that holds the pickled conditioning embeddings.
# NOTE(review): this directory (.../ucegen/ucegen/helpers/cond_embs/) differs from
# the one the embedding pickles are written to further down
# (.../ucegen/helpers/cond_embs/) — confirm which one is correct.
os.environ['EMB_PATH'] = "/dfs/user/ayushag/ucegen/ucegen/helpers/cond_embs/"
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(os.environ['EMB_PATH'], "random_tissue_type_emb.pkl"), 'rb') as f:
    tissue_type_emb = pkl.load(f)
# Bare expression: display the loaded object (the captured output shows a dict
# of integer label -> float32 array).
tissue_type_emb

{0: array([ 1.0909079 ,  0.02205896,  1.2550262 , ...,  0.09497749,
        -0.5430705 ,  2.206266  ], dtype=float32),
 1: array([-0.17955552, -2.4261034 ,  0.5934913 , ..., -0.19445932,
         0.14138907, -0.7746364 ], dtype=float32),
 2: array([-0.6401492 , -0.35096616, -1.5632441 , ...,  0.71997184,
         1.6256462 ,  0.17792806], dtype=float32),
 3: array([ 1.0791659 , -1.3136574 , -0.21075338, ..., -0.5424797 ,
        -0.81301624, -0.12719327], dtype=float32),
 4: array([-1.9705681 ,  0.92527944, -0.7815848 , ..., -2.2507322 ,
        -1.0740699 ,  0.22059344], dtype=float32),
 5: array([-0.17892289, -0.45541883, -2.6831372 , ..., -1.4223462 ,
         0.37045804,  0.20250699], dtype=float32),
 6: array([-0.96819705,  0.03238983, -0.5556706 , ...,  1.149582  ,
        -0.4985467 , -0.2895851 ], dtype=float32),
 7: array([-0.1973634, -1.3966022,  1.2681212, ...,  0.8424581,  1.1032338,
        -2.511348 ], dtype=float32),
 8: array([-0.22072755, -0.11917397, -0.2224622 , ...,

### Make embeddings for different conditional variables

In [5]:
# NOTE(review): this import rebinds `make_labels`, which is also defined inline
# in an earlier cell of this notebook — confirm the two implementations agree.
from helpers.data_helpers import make_labels

# Label arrays for the two conditioning columns of data.obs.
cell_type = make_labels(data, "coarse_cell_type")
tissue_type = make_labels(data, "coarse_tissue")

In [None]:
from helpers.data_utils import create_random_label_embeddings

# NOTE(review): no random seed is set anywhere in this notebook; if the helper
# draws fresh random values, re-running this cell overwrites both pickles with
# different embeddings — confirm and seed if reproducibility matters.
# 1280 is presumably the embedding dimension — TODO confirm against the helper.
cell_type_emb = create_random_label_embeddings(cell_type, 1280)    # dict of {label: embedding}
with open("/dfs/user/ayushag/ucegen/helpers/cond_embs/random_cell_type_emb.pkl", "wb") as f:
    pkl.dump(cell_type_emb, f)

# Same procedure for the tissue labels.
tissue_type_emb = create_random_label_embeddings(tissue_type, 1280)
with open("/dfs/user/ayushag/ucegen/helpers/cond_embs/random_tissue_type_emb.pkl", "wb") as f:
    pkl.dump(tissue_type_emb, f)

In [7]:
from helpers.data_utils import create_averaged_label_embeddings

# Build the cell-type embeddings with the averaging helper and pickle them.
# Presumably one averaged vector per label, with 1280 as its dimension — TODO
# confirm against the helper's implementation.
cell_avg_embedding = create_averaged_label_embeddings(data, cell_type, 'coarse_cell_type', 1280)
with open("/dfs/user/ayushag/ucegen/helpers/cond_embs/averaged_cell_type_emb.pkl", "wb") as f:
    pkl.dump(cell_avg_embedding, f)

# Same procedure for the tissue labels.
# NOTE(review): this pickle goes to .../datasets/cond_embs/ while the other three
# embedding pickles in this notebook go to .../helpers/cond_embs/ — confirm the
# directory is intentional.
tis_avg_embedding = create_averaged_label_embeddings(data, tissue_type, 'coarse_tissue', 1280)
with open("/dfs/user/ayushag/ucegen/datasets/cond_embs/averaged_tissue_type_emb.pkl", "wb") as f:
    pkl.dump(tis_avg_embedding, f)

{0: array([ 0.03866731,  0.01493303, -0.02345957, ...,  0.00994206,
       -0.05746154, -0.0015082 ], dtype=float32), 1: array([-0.0188835 , -0.00589531,  0.00778027, ..., -0.00882767,
        0.0239766 ,  0.00914056], dtype=float32), 2: array([-0.02350244, -0.01001995,  0.03087838, ...,  0.00670765,
        0.02786334,  0.00567702], dtype=float32), 3: array([-0.01504765, -0.01546091,  0.03408169, ..., -0.00154339,
        0.02264903,  0.01509834], dtype=float32), 4: array([-0.0056897 , -0.00039082, -0.00556177, ..., -0.01795588,
       -0.03113024,  0.01938825], dtype=float32), 5: array([-0.00940052, -0.02001699,  0.02165758, ...,  0.01083889,
        0.00377982, -0.01340587], dtype=float32), 6: array([-0.01127574, -0.05953153,  0.0014274 , ...,  0.01397338,
       -0.00183006, -0.04500984], dtype=float32), 7: array([ 0.01159541,  0.00071055, -0.01246785, ...,  0.00672238,
        0.01455051,  0.01433145], dtype=float32), 8: array([-0.00655379,  0.00996195, -0.00930615, ..., -0.008854

### Coarsening conditional variables

In [57]:
# Count rows per value of the original (fine-grained) `cell_type` column.
data.obs["cell_type"].value_counts()

cell_type
neuron                                      807435
oligodendrocyte                             157730
missing                                      58248
astrocyte                                    53539
glutamatergic neuron                         48306
                                             ...  
meis2 expressing cortical GABAergic cell         1
vascular leptomeningeal cell (Mmus)              1
brush cell of bronchus                           1
Purkinje myocyte                                 1
nodal myocyte                                    1
Name: count, Length: 839, dtype: int64

In [58]:
# Count rows per value of the original (fine-grained) `tissue` column.
data.obs['tissue'].value_counts()

tissue
cerebral cortex               343567
missing                       223529
blood                         195922
cerebral nuclei               150000
thalamic complex              145450
                               ...  
white matter of cerebellum        85
bone spine                        82
Liver-10X_P7_1                    51
Bladder-10X_P4_3                  18
kidney blood vessel               16
Name: count, Length: 197, dtype: int64

In [59]:
from datasets.mappings.coarse_ct_mapping import get_ct_mapping
from datasets.mappings.coarse_tis_mapping import get_tissue_mapping

# Fetch the fine -> coarse mapping tables. Later cells iterate them with
# .items(), so both are used as dicts keyed by coarse category; the exact value
# layout (list vs. nested dict) is defined in the mapping modules — TODO confirm.
cell_type_mapping = get_ct_mapping()
tissue_mapping = get_tissue_mapping()

In [60]:
# Function to map fine cell types to coarse cell types, with adjustment for missing cell types
def map_to_coarse(var_type, mapping_dict):
    """
    Look up the coarse category that a fine-grained value belongs to.

    Args:
        var_type: fine-grained value to classify.
        mapping_dict: dict of {coarse name: container of fine-grained values}.

    Returns:
        The first coarse name (in dict iteration order) whose container holds
        ``var_type``, or the lowercase string "missing" when none does.
    """
    matching_coarse = (
        coarse_name
        for coarse_name, fine_values in mapping_dict.items()
        if var_type in fine_values
    )
    # next() with a default yields the fallback when nothing matched.
    return next(matching_coarse, "missing")

In [68]:
# Update the mapping dictionary to include a "missing" category (lowercase, to
# match the fallback string returned by map_to_coarse). The list is empty, so
# this key never matches a value by itself.
cell_type_mapping["missing"] = []

# Apply the mapping to create a new 'coarse_cell_type' column in data.obs;
# map_to_coarse is called once per row with mapping_dict passed as a keyword.
data.obs['coarse_cell_type'] = data.obs['cell_type'].apply(map_to_coarse, mapping_dict=cell_type_mapping)

In [70]:
# Drop rows whose coarse cell type could not be resolved. Subsetting an AnnData
# returns a view; copy it so that later modifications and the write_h5ad call
# operate on a real object instead of implicitly converting the view.
data = data[data.obs['coarse_cell_type'] != 'missing'].copy()

In [71]:
# Count rows per coarse cell type after filtering; dropna=False would also
# surface any NaN entries as their own row.
data.obs['coarse_cell_type'].value_counts(dropna=False)

coarse_cell_type
neuron                                  939277
oligodendrocyte                         196002
epithelial                              184984
t cell                                  168607
endothelial                              84222
b cell                                   80352
fibroblast                               77175
macrophage                               73660
stem                                     71701
monocyte                                 70996
astrocyte                                56443
glial                                    44057
lymphoid                                 32677
pneumocyte                               32625
muscle                                   26123
mesenchymal                              25368
thymocyte                                21316
erythroid                                20058
tumor                                    20055
endocrine                                14145
leukocyte                                13

In [72]:
# Persist the filtered object to disk.
# NOTE(review): this is the same path the notebook reads `data` from in its load
# cell, so the input file is overwritten in place — confirm that is intended.
data.write_h5ad("/lfs/local/0/ayushag/uce_data/coarse_uce_10000_per_dataset_33l_8ep.h5ad")

  df[key] = c


### Saving the actual original values for examination

In [21]:
# Collect the distinct original tissue annotations. Sorting makes the order (and
# therefore the contents of the text file written from `unique`) reproducible,
# because iteration order of a set of strings can differ between interpreter runs.
tis_list = data.obs["tissue"].values
unique = sorted(set(tis_list))

In [9]:
# Save the unique values of data.obs["tissue"] to a local .txt file, one per line.
# The context manager guarantees the file is flushed and closed even if a write
# raises part-way through.
with open("tissue.txt", "w") as saved_file:
    for tissue_name in unique:
        saved_file.write(tissue_name + "\n")


In [18]:
# Read back the tissue names saved to tissue.txt, one per line. The context
# manager closes the file even if reading fails.
with open("tissue.txt", "r") as tissue_file:
    tissue_list = tissue_file.read().splitlines()

# Flatten tissue_mapping into a single list of fine-grained tissue names.
# Values may be a dict (one extra nesting level), a list, or a single item.
flat_tissue_categories = []
for category, subcategories in tissue_mapping.items():
    if isinstance(subcategories, dict):
        for subcategory, tissues in subcategories.items():
            if isinstance(tissues, list):
                flat_tissue_categories.extend(tissues)
            else:
                flat_tissue_categories.append(tissues)
    elif isinstance(subcategories, list):
        flat_tissue_categories.extend(subcategories)
    else:
        flat_tissue_categories.append(subcategories)

# Print every saved tissue name that the mapping does not cover.
for tissue_name in tissue_list:
    if tissue_name not in flat_tissue_categories:
        print(tissue_name)