# Extract all the features

Similar to notebook2, but here we extract the features with all of the pretrained models.

In [1]:
# Development aid: pick up edits made to imported modules without restarting the kernel.
# TO REMOVE when notebook is stable

%load_ext autoreload
%autoreload 2

### Common Imports

In [2]:
import torch
import tarfile
import os
from anndata import read_h5ad

# tissue_purifier import
import tissue_purifier as tp

### Download and untar the example dataset

In [3]:
import tissue_purifier.io

# Location of the example dataset in the google bucket and on the local disk.
bucket_name = "ld-data-bucket"
data_source_path = "tissue-purifier/slideseq_testis_anndata_h5ad.tar.gz"
data_destination_path = "./slideseq_testis_anndata_h5ad.tar.gz"
data_destination_folder = "./testis_anndata"

# Uncomment the two steps below if the data is not yet available locally.
# Step 1: download the tarball from the google bucket
# tp.io.download_from_bucket(bucket_name, data_source_path, data_destination_path)

# Step 2: untar it into data_destination_folder
# with tarfile.open(data_destination_path, "r:gz") as fp:
#    fp.extractall(path=data_destination_folder)

# Collect the name (not the full path) of every h5ad file found in data_destination_folder
fname_list = [
    entry
    for entry in os.listdir(data_destination_folder)
    if entry.endswith('.h5ad')
]
print(fname_list)

['anndata_sick3.h5ad', 'anndata_sick1.h5ad', 'anndata_sick2.h5ad', 'anndata_wt2.h5ad', 'anndata_wt1.h5ad', 'anndata_wt3.h5ad']


### copy the data into a new folder 

In [None]:
# Folder name for a copy of the data.
# NOTE(review): not referenced by any other cell visible in this notebook; the
# feature-extraction cell writes its results back into data_destination_folder -- confirm intent.
new_data_folder = "./testis_anndata_annotated"


### Download all the checkpoint files

In [4]:
bucket_name = "ld-data-bucket"

# Checkpoint file names and the matching model names, listed in the same order.
all_ckpts = ["ckpt_barlow.pt", "ckpt_simclr.pt", "ckpt_dino.pt", "ckpt_vae.pt"]
all_models = ["barlow", "simclr", "dino", "vae"]

# Local path of every checkpoint file (current working directory).
all_ckpts_dest = [os.path.join("./", ckpt_name) for ckpt_name in all_ckpts]

# Uncomment to fetch the checkpoints from the "tissue-purifier" folder of the bucket:
# for ckpt_name, ckpt_dest in zip(all_ckpts, all_ckpts_dest):
#     ckpt_source = os.path.join("tissue-purifier", ckpt_name)
#     tp.io.download_from_bucket(bucket_name, ckpt_source, ckpt_dest)

print(all_ckpts_dest)

['./ckpt_barlow.pt', './ckpt_simclr.pt', './ckpt_dino.pt', './ckpt_vae.pt']


### Extract features with all the models (Barlow, Simclr, Dino, Vae)

In [5]:
from tissue_purifier.data import AnndataFolderDM
# Explicit names instead of `import *`: it is clear where Barlow, Simclr, Dino and Vae
# come from and nothing else leaks into the notebook namespace.
from tissue_purifier.models.ssl_models import Barlow, Dino, Simclr, Vae

n_patches_max = 1000 # cover each tissue with this many overlapping patches
ncv_k_values = (10, 20, 50, 100, 200, 500)  # values of k passed to compute_ncv

# model name -> class used to restore the corresponding pretrained checkpoint
model_classes = {"barlow": Barlow, "simclr": Simclr, "dino": Dino, "vae": Vae}

# zip() silently truncates to the shortest list, so make a mismatch loud
assert len(all_ckpts_dest) == len(all_models), "one checkpoint per model is expected"

for ckpt_path, model_name in zip(all_ckpts_dest, all_models):

    print("----------")
    print("Model --->", model_name, ckpt_path)
    print("----------")

    if model_name not in model_classes:
        raise ValueError("Model name not recognized {}".format(model_name))
    model = model_classes[model_name].load_from_checkpoint(checkpoint_path=ckpt_path, strict=False)

    # create the datamodule associated with the pretrained model
    # NOTE(review): this reads the private `_hparams` attribute of the model -- confirm
    # that it holds exactly the keyword arguments expected by AnndataFolderDM.
    dm = AnndataFolderDM(**model._hparams)

    # put the model on GPU if available
    if torch.cuda.is_available():
        model = model.cuda()

    # process all the anndata with the model, datamodule pair
    for fname in fname_list:
        # The file is read from, and later written back to, the same path. Therefore
        # each model starts from the file as annotated by the models processed before it.
        anndata = read_h5ad(os.path.join(data_destination_folder, fname))
        sp_img = dm.anndata_to_sparseimage(anndata)

        # add some ncv as well (only during the barlow pass, i.e. once per file)
        if model_name == "barlow":
            for k in ncv_k_values:
                sp_img.compute_ncv(feature_name="ncv_k{}".format(k), k=k)

        # put sparse image on GPU if available
        if torch.cuda.is_available():
            sp_img = sp_img.cuda()

        # compute the patch-feature; it is stored under the name of the model
        sp_img.compute_patch_features(
            feature_name=model_name,
            datamodule=dm,
            model=model,
            batch_size=64,
            n_patches_max=n_patches_max,
            overwrite=True)

        # transfer the feature stored under model_name from the patches to the spots
        sp_img.transfer_patch_to_spot(keys_to_transfer=model_name, overwrite=True)

        new_adata = sp_img.to_anndata()
        new_adata.write(filename=os.path.join(data_destination_folder, fname)) # overwrite the file but with extra annotations

Model ---> barlow ./ckpt_barlow.pt
number of elements ---> 33441
mean and median spacing 15.479421424523398, 15.47439133436206
The dense shape of the image is -> torch.Size([9, 1164, 1165])
Occupacy (zero, single, double, ...) of voxels in 3D sparse array -> [    0 33433     4]
Occupacy (zero, single, double, ...) of voxels  in 2D sparse array (summed over category) -> [    0 33423     9]
number of elements ---> 27194
mean and median spacing 16.12433160571037, 15.591954248205585
The dense shape of the image is -> torch.Size([9, 1176, 1180])
Occupacy (zero, single, double, ...) of voxels in 3D sparse array -> [    0 27190     2]
Occupacy (zero, single, double, ...) of voxels  in 2D sparse array (summed over category) -> [    0 27174    10]
number of elements ---> 42776
mean and median spacing 15.424869146306138, 15.42998109176031
The dense shape of the image is -> torch.Size([9, 1180, 1180])
Occupacy (zero, single, double, ...) of voxels in 3D sparse array -> [    0 42772     2]
Occupac

### Check the result

In [6]:
# Re-open every file from data_destination_folder and print its name followed by its summary.
for fname in fname_list:
    h5ad_path = os.path.join(data_destination_folder, fname)
    anndata = read_h5ad(h5ad_path)
    print(fname, anndata, sep="\n")

anndata_sick3.h5ad
AnnData object with n_obs × n_vars = 33441 × 23514
    obs: 'x', 'y', 'cell_type'
anndata_sick1.h5ad
AnnData object with n_obs × n_vars = 27194 × 24420
    obs: 'x', 'y', 'cell_type'
anndata_sick2.h5ad
AnnData object with n_obs × n_vars = 42776 × 24263
    obs: 'x', 'y', 'cell_type'
anndata_wt2.h5ad
AnnData object with n_obs × n_vars = 33059 × 23741
    obs: 'x', 'y', 'cell_type'
anndata_wt1.h5ad
AnnData object with n_obs × n_vars = 31659 × 24450
    obs: 'x', 'y', 'cell_type'
anndata_wt3.h5ad
AnnData object with n_obs × n_vars = 39206 × 23705
    obs: 'x', 'y', 'cell_type'
