In [1]:
import pandas as pd 
import numpy as np
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix

from pathlib import Path

In [23]:
data_directory = Path("/work/magroup/ehaber/SCP1375/preprocessed_data_new")

In [3]:
raw_expr_df = pd.read_csv(data_directory / 'expression'/ 'expression_matrix_raw.csv')

In [4]:
raw_expr_df

Unnamed: 0,GENE,1,3,4,5,6,7,8,9,10,...,76834,76836,76837,76838,76839,76840,76842,76843,76845,76846
0,A2m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aagab,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aak1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Abca2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Abca7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2761,Zhx1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2762,Zic1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2763,Zim1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2764,Zmym1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [5]:
#spatial coordinates of different mice replicates (control vs AD mice)
spatial_8mon_contr_repl1_df = pd.read_csv(data_directory / 'cluster' / 'spatial_8months-control-replicate_1.csv', header=0, skiprows=lambda x: x == 1)
spatial_8mon_contr_repl2_df = pd.read_csv(data_directory / 'cluster' / 'spatial_8months-control-replicate_2.csv', header=0, skiprows=lambda x: x == 1)
spatial_8mon_dis_repl1_df = pd.read_csv(data_directory / 'cluster' / 'spatial_8months-disease-replicate_1.csv', header=0, skiprows=lambda x: x == 1)
spatial_8mon_dis_repl2_df = pd.read_csv(data_directory / 'cluster' / 'spatial_8months-disease-replicate_2.csv', header=0, skiprows=lambda x: x == 1)

spatial_13mon_contr_repl1_df = pd.read_csv(data_directory / 'cluster' / 'spatial_13months-control-replicate_1.csv', header=0, skiprows=lambda x: x == 1)
spatial_13mon_contr_repl2_df = pd.read_csv(data_directory / 'cluster' / 'spatial_13months-control-replicate_2.csv', header=0, skiprows=lambda x: x == 1)
spatial_13mon_dis_repl1_df = pd.read_csv(data_directory / 'cluster' / 'spatial_13months-disease-replicate_1.csv', header=0, skiprows=lambda x: x == 1)
spatial_13mon_dis_repl2_df = pd.read_csv(data_directory / 'cluster' / 'spatial_13months-disease-replicate_2.csv', header=0, skiprows=lambda x: x == 1)

In [6]:
spatial_info = [
    spatial_8mon_dis_repl1_df,
    spatial_8mon_dis_repl2_df,
    spatial_8mon_contr_repl1_df,
    spatial_8mon_contr_repl2_df,
    spatial_13mon_dis_repl1_df,
    spatial_13mon_dis_repl2_df,
    spatial_13mon_contr_repl1_df,
    spatial_13mon_contr_repl2_df
]

In [7]:
#Create Anndata for each replicate, normalize gene expression, find most highly variable genes, index anndata
def preprocess_dataset(spatial_metadata, raw_dataframe):
    spatial_metadata['X'] = spatial_metadata['X'].astype('int')
    spatial_metadata['Y'] = spatial_metadata['Y'].astype('int')
    idx = spatial_metadata['NAME'].astype(str)
    
    raw_values = raw_dataframe.loc[:, idx]
    
    dataset = ad.AnnData(csr_matrix(raw_values.T.values), dtype=np.float64)
    dataset.obs_names = [f"Cell_{i:d}" for i in range(dataset.n_obs)]
    dataset.var_names = [raw_expr_df['GENE'].iloc[i] for i in range(dataset.n_vars)]
    
    sc.pp.normalize_total(dataset, inplace=True)
    sc.pp.log1p(dataset)
    sc.pp.highly_variable_genes(dataset, n_top_genes=500)
    dataset = dataset[:, dataset.var.highly_variable]
    
    dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
    
    return dataset

In [8]:
datasets = [preprocess_dataset(spatial_metadata, raw_expr_df) for spatial_metadata in spatial_info]

  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values
  dataset.obsm['spatial'] = spatial_metadata[['X', 'Y']].values


In [9]:
dataset_names = [
    '8mon_dis_repl1',
    '8mon_dis_repl2',
    '8mon_contr_repl1',
    '8mon_contr_repl2',
    '13mon_dis_repl1',
    '13mon_dis_repl2',
    '13mon_contr_repl1',
    '13mon_contr_repl2'
]

In [10]:
from popari.components import PopariDataset
from popari.io import save_anndata, load_anndata

In [19]:
Path.joinpath(data_directory, preprocessed_data_new, dataset_names[0]+'.h5ad')

NameError: name 'preprocessed_data_new' is not defined

In [28]:
replicate_names

['8mon_dis_repl1', '8mon_dis_repl2', '13mon_dis_repl1', '13mon_dis_repl2']

In [27]:
#AD datasets
datasets_AD = []
replicate_names_AD = []
for fov in [0,1,4,5]:
    path = Path.joinpath(data_directory, dataset_names[fov]+'.h5ad')
    dataset = ad.read_h5ad(path) # Each dataset must have spatial information stored as an adjacency matrix
    name = f"{dataset_names[fov]}"
    datasets_AD.append(dataset)x
    replicate_names_AD.append(name)

In [30]:
#control datasets
datasets_contr = []
replicate_names_contr = []
for fov in [2,3,6,7]:
    path = Path.joinpath(data_directory, dataset_names[fov]+'.h5ad')
    dataset = ad.read_h5ad(path) # Each dataset must have spatial information stored as an adjacency matrix
    name = f"{dataset_names[fov]}"
    datasets_contr.append(dataset)
    replicate_names_contr.append(name)

In [None]:
from popari.components import PopariDataset
from popari.io import save_anndata

(data_directory / "preprocessed_data_new").mkdir(exist_ok=True)
resulting_datasets = []
for dataset, name in zip(datasets_AD, replicate_names_AD): # Use only the datasets that you want for a single run
    wrapped_dataset = PopariDataset(dataset, name)
    wrapped_dataset.compute_spatial_neighbors()
    filename = f'{name}.h5ad'
    resulting_dataset = save_anndata(data_directory / "preprocessed_data_new" / f"{filename}", [wrapped_dataset], ignore_raw_data=False)
    resulting_datasets.append(resulting_dataset)

In [25]:
from popari.components import PopariDataset
from popari.io import save_anndata

(data_directory / "preprocessed_data_new").mkdir(exist_ok=True)
resulting_datasets = []
for dataset, name in zip(datasets, dataset_names): # Use only the datasets that you want for a single run
    wrapped_dataset = PopariDataset(dataset, name)
    wrapped_dataset.compute_spatial_neighbors()
    filename = f'{name}.h5ad'
    resulting_dataset = save_anndata(data_directory / "preprocessed_data_new" / f"{filename}", [wrapped_dataset], ignore_raw_data=False)
    resulting_datasets.append(resulting_dataset)

In [26]:
loaded_datasets, replicate_names = load_anndata(data_directory / "preprocessed_data_new" / f"8mon_contr_repl1.h5ad")


These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(


In [33]:
loaded_datasets

[AnnData object with n_obs × n_vars = 8506 × 500
     obs: 'batch', 'adjacency_list'
     var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
     uns: 'adjacency_matrix', 'dataset_name', 'hvg', 'log1p', 'spatial_neighbors'
     obsm: 'spatial'
     obsp: 'adjacency_matrix']

In [32]:
len(loaded_datasets)

1