In [2]:
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
from scipy.io import mmread
import requests
import os
from tqdm import trange

In [2]:
def download_file(doi,ext):
    url = 'https://api.datacite.org/dois/'+doi+'/media'
    r = requests.get(url).json()
    netcdf_url = r['data'][0]['attributes']['url']
    r = requests.get(netcdf_url,stream=True)
    #Set file name
    fname = doi.split('/')[-1]+ext
    #Download file with progress bar
    if r.status_code == 403:
        print("File Unavailable")
    if 'content-length' not in r.headers:
        print("Did not get file")
    else:
        with open(fname, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            pbar = trange(int(total_length/1024), unit="B")
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update()
                    f.write(chunk)
        return fname

### Prepare Ex Utero data

In [3]:
#Download
!wget -O ../data/raw/GSE149372_scRNA.MetaData.85.csv.gz https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149372/suppl/GSE149372_scRNA.MetaData.85.csv.gz
!wget -O ../data/raw/GSE149372_scRNA.normalized.assay85.csv.gz https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149372/suppl/GSE149372_scRNA.normalized.assay85.csv.gz
!gunzip -f -k ../data/raw/*.gz

--2024-02-22 18:44:15--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149372/suppl/GSE149372_scRNA.MetaData.85.csv.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::7, 2607:f220:41e:250::10, 130.14.250.12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 224984 (220K) [application/x-gzip]
Saving to: ‘../data/raw/GSE149372_scRNA.MetaData.85.csv.gz’


2024-02-22 18:44:16 (449 KB/s) - ‘../data/raw/GSE149372_scRNA.MetaData.85.csv.gz’ saved [224984/224984]

--2024-02-22 18:44:16--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149372/suppl/GSE149372_scRNA.normalized.assay85.csv.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::10, 2607:f220:41e:250::7, 130.14.250.12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::10|:443... connected.
HTTP request sent, awaiting response... 200 O

In [4]:
# Load full data
counts_e85 = pd.read_csv('../data/raw/GSE149372_scRNA.normalized.assay85.csv',index_col=0)
cellMeta_e85 = pd.read_csv('../data/raw/GSE149372_scRNA.MetaData.85.csv')
counts_e85.shape

(19588, 10290)

In [5]:
#process metadata for all E8.5 data
count_mat_e85 = counts_e85.values.T
adata_e85 = anndata.AnnData(X = count_mat_e85)
adata_e85.obs_names = list(counts_e85.columns)
adata_e85.var_names = list(counts_e85.index)
adata_e85.obs['Utero'] = cellMeta_e85.Utero.values
adata_e85.obs['Cluster'] = cellMeta_e85.seurat_clusters.values

#subset to only ex-utero data
adata_exut = adata_e85[adata_e85.obs['Utero']=='ExUt']
clusterlabels_exut = adata_exut.obs['Cluster'].values
adata_exut.shape

(6205, 19588)

In [6]:
#preprocess as Chari & Pachter 
sc.pp.filter_cells(adata_exut, min_counts=0)
sc.pp.filter_genes(adata_exut, min_counts=0)
sc.pp.highly_variable_genes(adata_exut,n_top_genes=2000)
adata_exut = adata_exut[:,adata_exut.var['highly_variable']]

  adata.obs['n_counts'] = number


In [7]:
#celltype infos and colors as Chari&Pachter
celltype_dict = {'0' : 'Blood', '11': 'Blood' ,
                 '1' : 'Somitic Mesoderm',
                 '2' : 'Foregut Mid-hindgut',
                 '3' : 'Cardiac',
                 '4' : 'Mid-hindbrain',
                 '5' : 'Endothelial',
                 '6' : 'Presomitic/Mixed Mesoderm',
                 '7' : 'Amnion', '15' : 'Amnion', '17' : 'Amnion',
                 '8' : 'Extra-Embryonic Endoderm',
                 '9' : 'Pharyngeal Mesoderm',
                 '10': 'Extra-Embryonic Ectoderm',
                 '12': 'Placodes',
                 '13': 'Neural Tube', '16' : 'Neural Tube',
                 '14': 'Extra-Embryonic Mesoderm',
                 '18': 'Endothelial'}

color_dict = {'Blood':'#70D4CE','Presomitic/Mixed Mesoderm':'#81A483','Foregut Mid-hindgut':'#4D50E3',
        'Cardiac':'#E0B7CF','Mid-hindbrain':'#FC514A','Endothelial':'#EDE7D1',
        'Amnion':'#B4CA72',
        'Extra-Embryonic Endoderm':'#C7A2F3','Pharyngeal Mesoderm':'#CCD0BC','Extra-Embryonic Ectoderm':'#73F0FD',
        'Placodes':'#AE995B','Neural Tube':'#87BD7E','Somitic Mesoderm':'#D34FEA',
        'Extra-Embryonic Mesoderm':'#0F5813'}

colors_exut = [color_dict[celltype_dict[str(cl)]] for cl in clusterlabels_exut]
colors_new = np.array(list(colors_exut.copy()),dtype='<U15')

#add extra colors for clusters that get the same colors in the original Chari&Pachter palette
colors_new[clusterlabels_exut==11]='tab:red'
colors_new[clusterlabels_exut==15]='tab:orange'
colors_new[clusterlabels_exut==16]='tab:blue'
colors_new[clusterlabels_exut==17]='tab:green'

In [8]:
#drop unneeded fields
adata_exut_clean = anndata.AnnData(X=adata_exut.X,
                                   obs=dict(clusterlabels=clusterlabels_exut,
                                            clusternames=[celltype_dict[str(x)] for x in clusterlabels_exut],
                                            colors=colors_new),
                                   uns=dict(dataset='exut'))
#Chari&Pachter scaling
adata_exut_clean.layers['X_scaled'] = sc.pp.scale(adata_exut_clean.X.copy())
adata_exut_clean.shape

(6205, 2000)

In [9]:
#save
adata_exut_clean.write_h5ad('../data/preprocessed/exut_adata.h5ad')

In [10]:
adata_exut_clean

AnnData object with n_obs × n_vars = 6205 × 2000
    obs: 'clusterlabels', 'clusternames', 'colors'
    uns: 'dataset'
    layers: 'X_scaled'

In [11]:
adata_exut_clean.obs

Unnamed: 0,clusterlabels,clusternames,colors
0,1,Somitic Mesoderm,#D34FEA
1,0,Blood,#70D4CE
2,13,Neural Tube,#87BD7E
3,1,Somitic Mesoderm,#D34FEA
4,9,Pharyngeal Mesoderm,#CCD0BC
...,...,...,...
6200,0,Blood,#70D4CE
6201,1,Somitic Mesoderm,#D34FEA
6202,0,Blood,#70D4CE
6203,6,Presomitic/Mixed Mesoderm,#81A483


### Prepare MERFISH data

In [12]:
#download and unpack
#metadata.csv
download_file('10.22002/D1.2063','.gz')
#counts.h5ad
download_file('10.22002/D1.2064','.gz')

os.system("gunzip D1.2063.gz")
os.system("gunzip D1.2064.gz")
os.system("mv D1.2063 ../data/raw/merfish_metadata.csv")
os.system("mv D1.2064 ../data/raw/merfish_counts.h5ad")

9785B [00:06, 1542.88B/s]                                                                                                                           
59361B [00:14, 4070.44B/s]                                                                                                                          


0

In [13]:
#load full MERFISH data
counts_merfish = anndata.read_h5ad('../data/raw/merfish_counts.h5ad')
cellMeta_merfish = pd.read_csv('../data/raw/merfish_metadata.csv')
counts_merfish.shape

(280327, 254)

In [14]:
#process clusternames and colors
counts_merfish.obs['slice'] = pd.Categorical(cellMeta_merfish.slice_id)
counts_merfish.obs['clusternames'] = pd.Categorical(cellMeta_merfish.subclass)
_,clusterlabels_merfish = np.unique(counts_merfish.obs['clusternames'],return_inverse=True)
counts_merfish.obs['clusterlabels'] = clusterlabels_merfish

cdict = {'Microglia':'#A778BC',
         'L6_IT':'#EA95C1',
         'Astrocytes':'#B6657E','Oligodendrocytes':'#B49599',
         'L23_IT':'#628EA2','OPC':'#72B9A5','L45_IT':'#71B7A2','Endothelial':'#7794C3','Pvalb':'#FD8085',
         'L6_IT_Car3':'#FAD1FC','L56_NP':'#93B09B','PVM':'#ECBDD2','L5_IT':'#ABA8DF',
         'L5_PT':'#D36486','Sst':'#8FE0B1','L6_CT':'#9589C4','Vip':'#CD83AC','VLMC':'#BBDBF2',
         'SMC':'#7A849E','other':'#A382BC','L6b':'#64D7F0','Pericytes':'#95BA98',
         'Lamp5':'#A3E169','Sncg':'#92B1DB','Sst_Chodl':'#81FAC9'}
counts_merfish.obs['colors'] = [cdict[i] for i in counts_merfish.obs['clusternames']]


#subset MERFISH data to same slide as Chari & Pachter
slice_choice = np.unique(cellMeta_merfish.slice_id)[7]

subset_merfish = counts_merfish[counts_merfish.obs['slice'].isin([slice_choice])]
subset_merfish.shape

(6963, 254)

In [15]:
#preprocess as Chari&Pachter
sc.pp.log1p(subset_merfish)
subset_merfish.uns['dataset'] = 'merfish'
#Chari&Pachter scaling
subset_merfish.layers['X_scaled'] = sc.pp.scale(subset_merfish.X.copy())

  view_to_actual(adata)


In [16]:
#save
subset_merfish.write_h5ad('../data/preprocessed/merfish_adata.h5ad')

In [17]:
subset_merfish

AnnData object with n_obs × n_vars = 6963 × 254
    obs: 'slice', 'clusternames', 'clusterlabels', 'colors'
    uns: 'log1p', 'dataset'
    layers: 'X_scaled'

In [18]:
subset_merfish.obs

Unnamed: 0_level_0,slice,clusternames,clusterlabels,colors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100028810813183819239792138762506953059,mouse1_slice162,Microglia,12,#A778BC
100042292675978612289781789372249900646,mouse1_slice162,L6_IT,8,#EA95C1
100093902545314506370058151058375627035,mouse1_slice162,Astrocytes,0,#B6657E
100140206101682663983686904841717501025,mouse1_slice162,Oligodendrocytes,14,#B49599
100176313983591085952263714025726530686,mouse1_slice162,L23_IT,2,#628EA2
...,...,...,...,...
99658397536625795119891455351124847928,mouse1_slice162,L6_IT_Car3,9,#FAD1FC
99716035693339658241621650353650240200,mouse1_slice162,Astrocytes,0,#B6657E
99757070583384140112761609672322877368,mouse1_slice162,L23_IT,2,#628EA2
99840731649735034456729605869484432530,mouse1_slice162,L45_IT,3,#71B7A2


### Prepare SmartSeq

In [19]:
# Download and unpack
#metadata.csv
download_file('10.22002/D1.2067','.gz')
#smartseq.mtx (log counts)
download_file('10.22002/D1.2071','.gz')

os.system("gunzip D1.2067.gz")
os.system("gunzip D1.2071.gz")

os.system("mv D1.2067 ../data/raw/smartseq_metadata.csv")
os.system("mv D1.2071 ../data/raw/smartseq.mtx")

65B [00:00, 169.67B/s]                                                                                                                              
26701B [00:06, 4231.22B/s]                                                                                                                          


0

In [20]:
#load data
counts_smartseq = mmread('../data/raw/smartseq.mtx')
meta_smartseq = pd.read_csv('../data/raw/smartseq_metadata.csv')
counts_smartseq.shape

(3850, 1999)

In [21]:
#add metadata
adata_smartseq = anndata.AnnData(X=counts_smartseq)
adata_smartseq.obs['clusternames'] = [s for s in meta_smartseq.smartseq_cluster]
_,smartseq_clusterlabels = np.unique(meta_smartseq.smartseq_cluster,return_inverse=True)
adata_smartseq.obs['clusterlabels'] = smartseq_clusterlabels
adata_smartseq.obs['colors'] = [c for c in meta_smartseq.smartseq_cluster_color]
adata_smartseq.uns['dataset']='smartseq'

In [22]:
#Chari&Pachter scaling
adata_smartseq.layers['X_scaled'] = sc.pp.scale(adata_smartseq.X.copy())

In [23]:
adata_smartseq.write_h5ad('../data/preprocessed/smartseq_adata.h5ad')

In [24]:
adata_smartseq

AnnData object with n_obs × n_vars = 3850 × 1999
    obs: 'clusternames', 'clusterlabels', 'colors'
    uns: 'dataset'
    layers: 'X_scaled'

In [25]:
adata_smartseq.obs

Unnamed: 0,clusternames,clusterlabels,colors
0,Nr5a1_9|11 Rorb,18,#6C39A6
1,Nr5a1_9|11 Rorb,18,#6C39A6
2,Nr5a1_9|11 Rorb,18,#6C39A6
3,Nr5a1_4|7 Glipr1,16,#763BFF
4,Nr5a1_4|7 Glipr1,16,#763BFF
...,...,...,...
3845,Nr5a1_Foxp2_1 Prdm13,19,#D92AAB
3846,Nr5a1_Foxp2_1 Prdm13,19,#D92AAB
3847,Nr5a1_Foxp2_1 Prdm13,19,#D92AAB
3848,Nr5a1_Foxp2_1 Prdm13,19,#D92AAB


### Prepare MNIST data

In [26]:
import sklearn.datasets
#download
x_mnist,y_mnist = sklearn.datasets.fetch_openml('mnist_784',version=1,return_X_y=True)

#adjust datatypes
y_mnist = y_mnist.values.astype(int)
x_mnist = x_mnist.values

#metadata
colors_tab10 = np.array(['tab:blue','tab:orange','tab:green','tab:red','tab:purple','tab:brown','tab:pink','tab:gray','tab:olive','tab:cyan'])
colors_mnist = colors_tab10[y_mnist]

#make adata and save
adata_mnist = anndata.AnnData(X=x_mnist,
                               obs=dict(clusterlabels=y_mnist,
                                        clusternames=[str(y) for y in y_mnist],
                                        colors=colors_mnist),
                               uns=dict(dataset='mnist'))
adata_mnist.layers['X_scaled'] = sc.pp.scale(adata_mnist.X.copy())
adata_mnist.write_h5ad('../data/preprocessed/mnist_adata.h5ad')

  warn(


In [27]:
adata_mnist

AnnData object with n_obs × n_vars = 70000 × 784
    obs: 'clusterlabels', 'clusternames', 'colors'
    uns: 'dataset'
    layers: 'X_scaled'

In [28]:
adata_mnist.obs

Unnamed: 0,clusterlabels,clusternames,colors
0,5,5,tab:brown
1,0,0,tab:blue
2,4,4,tab:purple
3,1,1,tab:orange
4,9,9,tab:cyan
...,...,...,...
69995,2,2,tab:green
69996,3,3,tab:red
69997,4,4,tab:purple
69998,5,5,tab:brown


In [29]:
adata_smartseq

AnnData object with n_obs × n_vars = 3850 × 1999
    obs: 'clusternames', 'clusterlabels', 'colors'
    uns: 'dataset'
    layers: 'X_scaled'

### Copyright notice

This notebook uses code adapted from https://github.com/pachterlab/CP_2023, which is subject to the following licence:

```
BSD 2-Clause License

Copyright (c) 2021, Pachter Lab
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

### Package versions

In [3]:
np.__version__

'1.24.3'

In [4]:
pd.__version__

'2.0.3'

In [5]:
anndata.__version__

'0.10.3'

In [6]:
sc.__version__

'1.9.6'

In [7]:
import scipy; scipy.__version__

'1.11.1'

In [8]:
requests.__version__

'2.31.0'

In [10]:
import tqdm; tqdm.__version__

'4.66.2'