# Data extraction

In this notebook we extract the raw or processed data from every dataset so that it can be analyzed later in separate notebooks.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import mygene
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from tqdm import tqdm
import triku as tk
# Do not limit the number of columns shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
from cellassign import assign_cats
from fb_functions import metadata_assignment

In [None]:
# Colormap for UMAP gene-expression plots: the first 65 of 80 evenly spaced
# samples of 'magma', with the lowest entry replaced by light grey.

magma = [(0.88, 0.88, 0.88, 1), *map(plt.get_cmap('magma'), np.linspace(0, 1, 80)[1:65])]
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma)

In [None]:
# Root folder for all downloaded datasets, relative to the current working directory.
data_dir = f"{os.getcwd()}/data/"

In [None]:
# Machine-specific locations. `mouse_gencode_dir` is the index directory handed to
# `loompy fromfq` in the cells below. The defaults are the original workstation
# paths; set MOUSE_GENCODE_DIR / PROGRAM_DIR to run the notebook on another machine.
mouse_gencode_dir = os.environ.get("MOUSE_GENCODE_DIR", "/media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31")
program_dir = os.environ.get("PROGRAM_DIR", "/media/seth/SETH_DATA/SETH_Alex/Programs/")

## Abbasi et al. 2020

In [None]:
# Folder holding the Abbasi et al. 2020 files; created when it does not exist yet.
abbasi_2020_dir = f"{data_dir}/abassi_2020"
os.makedirs(abbasi_2020_dir, exist_ok=True)

In [None]:
# Download the matrix, barcodes and genes files of GSM2910020 into a per-sample
# folder, saving them as matrix.mtx.gz / barcodes.tsv.gz / features.tsv.gz.
os.makedirs(f"{abbasi_2020_dir}/GSM2910020", exist_ok=True)
!cd {abbasi_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2910nnn/GSM2910020/suppl/GSM2910020_sample_4_matrix.mtx.gz -O GSM2910020/matrix.mtx.gz
!cd {abbasi_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2910nnn/GSM2910020/suppl/GSM2910020_sample_4_barcodes.tsv.gz -O GSM2910020/barcodes.tsv.gz
!cd {abbasi_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2910nnn/GSM2910020/suppl/GSM2910020_sample_4_genes.tsv.gz -O GSM2910020/features.tsv.gz

In [None]:
# Read the count matrix (transposed so barcodes index obs and genes index var),
# then name genes from column 1 of features.tsv and cells from column 0 of barcodes.tsv.
adata_abassi_2020 = sc.read_mtx(abbasi_2020_dir + "/GSM2910020/matrix.mtx.gz").T
adata_abassi_2020.var_names = pd.read_table(abbasi_2020_dir + "/GSM2910020/features.tsv.gz", header=None)[1].to_numpy()
adata_abassi_2020.obs_names = pd.read_table(abbasi_2020_dir + "/GSM2910020/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_abassi_2020.var_names_make_unique()
metadata_assignment(adata_abassi_2020, 'Abbasi', 2020, 'Ctrl')

In [None]:
# Save the annotated control sample.
adata_abassi_2020.write_h5ad(abbasi_2020_dir + "/abassi_2020_ctrl.h5")

## Buechler et al. 2021

In [None]:
# Folder holding the Buechler et al. 2021 files; created when it does not exist yet.
buechler_2021_dir = f"{data_dir}/buechler_2021"
os.makedirs(buechler_2021_dir, exist_ok=True)

In [None]:
# Download the R1/R2 FASTQ files of lanes L001 and L002 (E-MTAB-10315) into the dataset folder.
!cd {buechler_2021_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L001_R2_001.fastq.gz
!cd {buechler_2021_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L001_R1_001.fastq.gz
    
!cd {buechler_2021_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L002_R1_001.fastq.gz
!cd {buechler_2021_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L002_R2_001.fastq.gz

In [None]:
# Sample sheet (metadata.tab) passed to `loompy fromfq` in the next cell: one row
# with the sample name, the chemistry and the target number of cells.
df = pd.DataFrame({'name': ['buechler_2021'], 'technology': ['10xv3'], 'targetnumcells': [5000]})
# Fixed: the folder variable is `buechler_2021_dir`; `buechler_dir` was never defined.
df.to_csv(buechler_2021_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# Run `loompy fromfq` inside the dataset folder to build buechler_2021.loom from
# the four FASTQ files, using the index at `mouse_gencode_dir` and metadata.tab.
!cd {buechler_2021_dir} && loompy fromfq buechler_2021.loom buechler_2021 {mouse_gencode_dir} metadata.tab \
LIB5436740_SAM24390211_S4_L001_R1_001.fastq.gz LIB5436740_SAM24390211_S4_L001_R2_001.fastq.gz \
LIB5436740_SAM24390211_S4_L002_R1_001.fastq.gz LIB5436740_SAM24390211_S4_L002_R2_001.fastq.gz 

In [None]:
# Load the loom file produced by `loompy fromfq`, attach the dataset metadata and save it.
buechler_2021 = sc.read(f"{buechler_2021_dir}/buechler_2021.loom")
# Make gene names unique, as is done for every other dataset in this notebook.
buechler_2021.var_names_make_unique()
# Fixed: pass the AnnData object; the original passed the directory path string.
metadata_assignment(buechler_2021, 'Buechler', 2021, '0')
buechler_2021.write_h5ad(f"{buechler_2021_dir}/buechler_2021_ctrl.h5")

## Boothby et al. 2021 

In [None]:
# Folder holding the Boothby et al. 2021 files; created when it does not exist yet.
boothby_2021_dir = f"{data_dir}/boothby_2021"
os.makedirs(boothby_2021_dir, exist_ok=True)

In [None]:
# Download the filtered gene-barcode HDF5 matrices of the PBS (GSM5549901) and DT (GSM5549902) samples.
!cd {boothby_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5549nnn/GSM5549901/suppl/GSM5549901%5FMs%5FPBS%5Ffiltered%5Fgene%5Fbc%5Fmatrices%5Fh5%2Eh5
!cd {boothby_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5549nnn/GSM5549902/suppl/GSM5549902%5FMs%5FDT%5Ffiltered%5Fgene%5Fbc%5Fmatrices%5Fh5%2Eh5

In [None]:
# PBS sample: read the 10x HDF5 matrix, make gene names unique, attach metadata.
boothby_2021_PBS = sc.read_10x_h5(f"{boothby_2021_dir}/GSM5549901_Ms_PBS_filtered_gene_bc_matrices_h5.h5")
boothby_2021_PBS.var_names_make_unique()
metadata_assignment(boothby_2021_PBS, 'Boothby', 2021, 'PBS')

# DT sample: same steps.
boothby_2021_DT = sc.read_10x_h5(f"{boothby_2021_dir}/GSM5549902_Ms_DT_filtered_gene_bc_matrices_h5.h5")
boothby_2021_DT.var_names_make_unique()
metadata_assignment(boothby_2021_DT, 'Boothby', 2021, 'DT')

In [None]:
# Save both samples; the PBS sample is stored as the control file.
boothby_2021_PBS.write_h5ad(f"{boothby_2021_dir}/boothby_2021_ctrl.h5")

boothby_2021_DT.write_h5ad(f"{boothby_2021_dir}/boothby_2021_DT.h5")

## Efremova, Mirjana (Panglao DB) 2018

In [None]:
# Folder holding the Efremova 2018 files; created when it does not exist yet.
efremova_2018_dir = f"{data_dir}/efremova_2018"
os.makedirs(efremova_2018_dir, exist_ok=True)

In [None]:
# Download the R1/R2 FASTQ pairs of the two E-MTAB-7417 libraries (…9028 and …9029).
!cd {efremova_2018_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639028_S1_L001_R1_001.fastq.gz
!cd {efremova_2018_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639028_S1_L001_R2_001.fastq.gz
    
!cd {efremova_2018_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639029_S1_L001_R1_001.fastq.gz
!cd {efremova_2018_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639029_S1_L001_R2_001.fastq.gz

In [None]:
# Sample sheet (metadata.tab) with one row per library, used by `loompy fromfq`.
df = pd.DataFrame({
    'name': ['efremova_2018_S1', 'efremova_2018_S2'],
    'technology': ['10xv2', '10xv2'],
    'targetnumcells': [5000, 5000],
})
df.to_csv(f"{efremova_2018_dir}/metadata.tab", sep='\t', index=None)

In [None]:
# Build efremova_2018_S1.loom from the …9028 FASTQ pair with `loompy fromfq`.
!cd {efremova_2018_dir} && loompy fromfq efremova_2018_S1.loom efremova_2018_S1 {mouse_gencode_dir} metadata.tab \
3421STDY7639028_S1_L001_R1_001.fastq.gz 3421STDY7639028_S1_L001_R2_001.fastq.gz

In [None]:
# Build efremova_2018_S2.loom from the …9029 FASTQ pair with `loompy fromfq`.
!cd {efremova_2018_dir} && loompy fromfq efremova_2018_S2.loom efremova_2018_S2 {mouse_gencode_dir} metadata.tab \
3421STDY7639029_S1_L001_R1_001.fastq.gz 3421STDY7639029_S1_L001_R2_001.fastq.gz

In [None]:
# Load both loom files, make gene names unique and attach per-sample metadata
# (sample identifiers '0' and '1').
adata_efremova_2018_S1 = sc.read(f"{efremova_2018_dir}/efremova_2018_S1.loom")
adata_efremova_2018_S1.var_names_make_unique()
metadata_assignment(adata_efremova_2018_S1, 'Efremova', 2018, '0')

adata_efremova_2018_S2 = sc.read(f"{efremova_2018_dir}/efremova_2018_S2.loom")
adata_efremova_2018_S2.var_names_make_unique()
# Fixed: sample '1' metadata belongs to S2; the original call targeted S1 again,
# overwriting its label and leaving S2 without metadata.
metadata_assignment(adata_efremova_2018_S2, 'Efremova', 2018, '1')

# Merge the two samples, recording the source sample in 'Internal sample identifier', and save.
adata_efremova_2018 = sc.AnnData.concatenate(adata_efremova_2018_S1, adata_efremova_2018_S2, batch_key='Internal sample identifier')
adata_efremova_2018.write_h5ad(f"{efremova_2018_dir}/efremova_2018_ctrl.h5")

## Haensel et al. 2021

In [None]:
# Folder holding the Haensel et al. 2021 files; created when it does not exist yet.
haensel_2021_dir = f"{data_dir}/haensel_2021"
os.makedirs(haensel_2021_dir, exist_ok=True)

In [None]:
os.makedirs(f"{haensel_dir}/GSM4230076", exist_ok=True)
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230076/suppl/GSM4230076_Un-Wounded_1_scRNA-Seq.mtx.gz -O GSM4230076/matrix.mtx.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230076/suppl/GSM4230076_barcodes_Un-Wounded_1_scRNA-Seq.tsv.gz  -O GSM4230076/barcodes.tsv.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230076/suppl/GSM4230076_genes_Un-Wounded_1_scRNA-Seq.tsv.gz -O GSM4230076/features.tsv.gz

In [None]:
# Download the matrix, barcodes and genes files of GSM4230077 (Un-Wounded 2) into a per-sample folder.
os.makedirs(f"{haensel_2021_dir}/GSM4230077", exist_ok=True)
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230077/suppl/GSM4230077_Un-Wounded_2_scRNA-Seq.mtx.gz -O GSM4230077/matrix.mtx.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230077/suppl/GSM4230077_barcodes_Un-Wounded_2_scRNA-Seq.tsv.gz -O GSM4230077/barcodes.tsv.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230077/suppl/GSM4230077_genes_Un-Wounded_2_scRNA-Seq.tsv.gz -O GSM4230077/features.tsv.gz

In [None]:
# Download the matrix, barcodes and genes files of GSM4230078 (Wounded 1) into a per-sample folder.
os.makedirs(f"{haensel_2021_dir}/GSM4230078", exist_ok=True)
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230078/suppl/GSM4230078_Wounded_1_scRNA-Seq.mtx.gz -O GSM4230078/matrix.mtx.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230078/suppl/GSM4230078_barcodes_Wounded_1_scRNA-Seq.tsv.gz -O GSM4230078/barcodes.tsv.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230078/suppl/GSM4230078_genes_Wounded_1_scRNA-Seq.tsv.gz -O GSM4230078/features.tsv.gz

In [None]:
# Download the matrix, barcodes and genes files of GSM4230079 (Wounded 2) into a per-sample folder.
os.makedirs(f"{haensel_2021_dir}/GSM4230079", exist_ok=True)
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230079/suppl/GSM4230079_Wounded_2_scRNA-Seq.mtx.gz -O GSM4230079/matrix.mtx.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230079/suppl/GSM4230079_barcodes_Wounded_2_scRNA-Seq.tsv.gz -O GSM4230079/barcodes.tsv.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230079/suppl/GSM4230079_genes_Wounded_2_scRNA-Seq.tsv.gz -O GSM4230079/features.tsv.gz

In [None]:
# Download the matrix, barcodes and genes files of GSM4230080 (Wounded 3) into a per-sample folder.
os.makedirs(f"{haensel_2021_dir}/GSM4230080", exist_ok=True)
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230080/suppl/GSM4230080_Wounded_3_scRNA-Seq.mtx.gz -O GSM4230080/matrix.mtx.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230080/suppl/GSM4230080_barcodes_Wounded_3_scRNA-Seq.tsv.gz -O GSM4230080/barcodes.tsv.gz
!cd {haensel_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230080/suppl/GSM4230080_genes_Wounded_3_scRNA-Seq.tsv.gz -O GSM4230080/features.tsv.gz

In [None]:
# Un-wounded replicate 1: matrix transposed so barcodes index obs and genes index var.
adata_GSM4230076 = sc.read_mtx(haensel_2021_dir + "/GSM4230076/matrix.mtx.gz").T
adata_GSM4230076.var_names = pd.read_table(haensel_2021_dir + "/GSM4230076/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230076.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230076/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230076.var_names_make_unique()
metadata_assignment(adata_GSM4230076, 'Haensel', 2021, 'Un1')

In [None]:
# Un-wounded replicate 2: matrix transposed so barcodes index obs and genes index var.
adata_GSM4230077 = sc.read_mtx(haensel_2021_dir + "/GSM4230077/matrix.mtx.gz").T
adata_GSM4230077.var_names = pd.read_table(haensel_2021_dir + "/GSM4230077/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230077.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230077/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230077.var_names_make_unique()
metadata_assignment(adata_GSM4230077, 'Haensel', 2021, 'Un2')

In [None]:
# Merge the two un-wounded replicates, labelled 'Un1'/'Un2' in 'Internal sample identifier', and save.
adata_haensel = adata_GSM4230076.concatenate(
    adata_GSM4230077,
    batch_key='Internal sample identifier',
    batch_categories=['Un1', 'Un2'],
)
adata_haensel.write_h5ad(haensel_2021_dir + "/haensel_2021_ctrl.h5")

In [None]:
# Wounded replicate 1: matrix transposed so barcodes index obs and genes index var.
adata_GSM4230078 = sc.read_mtx(haensel_2021_dir + "/GSM4230078/matrix.mtx.gz").T
adata_GSM4230078.var_names = pd.read_table(haensel_2021_dir + "/GSM4230078/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230078.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230078/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230078.var_names_make_unique()
metadata_assignment(adata_GSM4230078, 'Haensel', 2021, 'Wo1')

In [None]:
# Wounded replicate 2: matrix transposed so barcodes index obs and genes index var.
adata_GSM4230079 = sc.read_mtx(haensel_2021_dir + "/GSM4230079/matrix.mtx.gz").T
adata_GSM4230079.var_names = pd.read_table(haensel_2021_dir + "/GSM4230079/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230079.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230079/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230079.var_names_make_unique()
metadata_assignment(adata_GSM4230079, 'Haensel', 2021, 'Wo2')

In [None]:
# Wounded replicate 3: matrix transposed so barcodes index obs and genes index var.
adata_GSM4230080 = sc.read_mtx(haensel_2021_dir + "/GSM4230080/matrix.mtx.gz").T
adata_GSM4230080.var_names = pd.read_table(haensel_2021_dir + "/GSM4230080/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230080.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230080/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230080.var_names_make_unique()
metadata_assignment(adata_GSM4230080, 'Haensel', 2021, 'Wo3')

In [None]:
# Merge the three wounded replicates, labelled 'Wo1'-'Wo3' in 'Internal sample identifier', and save.
adata_haensel_wounded = adata_GSM4230078.concatenate(
    adata_GSM4230079,
    adata_GSM4230080,
    batch_key='Internal sample identifier',
    batch_categories=['Wo1', 'Wo2', 'Wo3'],
)
adata_haensel_wounded.write_h5ad(haensel_2021_dir + "/haensel_2021_wounding.h5")

## Joost et al. 2020

In [None]:
# Folder holding the Joost et al. 2020 files; created when it does not exist yet.
joost_2020_dir = f"{data_dir}/joost_2020"
os.makedirs(joost_2020_dir, exist_ok=True)

In [None]:
# Download the GSE129218 series files: barcodes/genes/matrix for 5w and 9w,
# plus the table of assigned barcodes.
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fbarcodes%5F5w%2Etsv%2Egz
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fgenes%5F5w%2Etsv%2Egz
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fmatrix%5F5w%2Emtx%2Egz

! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fbarcodes%5F9w%2Etsv%2Egz
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fgenes%5F9w%2Etsv%2Egz
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fmatrix%5F9w%2Emtx%2Egz
    
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE129nnn/GSE129218/suppl/GSE129218%5Fassigned%5Fbarcodes%2Etxt%2Egz

In [None]:
# Download the per-sample filtered feature-barcode HDF5 matrices (GSM4186888-GSM4186893).
# NOTE(review): none of the cells visible here read these .h5 files — confirm they are still needed.
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4186nnn/GSM4186888/suppl/GSM4186888%5F10X%5F19%5F067%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4186nnn/GSM4186889/suppl/GSM4186889%5F10X%5F19%5F069%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4186nnn/GSM4186890/suppl/GSM4186890%5F10X%5F19%5F071%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5

! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4186nnn/GSM4186891/suppl/GSM4186891%5F10X%5F19%5F068%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4186nnn/GSM4186892/suppl/GSM4186892%5F10X%5F19%5F070%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
! cd {joost_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4186nnn/GSM4186893/suppl/GSM4186893%5F10X%5F19%5F072%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5

In [None]:
# Barcode table indexed by column 0; column 1 holds the short sample code, which
# is mapped to a readable "<age> - replicate <n>" label.
dict_map = {
    '5wk1': '5w - replicate 1',
    '5wk2': '5w - replicate 2',
    '5wk3': '5w - replicate 3',
    '9wk1': '9w - replicate 1',
    '9wk2': '9w - replicate 2',
    '9wk3': '9w - replicate 3',
}
df = pd.read_table(joost_2020_dir + "/GSE129218_assigned_barcodes.txt.gz", header=None).set_index([0])
df[1] = df[1].map(dict_map)
df

In [None]:
df.loc[adata_joost_2020_5w.obs_names][1]

In [None]:
adata_joost_2020_5w.obs_names

In [None]:
adata_joost_2020_5w

In [None]:
adata_joost_2020_5w = sc.read_mtx(f"{joost_2020_dir}/GSE129218_matrix_5w.mtx.gz").transpose()
adata_joost_2020_5w.var_names = pd.read_csv(f"{joost_2020_dir}/GSE129218_genes_5w.tsv.gz", sep='\t', header=None)[1].values
adata_joost_2020_5w.obs_names = pd.read_csv(f"{joost_2020_dir}/GSE129218_barcodes_5w.tsv.gz", sep='\t', header=None)[0].values
adata_joost_2020_5w.obs[''] = pd.read_csv(f"{joost_2020_dir}/GSE129218_barcodes_5w.tsv.gz", sep='\t', header=None)[0].values

adata_joost_2020_5w.var_names_make_unique()
adata_joost_2020_5w.obs['Internal sample identifier'] = df.loc[adata_joost_2020_5w.obs_names][1].values.astype('category')

In [None]:
# Load the 9-week sample.
# Fixed: the original read the 5w matrix/genes/barcodes files and wrote the
# metadata onto `adata_joost_2020_5w` (copy-paste from the 5w cell).
adata_joost_2020_9w = sc.read_mtx(f"{joost_2020_dir}/GSE129218_matrix_9w.mtx.gz").transpose()
adata_joost_2020_9w.var_names = pd.read_csv(f"{joost_2020_dir}/GSE129218_genes_9w.tsv.gz", sep='\t', header=None)[1].values
adata_joost_2020_9w.obs_names = pd.read_csv(f"{joost_2020_dir}/GSE129218_barcodes_9w.tsv.gz", sep='\t', header=None)[0].values
# NOTE(review): this stores the barcodes again in an obs column with an empty name — confirm it is needed.
adata_joost_2020_9w.obs[''] = pd.read_csv(f"{joost_2020_dir}/GSE129218_barcodes_9w.tsv.gz", sep='\t', header=None)[0].values

adata_joost_2020_9w.var_names_make_unique()
# Look up each barcode's replicate label in the barcode table `df` and store it as a categorical.
adata_joost_2020_9w.obs['Internal sample identifier'] = df.loc[adata_joost_2020_9w.obs_names][1].astype('category').values

In [None]:
# Display the summary of the 5-week AnnData object.
adata_joost_2020_5w

In [None]:
# Repeats the Haensel un-wounded replicate 2 loading cell from the Haensel section.
adata_GSM4230077 = sc.read_mtx(haensel_2021_dir + "/GSM4230077/matrix.mtx.gz").T
adata_GSM4230077.var_names = pd.read_table(haensel_2021_dir + "/GSM4230077/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230077.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230077/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230077.var_names_make_unique()
metadata_assignment(adata_GSM4230077, 'Haensel', 2021, 'Un2')

In [None]:
# Repeats the Haensel un-wounded merge-and-save cell from the Haensel section.
adata_haensel = adata_GSM4230076.concatenate(
    adata_GSM4230077,
    batch_key='Internal sample identifier',
    batch_categories=['Un1', 'Un2'],
)
adata_haensel.write_h5ad(haensel_2021_dir + "/haensel_2021_ctrl.h5")

In [None]:
# Repeats the Haensel wounded replicate 1 loading cell from the Haensel section.
adata_GSM4230078 = sc.read_mtx(haensel_2021_dir + "/GSM4230078/matrix.mtx.gz").T
adata_GSM4230078.var_names = pd.read_table(haensel_2021_dir + "/GSM4230078/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230078.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230078/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230078.var_names_make_unique()
metadata_assignment(adata_GSM4230078, 'Haensel', 2021, 'Wo1')

In [None]:
# Repeats the Haensel wounded replicate 2 loading cell from the Haensel section.
adata_GSM4230079 = sc.read_mtx(haensel_2021_dir + "/GSM4230079/matrix.mtx.gz").T
adata_GSM4230079.var_names = pd.read_table(haensel_2021_dir + "/GSM4230079/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230079.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230079/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230079.var_names_make_unique()
metadata_assignment(adata_GSM4230079, 'Haensel', 2021, 'Wo2')

In [None]:
# Repeats the Haensel wounded replicate 3 loading cell from the Haensel section.
adata_GSM4230080 = sc.read_mtx(haensel_2021_dir + "/GSM4230080/matrix.mtx.gz").T
adata_GSM4230080.var_names = pd.read_table(haensel_2021_dir + "/GSM4230080/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4230080.obs_names = pd.read_table(haensel_2021_dir + "/GSM4230080/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4230080.var_names_make_unique()
metadata_assignment(adata_GSM4230080, 'Haensel', 2021, 'Wo3')

In [None]:
# Repeats the Haensel wounded merge-and-save cell from the Haensel section.
adata_haensel_wounded = adata_GSM4230078.concatenate(
    adata_GSM4230079,
    adata_GSM4230080,
    batch_key='Internal sample identifier',
    batch_categories=['Wo1', 'Wo2', 'Wo3'],
)
adata_haensel_wounded.write_h5ad(haensel_2021_dir + "/haensel_2021_wounding.h5")

## Ma et al. 2020

In [None]:
# Folder holding the Ma et al. 2020 files; created when it does not exist yet.
ma_dir = f"{data_dir}/ma_2020"
os.makedirs(ma_dir, exist_ok=True)

In [None]:
adata_GSM4331840

In [None]:
os.makedirs(f"{ma_dir}/GSM4331840", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331840/suppl/GSM4331840%5FSkin%2DM%2DY%5Fbarcodes%2Etsv%2Egz -O GSM4331840/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331840/suppl/GSM4331840%5FSkin%2DM%2DY%5Fgenes%2Etsv%2Egz -O GSM4331840/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331840/suppl/GSM4331840%5FSkin%2DM%2DY%5Fmatrix%2Emtx%2Egz -O GSM4331840/matrix.mtx.gz

In [None]:
adata_GSM4331840_M_Y = sc.read_mtx(f"{ma_dir}/GSM4331840/matrix.mtx.gz").transpose()
adata_GSM4331840_M_Y.var_names = pd.read_csv(f"{ma_dir}/GSM4331840/features.tsv.gz", sep='\t', header=None)[1].values
adata_GSM4331840_M_Y.obs_names = pd.read_csv(f"{ma_dir}/GSM4331840/barcodes.tsv.gz", sep='\t', header=None)[0].values

adata_GSM4331840_M_Y.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GSM4331841 (Skin-M-O) into a per-sample folder.
os.makedirs(f"{ma_dir}/GSM4331841", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331841/suppl/GSM4331841%5FSkin%2DM%2DO%5Fbarcodes%2Etsv%2Egz -O GSM4331841/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331841/suppl/GSM4331841%5FSkin%2DM%2DO%5Fgenes%2Etsv%2Egz -O GSM4331841/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331841/suppl/GSM4331841%5FSkin%2DM%2DO%5Fmatrix%2Emtx%2Egz -O GSM4331841/matrix.mtx.gz

In [None]:
# Skin-M-O: matrix transposed so barcodes index obs and genes index var.
adata_GSM4331841_M_O = sc.read_mtx(ma_dir + "/GSM4331841/matrix.mtx.gz").T
adata_GSM4331841_M_O.var_names = pd.read_table(ma_dir + "/GSM4331841/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4331841_M_O.obs_names = pd.read_table(ma_dir + "/GSM4331841/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4331841_M_O.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GSM4331842 (Skin-M-CR) into a per-sample folder.
os.makedirs(f"{ma_dir}/GSM4331842", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331842/suppl/GSM4331842%5FSkin%2DM%2DCR%5Fbarcodes%2Etsv%2Egz -O GSM4331842/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331842/suppl/GSM4331842%5FSkin%2DM%2DCR%5Fgenes%2Etsv%2Egz -O GSM4331842/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331842/suppl/GSM4331842%5FSkin%2DM%2DCR%5Fmatrix%2Emtx%2Egz -O GSM4331842/matrix.mtx.gz

In [None]:
# Skin-M-CR: matrix transposed so barcodes index obs and genes index var.
adata_GSM4331842_M_CR = sc.read_mtx(ma_dir + "/GSM4331842/matrix.mtx.gz").T
adata_GSM4331842_M_CR.var_names = pd.read_table(ma_dir + "/GSM4331842/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4331842_M_CR.obs_names = pd.read_table(ma_dir + "/GSM4331842/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4331842_M_CR.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GSM4331843 (Skin-F-Y) into a per-sample folder.
os.makedirs(f"{ma_dir}/GSM4331843", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331843/suppl/GSM4331843%5FSkin%2DF%2DY%5Fbarcodes%2Etsv%2Egz -O GSM4331843/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331843/suppl/GSM4331843%5FSkin%2DF%2DY%5Fgenes%2Etsv%2Egz -O GSM4331843/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331843/suppl/GSM4331843%5FSkin%2DF%2DY%5Fmatrix%2Emtx%2Egz -O GSM4331843/matrix.mtx.gz

In [None]:
# Skin-F-Y: matrix transposed so barcodes index obs and genes index var.
adata_GSM4331843_F_Y = sc.read_mtx(ma_dir + "/GSM4331843/matrix.mtx.gz").T
adata_GSM4331843_F_Y.var_names = pd.read_table(ma_dir + "/GSM4331843/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4331843_F_Y.obs_names = pd.read_table(ma_dir + "/GSM4331843/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4331843_F_Y.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GSM4331844 (Skin-F-O) into a per-sample folder.
os.makedirs(f"{ma_dir}/GSM4331844", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331844/suppl/GSM4331844%5FSkin%2DF%2DO%5Fbarcodes%2Etsv%2Egz -O GSM4331844/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331844/suppl/GSM4331844%5FSkin%2DF%2DO%5Fgenes%2Etsv%2Egz -O GSM4331844/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331844/suppl/GSM4331844%5FSkin%2DF%2DO%5Fmatrix%2Emtx%2Egz -O GSM4331844/matrix.mtx.gz

In [None]:
# Skin-F-O: matrix transposed so barcodes index obs and genes index var.
adata_GSM4331844_F_O = sc.read_mtx(ma_dir + "/GSM4331844/matrix.mtx.gz").T
adata_GSM4331844_F_O.var_names = pd.read_table(ma_dir + "/GSM4331844/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4331844_F_O.obs_names = pd.read_table(ma_dir + "/GSM4331844/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4331844_F_O.var_names_make_unique()

In [None]:
# Download the barcodes, genes and matrix files of GSM4331845 (Skin-F-CR) into a per-sample folder.
os.makedirs(f"{ma_dir}/GSM4331845", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331845/suppl/GSM4331845%5FSkin%2DF%2DCR%5Fbarcodes%2Etsv%2Egz -O GSM4331845/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331845/suppl/GSM4331845%5FSkin%2DF%2DCR%5Fgenes%2Etsv%2Egz -O GSM4331845/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331845/suppl/GSM4331845%5FSkin%2DF%2DCR%5Fmatrix%2Emtx%2Egz -O GSM4331845/matrix.mtx.gz

In [None]:
# Skin-F-CR: matrix transposed so barcodes index obs and genes index var.
adata_GSM4331845_F_CR = sc.read_mtx(ma_dir + "/GSM4331845/matrix.mtx.gz").T
adata_GSM4331845_F_CR.var_names = pd.read_table(ma_dir + "/GSM4331845/features.tsv.gz", header=None)[1].to_numpy()
adata_GSM4331845_F_CR.obs_names = pd.read_table(ma_dir + "/GSM4331845/barcodes.tsv.gz", header=None)[0].to_numpy()

adata_GSM4331845_F_CR.var_names_make_unique()

In [None]:
# Pair the M and F samples of each condition (Y, O, CR) into one object per condition.
adata_ma_Y = adata_GSM4331840_M_Y.concatenate(adata_GSM4331843_F_Y)
adata_ma_O = adata_GSM4331841_M_O.concatenate(adata_GSM4331844_F_O)
adata_ma_CR = adata_GSM4331842_M_CR.concatenate(adata_GSM4331845_F_CR)

In [None]:
# Save one file per condition.
adata_ma_Y.write_h5ad(ma_dir + "/adata_ma_Y.h5")
adata_ma_O.write_h5ad(ma_dir + "/adata_ma_O.h5")
adata_ma_CR.write_h5ad(ma_dir + "/adata_ma_CR.h5")

## Phan et al. 2020

In [None]:
# Folder holding the Phan et al. 2020 files; created when it does not exist yet.
phan_2020_dir = f"{data_dir}/phan_2020"
os.makedirs(phan_2020_dir, exist_ok=True)

In [None]:
# Download the gzipped loom files of the three P21 samples (GSM4647788-GSM4647790).
!cd {phan_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4647nnn/GSM4647788/suppl/GSM4647788_P21_1.loom.gz
!cd {phan_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4647nnn/GSM4647789/suppl/GSM4647789_P21_2.loom.gz
!cd {phan_2020_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4647nnn/GSM4647790/suppl/GSM4647790_P21_3.loom.gz

In [None]:
!cd {phan_2020_dir} && gunzip *

In [None]:
# Load the three P21 loom files, make gene names unique and attach per-sample metadata.
adata_phan_2020_1 = sc.read(f"{phan_2020_dir}/GSM4647788_P21_1.loom")
adata_phan_2020_1.var_names_make_unique()
metadata_assignment(adata_phan_2020_1, 'Phan', 2020, 'P21_Un_1')

adata_phan_2020_2 = sc.read(f"{phan_2020_dir}/GSM4647789_P21_2.loom")
adata_phan_2020_2.var_names_make_unique()
metadata_assignment(adata_phan_2020_2, 'Phan', 2020, 'P21_Un_2')


adata_phan_2020_3 = sc.read(f"{phan_2020_dir}/GSM4647790_P21_3.loom")
adata_phan_2020_3.var_names_make_unique()
# Fixed: 'P21_Un_3' metadata belongs to sample 3; the original call targeted
# sample 2 again, relabelling it and leaving sample 3 without metadata.
metadata_assignment(adata_phan_2020_3, 'Phan', 2020, 'P21_Un_3')

In [None]:
# Merge the three P21 samples, labelled by sample in 'Internal sample identifier'.
adata_phan_2020 = adata_phan_2020_1.concatenate(
    adata_phan_2020_2,
    adata_phan_2020_3,
    batch_key='Internal sample identifier',
    batch_categories=['P21_Un_1', 'P21_Un_2', 'P21_Un_3'],
)

In [None]:
# Save the merged P21 object.
adata_phan_2020.write_h5ad(phan_2020_dir + "/phan_2020_ctrl_21d.h5")

## Salzer et al. 2018

In [None]:
# Folder holding the Salzer 2018 files; created when it does not exist yet.
salzer_2018_dir = f"{data_dir}/salzer_2018"
os.makedirs(salzer_2018_dir, exist_ok=True)

In [None]:
# This h5ad file was provided externally; it must already be present in the folder.
adata_salzer = sc.read(f"{salzer_2018_dir}/Salzer.h5ad")

In [None]:
# Sample identifiers that appear inside the cell names, paired position by
# position with the batch label given to each sample.
list_samples = [
    'P2143_N704', 'P2143_N708', 'P1963_N701', 'P1963_N705', 'P1962_N701', 'P1962_N705',
    'P2134_N704', 'P2134_N708', 'P2004_N701', 'P2004_N705', 'P2139_N704', 'P2139_N708',
    'P1964_N701', 'P1964_N705', 'P2135_N704', 'P2135_N708', 'P1961_N701', 'P1961_N705',
    'P2005_N701', 'P2005_N705', 'P1960_N701', 'P1960_N705',
]

list_batches = [
    'Y1_R2', 'O1_R2', 'Y2_R1', 'O2_R1', 'Y3_R1', 'O3_R1', 'Y4_R2', 'O4_R2',
    'Y5_R1', 'O5_R1', 'Y6_R2', 'O6_R2', 'Y7_R1', 'O7_R1', 'Y8_R2', 'O8_R2',
    'Y9_R1', 'O9_R1', 'Y10_R1', 'O10_R1', 'Y11_R1', 'O11_R1',
]

list_adatas = []

for sample, batch in zip(list_samples, list_batches):
    # Keep the cells whose name contains this sample identifier, then attach metadata.
    adata_salzer_batch = adata_salzer[np.array([sample in cell_name for cell_name in adata_salzer.obs_names]), :]
    adata_salzer_batch = metadata_assignment(adata_salzer_batch, 'Salzer', 2018, batch, do_return=True)
    list_adatas.append(adata_salzer_batch)

# Re-assemble the per-sample pieces, labelled by batch in 'Internal sample identifier'.
adata_salzer = list_adatas[0].concatenate(*list_adatas[1:], batch_key='Internal sample identifier', batch_categories=list_batches)

In [None]:
# Save the re-assembled object.
adata_salzer.write_h5ad(f"{salzer_2018_dir}/salzer_2018_young_old.h5")

## Shook et al. 2020

In [None]:
# Folder holding the Shook 2020 files; created when it does not exist yet.
shook_2020_dir = f"{data_dir}/shook_2020"
os.makedirs(shook_2020_dir, exist_ok=True)

In [None]:
# SRA run accession -> sample name used for the loom files and the metadata sheet.
dict_names = dict([
    ('SRR10480641', 'Non_Wounded_S1'),
    ('SRR10480643', 'Non_Wounded_S2'),
    ('SRR10480644', 'Non_Wounded_S3'),
    ('SRR10480645', 'Non_Wounded_S4'),
    ('SRR10480646', 'Non_Wounded_S5'),
    ('SRR10480636', 'Wounded_S1'),
    ('SRR10480637', 'Wounded_S2'),
    ('SRR10480638', 'Wounded_S3'),
    ('SRR10480639', 'Wounded_S4'),
    ('SRR10480640', 'Wounded_S5'),
])

In [None]:
# Sample sheet (metadata.tab) with one row per sample name, used by `loompy fromfq`.
df = pd.DataFrame({
    'name': list(dict_names.values()),
    'technology': ['10xv3'] * len(dict_names),
    'targetnumcells': [5000] * len(dict_names),
})
df.to_csv(f"{shook_2020_dir}/metadata.tab", sep='\t', index=None)

In [None]:
# Fetch every SRA run as gzipped, split FASTQ files inside the dataset folder
# (8 threads per run). `name` is not used in this loop.
for SRR, name in dict_names.items():
    !cd {shook_2020_dir} && parallel-fastq-dump -s {SRR} --gzip --split-files -t 8 

In [None]:
for SRR, name in dict_names.items():
    !cd {shook_2020_dir} && loompy fromfq {name}.loom {name} {mouse_gencode_dir} metadata.tab {shook_dir}/{SRR}_1.fastq.gz {shook_dir}/{SRR}_2.fastq.gz

In [None]:
# Load the five non-wounded loom files, make gene names unique and attach
# per-sample metadata (sample identifiers 'Ctrl_S1' ... 'Ctrl_S5').
adata_Non_Wounded_S1 = sc.read(f"{shook_2020_dir}/Non_Wounded_S1.loom")
adata_Non_Wounded_S1.var_names_make_unique()
metadata_assignment(adata_Non_Wounded_S1, 'Shook', 2020, 'Ctrl_S1')

# Fixed: sample 2 is read from Non_Wounded_S2.loom; the original loaded
# Non_Wounded_S3.loom here, so sample 3 was included twice and sample 2 never.
adata_Non_Wounded_S2 = sc.read(f"{shook_2020_dir}/Non_Wounded_S2.loom")
adata_Non_Wounded_S2.var_names_make_unique()
metadata_assignment(adata_Non_Wounded_S2, 'Shook', 2020, 'Ctrl_S2')

adata_Non_Wounded_S3 = sc.read(f"{shook_2020_dir}/Non_Wounded_S3.loom")
adata_Non_Wounded_S3.var_names_make_unique()
metadata_assignment(adata_Non_Wounded_S3, 'Shook', 2020, 'Ctrl_S3')

adata_Non_Wounded_S4 = sc.read(f"{shook_2020_dir}/Non_Wounded_S4.loom")
adata_Non_Wounded_S4.var_names_make_unique()
metadata_assignment(adata_Non_Wounded_S4, 'Shook', 2020, 'Ctrl_S4')

adata_Non_Wounded_S5 = sc.read(f"{shook_2020_dir}/Non_Wounded_S5.loom")
adata_Non_Wounded_S5.var_names_make_unique()
metadata_assignment(adata_Non_Wounded_S5, 'Shook', 2020, 'Ctrl_S5')

In [None]:
# Merge the five non-wounded samples, labelled 'Ctrl_S1'-'Ctrl_S5' in
# 'Internal sample identifier', and save.
adata_shook_NW = adata_Non_Wounded_S1.concatenate(
    adata_Non_Wounded_S2,
    adata_Non_Wounded_S3,
    adata_Non_Wounded_S4,
    adata_Non_Wounded_S5,
    batch_key='Internal sample identifier',
    batch_categories=['Ctrl_S1', 'Ctrl_S2', 'Ctrl_S3', 'Ctrl_S4', 'Ctrl_S5'],
)
adata_shook_NW.write_h5ad(shook_2020_dir + "/shook_2020_ctrl.h5")