# Data extraction

In this notebook we are going to extract the raw or processed data from all datasets, to later analyze it in different notebooks.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import mygene
import numpy as np
import pandas as pd
import ray
import scanpy as sc
import scanpy.external as sce
import scipy.sparse as spr
from tqdm.notebook import tqdm
import triku as tk
pd.set_option('display.max_columns', None)

In [None]:
from cellassign import assign_cats
from fb_functions import metadata_assignment

In [None]:
# Colour map for gene expression on UMAPs: 80 evenly spaced samples of magma,
# lowest sample replaced by light grey, truncated to the first 65 colours.
_magma_cmap = plt.get_cmap('magma')
_magma_colors = list(map(_magma_cmap, np.linspace(0, 1, 80)))
_magma_colors[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", _magma_colors[:65])

In [None]:
# Root folder for every dataset: <current working directory>/data/
data_dir = f"{os.getcwd()}/data/"

In [None]:
# Machine-specific locations. The defaults are the original workstation paths;
# set the HUMAN_GENCODE_DIR and/or PROGRAM_DIR environment variables to run the
# notebook elsewhere without editing it.
# human_gencode_dir is the index folder handed to `loompy fromfq`.
human_gencode_dir = os.environ.get(
    "HUMAN_GENCODE_DIR",
    "/media/seth/SETH_DATA/SETH_Alex/Programs/human_GRCh38_gencode.v31.600",
)
program_dir = os.environ.get("PROGRAM_DIR", "/media/seth/SETH_DATA/SETH_Alex/Programs/")

## Ahlers et al. 2022

In [None]:
ahlers_2022_dir = data_dir + '/ahlers_2022'
os.makedirs(ahlers_2022_dir, exist_ok=True)

In [None]:
# Download the seven Ahlers et al. runs with parallel-fastq-dump. Every call is
# run from inside the dataset folder, with --gzip and --split-files, and `-t 8`.
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440580 --gzip --split-files -t 8 
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440581 --gzip --split-files -t 8
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440582 --gzip --split-files -t 8
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440583 --gzip --split-files -t 8 
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440584 --gzip --split-files -t 8
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440585 --gzip --split-files -t 8
!cd {ahlers_2022_dir} && parallel-fastq-dump -s SRR15440586 --gzip --split-files -t 8

In [None]:
# Metadata table consumed by `loompy fromfq`: one row per sample.
ahlers_sample_names = [
    'Ahlers_2022_P1_Y', 'Ahlers_2022_P2_O', 'Ahlers_2022_P3_Y',
    'Ahlers_2022_P4_O', 'Ahlers_2022_P5_Y', 'Ahlers_2022_P6_O',
    'Ahlers_2022_P7_O',
]
n_ahlers_samples = len(ahlers_sample_names)
df = pd.DataFrame({
    'name': ahlers_sample_names,
    'technology': ['10xv2'] * n_ahlers_samples,
    'targetnumcells': [1000] * n_ahlers_samples,
})
df.to_csv(f'{ahlers_2022_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Rename the downloaded `<run>_1/_2.fastq.gz` files to
# `<sample>_L001_R1/R2_001.fastq.gz`, the file names passed to `loompy fromfq`
# in the following cells. Run -> sample mapping used here:
#   SRR15440580 -> P4_O, SRR15440581 -> P3_Y, SRR15440582 -> P2_O,
#   SRR15440583 -> P1_Y, SRR15440584 -> P7_O, SRR15440585 -> P6_O,
#   SRR15440586 -> P5_Y
!mv {ahlers_2022_dir}/SRR15440580_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P4_O_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440580_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P4_O_L001_R2_001.fastq.gz 

!mv {ahlers_2022_dir}/SRR15440581_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P3_Y_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440581_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P3_Y_L001_R2_001.fastq.gz 

!mv {ahlers_2022_dir}/SRR15440582_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P2_O_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440582_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P2_O_L001_R2_001.fastq.gz 

!mv {ahlers_2022_dir}/SRR15440583_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P1_Y_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440583_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P1_Y_L001_R2_001.fastq.gz 

!mv {ahlers_2022_dir}/SRR15440584_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P7_O_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440584_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P7_O_L001_R2_001.fastq.gz 

!mv {ahlers_2022_dir}/SRR15440585_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P6_O_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440585_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P6_O_L001_R2_001.fastq.gz 

!mv {ahlers_2022_dir}/SRR15440586_1.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P5_Y_L001_R1_001.fastq.gz 
!mv {ahlers_2022_dir}/SRR15440586_2.fastq.gz {ahlers_2022_dir}/Ahlers_2022_P5_Y_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P1_Y.loom Ahlers_2022_P1_Y {human_gencode_dir} metadata.tab \
Ahlers_2022_P1_Y_L001_R1_001.fastq.gz Ahlers_2022_P1_Y_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P2_O.loom Ahlers_2022_P2_O {human_gencode_dir} metadata.tab \
Ahlers_2022_P2_O_L001_R1_001.fastq.gz Ahlers_2022_P2_O_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P3_Y.loom Ahlers_2022_P3_Y {human_gencode_dir} metadata.tab \
Ahlers_2022_P3_Y_L001_R1_001.fastq.gz Ahlers_2022_P3_Y_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P4_O.loom Ahlers_2022_P4_O {human_gencode_dir} metadata.tab \
Ahlers_2022_P4_O_L001_R1_001.fastq.gz Ahlers_2022_P4_O_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P5_Y.loom Ahlers_2022_P5_Y {human_gencode_dir} metadata.tab \
Ahlers_2022_P5_Y_L001_R1_001.fastq.gz Ahlers_2022_P5_Y_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P6_O.loom Ahlers_2022_P6_O {human_gencode_dir} metadata.tab \
Ahlers_2022_P6_O_L001_R1_001.fastq.gz Ahlers_2022_P6_O_L001_R2_001.fastq.gz 

In [None]:
!cd {ahlers_2022_dir} && loompy fromfq Ahlers_2022_P7_O.loom Ahlers_2022_P7_O {human_gencode_dir} metadata.tab \
Ahlers_2022_P7_O_L001_R1_001.fastq.gz Ahlers_2022_P7_O_L001_R2_001.fastq.gz 

In [None]:
def _load_ahlers_sample(sample_id):
    """Read `Ahlers_2022_<sample_id>.loom`, make gene names unique and run
    `metadata_assignment` on the object (called in place, no return value used)."""
    adata = sc.read_loom(f'{ahlers_2022_dir}/Ahlers_2022_{sample_id}.loom')
    adata.var_names_make_unique()
    metadata_assignment(adata, 'Ahlers', 2022, sample_id)
    return adata


# One object per donor; the suffix is the age-group tag used in the sample names (_Y / _O).
adata_Ahlers_2022_P1_Y = _load_ahlers_sample('P1_Y')
adata_Ahlers_2022_P2_O = _load_ahlers_sample('P2_O')
adata_Ahlers_2022_P3_Y = _load_ahlers_sample('P3_Y')
adata_Ahlers_2022_P4_O = _load_ahlers_sample('P4_O')
adata_Ahlers_2022_P5_Y = _load_ahlers_sample('P5_Y')
adata_Ahlers_2022_P6_O = _load_ahlers_sample('P6_O')
adata_Ahlers_2022_P7_O = _load_ahlers_sample('P7_O')

In [None]:
# Pool the per-donor objects by age group. `batch_categories` labels the
# objects in the order they are passed, so each list must mirror the sample
# ids of the objects on the same call.
adata_Ahlers_2022_young = sc.AnnData.concatenate(
    adata_Ahlers_2022_P1_Y, adata_Ahlers_2022_P3_Y, adata_Ahlers_2022_P5_Y,
    batch_key="Internal sample identifier", batch_categories=['P1_Y', 'P3_Y', 'P5_Y'])

# The old donors are P2_O / P4_O / P6_O / P7_O (they were previously labelled
# with a `_Y` suffix, which did not match the ids given to metadata_assignment).
adata_Ahlers_2022_old = sc.AnnData.concatenate(
    adata_Ahlers_2022_P2_O, adata_Ahlers_2022_P4_O, adata_Ahlers_2022_P6_O, adata_Ahlers_2022_P7_O,
    batch_key="Internal sample identifier", batch_categories=['P2_O', 'P4_O', 'P6_O', 'P7_O'])

In [None]:
adata_Ahlers_2022_young.write_h5ad(ahlers_2022_dir + '/ahlers_2022_young.h5')
adata_Ahlers_2022_old.write_h5ad(ahlers_2022_dir + '/ahlers_2022_old.h5')

## Boothby et al. 2021 

In [None]:
boothby_2021_dir = data_dir + '/boothby_2021'
os.makedirs(boothby_2021_dir, exist_ok=True)

In [None]:
!cd {boothby_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5549nnn/GSM5549903/suppl/GSM5549903%5FHC01%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
!cd {boothby_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5549nnn/GSM5549904/suppl/GSM5549904%5FHC02%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
!cd {boothby_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5549nnn/GSM5549905/suppl/GSM5549905%5FHC03%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5
!cd {boothby_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5549nnn/GSM5549906/suppl/GSM5549906%5FEF01%5Ffiltered%5Ffeature%5Fbc%5Fmatrix%2Eh5

In [None]:
def _load_boothby_sample(gsm, sample_id):
    """Read `<gsm>_<sample_id>_filtered_feature_bc_matrix.h5`, make gene names
    unique and run `metadata_assignment` on the object in place."""
    adata = sc.read_10x_h5(f'{boothby_2021_dir}/{gsm}_{sample_id}_filtered_feature_bc_matrix.h5')
    adata.var_names_make_unique()
    metadata_assignment(adata, 'Boothby', 2021, sample_id)
    return adata


boothby_2021_HC01 = _load_boothby_sample('GSM5549903', 'HC01')
boothby_2021_HC02 = _load_boothby_sample('GSM5549904', 'HC02')
boothby_2021_HC03 = _load_boothby_sample('GSM5549905', 'HC03')
boothby_2021_EF01 = _load_boothby_sample('GSM5549906', 'EF01')

In [None]:
# The three HC samples are merged into one control object; EF01 is stored on its own.
boothby_2021_ctrl = boothby_2021_HC01.concatenate(
    boothby_2021_HC02, boothby_2021_HC03,
    batch_key="Internal sample identifier",
    batch_categories=['HC01', 'HC02', 'HC03'],
)
boothby_2021_ctrl.write_h5ad(f'{boothby_2021_dir}/boothby_2021_ctrl.h5')

boothby_2021_EF01.write_h5ad(f'{boothby_2021_dir}/boothby_2021_ef.h5')

## Deng et al. 2021 (human)

In [None]:
deng_dir = data_dir + '/deng_2021'
# Create the folder up front, as the other dataset sections do; without it the
# `cd {deng_dir}` in the download cells fails on a fresh checkout.
os.makedirs(deng_dir, exist_ok=True)

In [None]:
!cd {deng_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4994nnn/GSM4994382/suppl/GSM4994382_NS1_matrix.tar.gz
!cd {deng_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4994nnn/GSM4994383/suppl/GSM4994383_NS2_matrix.tar.gz
!cd {deng_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4994nnn/GSM4994384/suppl/GSM4994384_NS3_matrix.tar.gz

In [None]:
!cd {deng_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4994nnn/GSM4994379/suppl/GSM4994379_KL1_matrix.tar.gz
!cd {deng_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4994nnn/GSM4994380/suppl/GSM4994380_KL2_matrix.tar.gz
!cd {deng_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4994nnn/GSM4994381/suppl/GSM4994381_KL3_matrix.tar.gz

In [None]:
!cd {deng_dir} && cat *.tar.gz | tar zxvf - -i

In [None]:
# Load the three scar samples from the extracted 10x matrix folders and run
# metadata_assignment on each.
# NOTE(review): the archives downloaded above are named NS1-NS3, while the
# folders read here are NF1-NF3 - presumably the names inside the tarballs;
# verify against the extracted directory listing.
adata_deng_scar_1 = sc.read_10x_mtx(deng_dir + '/NF1_matrix')
metadata_assignment(adata_deng_scar_1, 'Deng', 2021, 'Scar_1')

adata_deng_scar_2 = sc.read_10x_mtx(deng_dir + '/NF2_matrix')
metadata_assignment(adata_deng_scar_2, 'Deng', 2021, 'Scar_2')


adata_deng_scar_3 = sc.read_10x_mtx(deng_dir + '/NF3_matrix')
metadata_assignment(adata_deng_scar_3, 'Deng', 2021, 'Scar_3')

# Merge the three samples; the batch labels follow the order of the arguments.
adata_deng_scar = sc.AnnData.concatenate(adata_deng_scar_1, adata_deng_scar_2, adata_deng_scar_3, 
                                         batch_categories=['Scar_1', 'Scar_2', 'Scar_3'], batch_key='Internal sample identifier')

In [None]:
# Load the three keloid samples. This cell used to read the NF*_matrix folders,
# i.e. exactly the data already loaded as "scar", so the keloid object was a
# relabelled copy of the scar object.
# NOTE(review): the keloid folders are assumed to be named KF1-KF3_matrix by
# analogy with the NF* scar folders (the archives themselves are named KL1-KL3);
# confirm against the directory listing produced by the extraction cell.
adata_deng_keloid_1 = sc.read_10x_mtx(deng_dir + '/KF1_matrix')
metadata_assignment(adata_deng_keloid_1, 'Deng', 2021, 'Keloid_1')

adata_deng_keloid_2 = sc.read_10x_mtx(deng_dir + '/KF2_matrix')
metadata_assignment(adata_deng_keloid_2, 'Deng', 2021, 'Keloid_2')

adata_deng_keloid_3 = sc.read_10x_mtx(deng_dir + '/KF3_matrix')
metadata_assignment(adata_deng_keloid_3, 'Deng', 2021, 'Keloid_3')

# Merge the three samples; the batch labels follow the order of the arguments.
adata_deng_keloid = sc.AnnData.concatenate(adata_deng_keloid_1, adata_deng_keloid_2, adata_deng_keloid_3,
                                           batch_categories=['Keloid_1', 'Keloid_2', 'Keloid_3'], batch_key='Internal sample identifier')

In [None]:
adata_deng_scar.write_h5ad(deng_dir + '/deng_2021_scar.h5')
adata_deng_keloid.write_h5ad(deng_dir + '/deng_2021_keloid.h5')

## Gao et al. 2021 (human)

In [None]:
gao_dir = data_dir + '/gao_2021'
os.makedirs(gao_dir, exist_ok=True)

### Direct loom download

In [None]:
!aria2c -x 16 https://ftp.ncbi.nlm.nih.gov/geo/series/GSE162nnn/GSE162183/suppl/GSE162183%5FRaw%5Fgene%5Fcounts%5Fmatrix%5FLoomFile%2Eloom%2Egz -d {gao_dir} -o gao_2021.loom.gz

In [None]:
!gunzip {gao_dir}/gao_2021.loom.gz

In [None]:
adata_gao = sc.read_loom(f'{gao_dir}/gao_2021.loom')

In [None]:
def _gao_patient(patient):
    """Subset `adata_gao` to one value of obs['Patient'] and return the result
    of `metadata_assignment(..., do_return=True)` for that subset."""
    cells_of_patient = adata_gao.obs['Patient'] == patient
    return metadata_assignment(adata_gao[cells_of_patient], 'Gao', 2021, patient, do_return=True)


adata_gao_ctrl1 = _gao_patient('Ctrl1')
adata_gao_ctrl2 = _gao_patient('Ctrl2')
adata_gao_ctrl3 = _gao_patient('Ctrl3')

adata_gao_psor1 = _gao_patient('Psor1')
adata_gao_psor2 = _gao_patient('Psor2')
adata_gao_psor3 = _gao_patient('Psor3')

# One merged object per condition; batch labels follow the argument order.
adata_gao_ctrl = adata_gao_ctrl1.concatenate(
    adata_gao_ctrl2, adata_gao_ctrl3,
    batch_categories=['Ctrl1', 'Ctrl2', 'Ctrl3'], batch_key='Internal sample identifier')

adata_gao_psor = adata_gao_psor1.concatenate(
    adata_gao_psor2, adata_gao_psor3,
    batch_categories=['Psor1', 'Psor2', 'Psor3'], batch_key='Internal sample identifier')

In [None]:
adata_gao_ctrl.write_h5ad(gao_dir + '/gao_2021_ctrl.h5')
adata_gao_psor.write_h5ad(gao_dir + '/gao_2021_psor.h5')

## Gaydosik et al. 2020 (human)

In [None]:
gaydosik_dir = data_dir + '/gaydosik_2020'
os.makedirs(gaydosik_dir, exist_ok=True)

In [None]:
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679033/suppl/GSM3679033%5FLabeled%5FSC67%5F050517%5FSK%5FMF2%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679034/suppl/GSM3679034%5FLabeled%5FSC82%5F060617%5FSK%5FMF5%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679035/suppl/GSM3679035%5FSC157dataframe%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679036/suppl/GSM3679036%5FSC158dataframe%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679037/suppl/GSM3679037%5FSC205dataframe%2Ecsv%2Egz

!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679038/suppl/GSM3679038%5FLabeled%5FSC50%5F011917%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679039/suppl/GSM3679039%5FLabeled%5FSC68%5F051517%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679040/suppl/GSM3679040%5FLabeled%5FSC124%5F080317%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679041/suppl/GSM3679041%5FLabeled%5FSC125%5F080317%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz

In [None]:
!cd {gaydosik_dir} &&  gunzip *.gz

In [None]:
def _load_gaydosik_ctcl(csv_name, sample_id):
    """Read one CSV from the Gaydosik folder, transpose it and return the
    result of `metadata_assignment(..., do_return=True)`."""
    adata = sc.read(f'{gaydosik_dir}/{csv_name}').transpose()
    return metadata_assignment(adata, 'Gaydosik', 2019, sample_id, do_return=True)


adata_CTCL2 = _load_gaydosik_ctcl('GSM3679033_Labeled_SC67_050517_SK_MF2_GRCh38raw.csv', 'CTCL2')
adata_CTCL5 = _load_gaydosik_ctcl('GSM3679034_Labeled_SC82_060617_SK_MF5_GRCh38raw.csv', 'CTCL5')
adata_CTCL6 = _load_gaydosik_ctcl('GSM3679035_SC157dataframe.csv', 'CTCL6')
adata_CTCL8 = _load_gaydosik_ctcl('GSM3679036_SC158dataframe.csv', 'CTCL8')
adata_CTCL12 = _load_gaydosik_ctcl('GSM3679037_SC205dataframe.csv', 'CTCL12')

In [None]:
# Healthy-control samples. Each object is passed to metadata_assignment with
# its own sample id (HC2-HC4 were previously all passed as 'HC1'), matching
# the batch labels used when the four objects are concatenated.
adata_HC1 = sc.read(gaydosik_dir + '/GSM3679038_Labeled_SC50_011917_SK_NOR_GRCh38raw.csv').transpose()
adata_HC1 = metadata_assignment(adata_HC1, 'Gaydosik', 2019, 'HC1', do_return=True)

adata_HC2 = sc.read(gaydosik_dir + '/GSM3679039_Labeled_SC68_051517_SK_NOR_GRCh38raw.csv').transpose()
adata_HC2 = metadata_assignment(adata_HC2, 'Gaydosik', 2019, 'HC2', do_return=True)

adata_HC3 = sc.read(gaydosik_dir + '/GSM3679040_Labeled_SC124_080317_SK_NOR_GRCh38raw.csv').transpose()
adata_HC3 = metadata_assignment(adata_HC3, 'Gaydosik', 2019, 'HC3', do_return=True)

adata_HC4 = sc.read(gaydosik_dir + '/GSM3679041_Labeled_SC125_080317_SK_NOR_GRCh38raw.csv').transpose()
adata_HC4 = metadata_assignment(adata_HC4, 'Gaydosik', 2019, 'HC4', do_return=True)

In [None]:
# Merge the per-sample objects into one CTCL and one control object;
# batch labels follow the argument order.
ctcl_labels = ['CTCL2', 'CTCL5', 'CTCL6', 'CTCL8', 'CTCL12']
adata_CTCL = adata_CTCL2.concatenate(
    adata_CTCL5, adata_CTCL6, adata_CTCL8, adata_CTCL12,
    batch_key='Internal sample identifier', batch_categories=ctcl_labels)

hc_labels = ['HC1', 'HC2', 'HC3', 'HC4']
adata_HC = adata_HC1.concatenate(
    adata_HC2, adata_HC3, adata_HC4,
    batch_key='Internal sample identifier', batch_categories=hc_labels)

In [None]:
adata_CTCL.X = spr.csr_matrix(adata_CTCL.X)
adata_HC.X = spr.csr_matrix(adata_HC.X)

In [None]:
adata_CTCL.write_h5ad(gaydosik_dir + '/gaydosik_2020_CTCL.h5')
adata_HC.write_h5ad(gaydosik_dir + '/gaydosik_2020_ctrl.h5')

## He et al. 2020 (human)

### Raw data and metadata extraction (healthy samples)

In [None]:
he_dir = data_dir + '/He_2020'
os.makedirs(he_dir, exist_ok=True)

In [None]:
!rm -rf {he_dir}

In [None]:
SRA_list = """
SRR11396171
SRR11396175
SRR11396162
SRR11396164
SRR11396166
SRR11396167
SRR11396168
SRR11396170
"""

# Make sure the folder exists even if it was removed by the clean-up cell above.
os.makedirs(he_dir, exist_ok=True)

with open(he_dir + '/accession.txt', 'w') as f:
    f.write(SRA_list)

# All eight runs are given to a single `loompy fromfq He2020.loom He2020 ...`
# call, so the metadata table needs exactly one row for the sample 'He2020'.
# (The columns used to have lengths 1 and 8, which makes pandas raise
# "All arrays must be of the same length".)
df = pd.DataFrame({'name': ['He2020'], 'technology': ['10xv2'], 'targetnumcells': [5000]})

df.to_csv(he_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
!cd {he_dir} && cat accession.txt | parallel -j 8 "prefetch {}"

In [None]:
def adapt_fastq(filename_dir, filename_root, idx):
    """Split a single-file FASTQ into R1/R2 files and gzip input and outputs.

    Reads ``{filename_dir}/{filename_root}.fastq`` and writes
    ``He2020_L00{idx}_R1_001.fastq`` and ``He2020_L00{idx}_R2_001.fastq`` into
    the same folder. Within every 4-line record, lines 1 and 3 (header and
    separator) are copied to both outputs, while lines 2 and 4 (sequence and
    quality) are cut after 26 characters: the first 26 go to R1, the remainder
    to R2. (26 is presumably cell barcode + UMI of the 10x v2 read layout -
    confirm.) Afterwards the input and both outputs are compressed with the
    ``gzip`` command, which replaces each file by its ``.gz`` version.

    Parameters
    ----------
    filename_dir : str
        Folder that contains the input and receives the outputs.
    filename_root : str
        Input file name without the ``.fastq`` extension.
    idx : int or str
        Lane index inserted verbatim after ``L00`` in the output names.

    Raises
    ------
    subprocess.CalledProcessError
        If a ``gzip`` call exits with a non-zero status.
    """
    # Local import: keeps the function self-contained when it is wrapped with
    # ray.remote and executed in worker processes.
    import subprocess

    path_in = f'{filename_dir}/{filename_root}.fastq'
    path_r1 = f'{filename_dir}/He2020_L00{idx}_R1_001.fastq'
    path_r2 = f'{filename_dir}/He2020_L00{idx}_R2_001.fastq'

    print(path_in, path_r1)

    # Context managers close all three handles even if an error occurs midway.
    with open(path_in, 'r') as filein, \
            open(path_r1, 'w') as fileR1, \
            open(path_r2, 'w') as fileR2:
        # Iterating the file stops exactly at EOF, so no extra empty line is
        # appended to the outputs after the last record.
        for count, line in enumerate(filein, start=1):
            line = line.replace('\n', '')
            if count % 4 in (1, 3):
                # header / separator: identical in both reads
                fileR1.write(line + '\n')
                fileR2.write(line + '\n')
            else:
                # sequence / quality: split at position 26
                fileR1.write(line[:26] + '\n')
                fileR2.write(line[26:] + '\n')

    # Argument lists (no shell) so that paths with spaces or shell
    # metacharacters are handled correctly.
    for path in (path_in, path_r1, path_r2):
        subprocess.run(['gzip', path], check=True)

In [None]:
# Run adapt_fastq on every accession of SRA_list as ray tasks (2 CPUs).
# Lane indices start at 1 and follow the order of the list.
adapt_fastq_remote = ray.remote(adapt_fastq)

ray.init(ignore_reinit_error=True, num_cpus=2)

accessions = SRA_list.split('\n')[1:-1]  # drop the empty first and last entries
ret = []
for lane, name in enumerate(accessions, start=1):
    ret.append(adapt_fastq_remote.remote(he_dir, name, lane))
ray.get(ret)

ray.shutdown()

In [None]:
!cd {he_dir} && loompy fromfq He2020.loom He2020 {human_gencode_dir} metadata.tab \
He2020_L001_R1_001.fastq.gz He2020_L001_R2_001.fastq.gz He2020_L002_R1_001.fastq.gz He2020_L002_R2_001.fastq.gz \
He2020_L003_R1_001.fastq.gz He2020_L003_R2_001.fastq.gz He2020_L004_R1_001.fastq.gz He2020_L004_R2_001.fastq.gz \
He2020_L005_R1_001.fastq.gz He2020_L005_R2_001.fastq.gz He2020_L006_R1_001.fastq.gz He2020_L006_R2_001.fastq.gz \
He2020_L007_R1_001.fastq.gz He2020_L007_R2_001.fastq.gz He2020_L008_R1_001.fastq.gz He2020_L008_R2_001.fastq.gz \

In [None]:
adata_he_2020_ctrl = sc.read(he_dir + '/He2020.loom')
adata_he_2020_ctrl = metadata_assignment(adata_he_2020_ctrl, 'He', 2020, 'Ctrl', do_return=True)

In [None]:
adata_he_2020_ctrl.write_h5ad(he_dir + '/adata_he_2020_ctrl.h5')

### Raw data and metadata extraction (lesional samples)

In [None]:
SRA_list = """
SRR11396159
SRR11396160
SRR11396163
SRR11396165
"""

# Written as accession_inj.txt, the file name the prefetch cell below reads
# (it used to be written as accession_old.txt, which nothing reads).
with open(he_dir + '/accession_inj.txt', 'w') as f:
    f.write(SRA_list)

# One metadata row for the single pooled sample 'He2020_inj'.
df = pd.DataFrame({'name': ['He2020_inj'], 'technology': ['10xv2'], 'targetnumcells': [5000]})

df.to_csv(he_dir + '/metadata_inj.tab', sep='\t', index=None)

In [None]:
!cd {he_dir} && cat accession_inj.txt | parallel -j 8 "prefetch {}"

In [None]:
# Same dispatch as for the healthy samples, with lane indices starting at 10
# so the output names do not collide with the L001-L008 files.
adapt_fastq_remote = ray.remote(adapt_fastq)

ray.init(ignore_reinit_error=True, num_cpus=2)

accessions = SRA_list.split('\n')[1:-1]  # drop the empty first and last entries
ret = []
for lane, name in enumerate(accessions, start=10):
    ret.append(adapt_fastq_remote.remote(he_dir, name, lane))
ray.get(ret)

ray.shutdown()

In [None]:
!cd {he_dir} && loompy fromfq He2020_inj.loom He2020_inj {human_gencode_dir} metadata_old.tab \
He2020_L0010_R1_001.fastq.gz He2020_L0010_R2_001.fastq.gz He2020_L0011_R1_001.fastq.gz He2020_L0011_R2_001.fastq.gz \
He2020_L0013_R1_001.fastq.gz He2020_L0013_R2_001.fastq.gz He2020_L0012_R1_001.fastq.gz He2020_L0012_R2_001.fastq.gz

In [None]:
adata_he_2020_AD_LS = sc.read(he_dir + '/He2020_inj.loom')
adata_he_2020_AD_LS = metadata_assignment(adata_he_2020_AD_LS, 'He', 2020, 'AD_LS', do_return=True)

In [None]:
adata_he_2020_AD_LS.write_h5ad(he_dir + '/adata_he_2020_AD_LS.h5')

## Hughes et al. 2020 (human)

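Downloading the raw reads from SRA and processing them did not produce correct matrices, so the processed count matrix from GEO and the accompanying metadata file are downloaded instead.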

In [None]:
!cd {hughes_dir} && wget https://storage.googleapis.com/fc-9fe89f6d-a673-4659-8332-0bdcecf88e03/alexandria_structured_metadata.txt?GoogleAccessId=116798894341-compute%40developer.gserviceaccount.com&Expires=1632753650&Signature=nqwIaNbwIJdKkgwebfZSwN69btGHB%2BSzDbhRbstLDwkbFjGs%2BRThwg1Ibicv7aRdjN1KvJt%2F3w6kH1wht7bpS19a5TDEG9O4vycd%2FpKLdReTTvc10tkNlkT%2FVFTkpZhfSb1AXKxWNVSQsNj4oF5L%2FrXZvHU4DVYSfGxKhFbIjccqdWi%2B04k0cuALBrHzaBQllnOw079Rur3L5TvobxizFvIV77ZlQZ2cUI8iOO1UQ7ZB70Gfdb%2Fxr6UVZHCfb%2FEAthgAHDVk%2FrNx%2FuPvmFb5XKwQEvLeoCzMQ80uixU8L4XHKBL1YSLDoaVDeIc6bjEt86%2Fl1e31Aymh0ERxvzEYrQ%3D%3D
!cd {hughes_dir} && https://ftp.ncbi.nlm.nih.gov/geo/series/GSE150nnn/GSE150672/suppl/GSE150672%5FSkin%5FExpression%5Fcounts%2Ecsv%2Egz

In [None]:
hughes_dir = data_dir + '/hughes_2020'

In [None]:
# Read the count table, transpose it and make gene names unique.
adata_hughes = sc.read(hughes_dir + '/GSE150672_Skin_Expression_counts.csv.gz').transpose()
adata_hughes.var_names_make_unique()
# Store the matrix as CSR. `spr.csr_matrix` is the public name (also used for
# the Gaydosik data); the `scipy.sparse.csr` submodule path is deprecated.
adata_hughes.X = spr.csr_matrix(adata_hughes.X)

In [None]:
# Load the tab-separated metadata table, using its first column as the index.
metadata_hughes = pd.read_csv(hughes_dir+'/alexandria_structured_metadata.txt', sep='\t', index_col=0)
# Drop the first row after the header - presumably a non-cell annotation row; TODO confirm.
metadata_hughes = metadata_hughes.iloc[1:]

In [None]:
adata_hughes.obs.loc[metadata_hughes.index, ['donor_id']] = metadata_hughes.loc[:, ['donor_id']]

In [None]:
# One object per donor (selected through obs['donor_id']), each passed through
# metadata_assignment, then merged with the donor names as batch labels.
list_donors = [
    'Acne1', 'Acne2', 'Acne3', 'Acne4', 'Alopecia1', 'GA1', 'GA2',
    'Leprosy1', 'Leprosy2', 'Leprosy3', 'Leprosy4',
    'Normal1', 'Normal2', 'Normal3',
    'Psoriasis1', 'Psoriasis2', 'Psoriasis3', 'Psoriasis4', 'Psoriasis5',
]

list_adatas = [
    metadata_assignment(adata_hughes[adata_hughes.obs['donor_id'] == donor],
                        'Hughes', 2020, donor, do_return=True)
    for donor in list_donors
]

adata_hughes_all = list_adatas[0].concatenate(
    *list_adatas[1:], batch_key="Internal sample identifier", batch_categories=list_donors)

In [None]:
adata_hughes_all.write_h5ad(hughes_dir + '/adata_hughes_2020_all.h5')

## Kim et al. 2020 (human)

In [None]:
kim_dir = data_dir + '/Kim_2020'
os.makedirs(kim_dir, exist_ok=True)

In [None]:
!cd {kim_dir} && fastq-dump SRR9307706 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307707 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307708 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307709 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307710 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307711 --gzip --split-files

In [None]:
!cd {kim_dir}/injury && fastq-dump SRR9307698 --gzip --split-files

In [None]:
# Metadata table consumed by `loompy fromfq`: one row per sample.
kim_sample_names = [
    'Kim_2020_HC1', 'Kim_2020_HC2', 'Kim_2020_HC3',
    'Kim_2020_HC4', 'Kim_2020_HC5', 'Kim_2020_HC6',
    'Kim_2020_inj',
]
n_kim_samples = len(kim_sample_names)
df = pd.DataFrame({
    'name': kim_sample_names,
    'technology': ['10xv2'] * n_kim_samples,
    'targetnumcells': [1000] * n_kim_samples,
})
df.to_csv(f'{kim_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Rename the split files to the `<sample>_L001_R1/R2_001.fastq.gz` names passed
# to `loompy fromfq` below. For these runs the `_2` file becomes R1 and the `_3`
# file becomes R2; the `_1` files are not used and are deleted in the next cell.
# The injury run is moved out of its sub-folder into kim_dir.
!mv {kim_dir}/injury/SRR9307698_2.fastq.gz {kim_dir}/Kim_2020_inj_L001_R1_001.fastq.gz 
!mv {kim_dir}/injury/SRR9307698_3.fastq.gz {kim_dir}/Kim_2020_inj_L001_R2_001.fastq.gz 

!mv {kim_dir}/SRR9307706_2.fastq.gz {kim_dir}/Kim_2020_HC1_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307706_3.fastq.gz {kim_dir}/Kim_2020_HC1_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307707_2.fastq.gz {kim_dir}/Kim_2020_HC2_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307707_3.fastq.gz {kim_dir}/Kim_2020_HC2_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307708_2.fastq.gz {kim_dir}/Kim_2020_HC3_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307708_3.fastq.gz {kim_dir}/Kim_2020_HC3_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307709_2.fastq.gz {kim_dir}/Kim_2020_HC4_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307709_3.fastq.gz {kim_dir}/Kim_2020_HC4_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307710_2.fastq.gz {kim_dir}/Kim_2020_HC5_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307710_3.fastq.gz {kim_dir}/Kim_2020_HC5_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307711_2.fastq.gz {kim_dir}/Kim_2020_HC6_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307711_3.fastq.gz {kim_dir}/Kim_2020_HC6_L001_R2_001.fastq.gz 

In [None]:
!rm -rf {kim_dir}/*_1.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC1.loom Kim_2020_HC1 {human_gencode_dir} metadata.tab \
Kim_2020_HC1_L001_R1_001.fastq.gz Kim_2020_HC1_L001_R2_001.fastq.gz 

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC2.loom Kim_2020_HC2 {human_gencode_dir} metadata.tab \
Kim_2020_HC2_L001_R1_001.fastq.gz Kim_2020_HC2_L001_R2_001.fastq.gz 

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC3.loom Kim_2020_HC3 {human_gencode_dir} metadata.tab \
Kim_2020_HC3_L001_R1_001.fastq.gz Kim_2020_HC3_L001_R2_001.fastq.gz 

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC4.loom Kim_2020_HC4 {human_gencode_dir} metadata.tab \
Kim_2020_HC4_L001_R1_001.fastq.gz Kim_2020_HC4_L001_R2_001.fastq.gz 

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC5.loom Kim_2020_HC5 {human_gencode_dir} metadata.tab \
Kim_2020_HC5_L001_R1_001.fastq.gz Kim_2020_HC5_L001_R2_001.fastq.gz 

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC6.loom Kim_2020_HC6 {human_gencode_dir} metadata.tab \
Kim_2020_HC6_L001_R1_001.fastq.gz Kim_2020_HC6_L001_R2_001.fastq.gz 

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_inj.loom Kim_2020_inj {human_gencode_dir} metadata.tab \
Kim_2020_inj_L001_R1_001.fastq.gz Kim_2020_inj_L001_R2_001.fastq.gz

In [None]:
def _load_kim_2020_sample(sample_id):
    """Read `Kim_2020_<sample_id>.loom`, make gene names unique and run
    `metadata_assignment` on the object in place."""
    adata = sc.read_loom(f'{kim_dir}/Kim_2020_{sample_id}.loom')
    adata.var_names_make_unique()
    metadata_assignment(adata, 'Kim', 2020, sample_id)
    return adata


adata_kim_HC1 = _load_kim_2020_sample('HC1')
adata_kim_HC2 = _load_kim_2020_sample('HC2')
adata_kim_HC3 = _load_kim_2020_sample('HC3')
adata_kim_HC4 = _load_kim_2020_sample('HC4')
adata_kim_HC5 = _load_kim_2020_sample('HC5')
adata_kim_HC6 = _load_kim_2020_sample('HC6')
adata_kim_inj = _load_kim_2020_sample('inj')

In [None]:
adata_kim_2020_ctrl = sc.AnnData.concatenate(adata_kim_HC1, adata_kim_HC2, adata_kim_HC3, adata_kim_HC4, adata_kim_HC5, adata_kim_HC6, 
                                             batch_key="Internal sample identifier", batch_categories=['HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'HC6'])

In [None]:
adata_kim_2020_ctrl.write_h5ad(kim_dir + '/adata_kim_2020_ctrl.h5')
adata_kim_inj.write_h5ad(kim_dir + '/adata_kim_2020_dress.h5')

## Kim et al. 2021 (human)

In [None]:
kim_dir = data_dir + '/kim_2021'
os.makedirs(kim_dir, exist_ok=True)

In [None]:
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816776/suppl/GSM4816776%5FPoly01%5Fbarcodes%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816776/suppl/GSM4816776%5FPoly01%5Ffeatures%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816776/suppl/GSM4816776%5FPoly01%5Fmatrix%2Emtx%2Egz
    
!mkdir {kim_dir}/GSM4816776
!mv {kim_dir}/GSM4816776_Poly01_barcodes.tsv.gz {kim_dir}/GSM4816776/barcodes.tsv.gz
!mv {kim_dir}/GSM4816776_Poly01_features.tsv.gz {kim_dir}/GSM4816776/features.tsv.gz
!mv {kim_dir}/GSM4816776_Poly01_matrix.mtx.gz {kim_dir}/GSM4816776/matrix.mtx.gz

In [None]:
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816777/suppl/GSM4816777%5FPoly02%5Fbarcodes%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816777/suppl/GSM4816777%5FPoly02%5Ffeatures%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816777/suppl/GSM4816777%5FPoly02%5Fmatrix%2Emtx%2Egz
    
!mkdir {kim_dir}/GSM4816777
!mv {kim_dir}/GSM4816777_Poly02_barcodes.tsv.gz {kim_dir}/GSM4816777/barcodes.tsv.gz
!mv {kim_dir}/GSM4816777_Poly02_features.tsv.gz {kim_dir}/GSM4816777/features.tsv.gz
!mv {kim_dir}/GSM4816777_Poly02_matrix.mtx.gz {kim_dir}/GSM4816777/matrix.mtx.gz

In [None]:
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816778/suppl/GSM4816778%5FPoly03%5Fbarcodes%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816778/suppl/GSM4816778%5FPoly03%5Ffeatures%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816778/suppl/GSM4816778%5FPoly03%5Fmatrix%2Emtx%2Egz
    
!mkdir {kim_dir}/GSM4816778
!mv {kim_dir}/GSM4816778_Poly03_barcodes.tsv.gz {kim_dir}/GSM4816778/barcodes.tsv.gz
!mv {kim_dir}/GSM4816778_Poly03_features.tsv.gz {kim_dir}/GSM4816778/features.tsv.gz
!mv {kim_dir}/GSM4816778_Poly03_matrix.mtx.gz {kim_dir}/GSM4816778/matrix.mtx.gz

In [None]:
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816779/suppl/GSM4816779%5FPoly04%5Fbarcodes%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816779/suppl/GSM4816779%5FPoly04%5Ffeatures%2Etsv%2Egz
!cd {kim_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4816nnn/GSM4816779/suppl/GSM4816779%5FPoly04%5Fmatrix%2Emtx%2Egz
    
!mkdir {kim_dir}/GSM4816779
!mv {kim_dir}/GSM4816779_Poly04_barcodes.tsv.gz {kim_dir}/GSM4816779/barcodes.tsv.gz
!mv {kim_dir}/GSM4816779_Poly04_features.tsv.gz {kim_dir}/GSM4816779/features.tsv.gz
!mv {kim_dir}/GSM4816779_Poly04_matrix.mtx.gz {kim_dir}/GSM4816779/matrix.mtx.gz

In [None]:
# The Kim 2021 samples are loaded in the next cell. The bare
# `adata_kim_GSM4816776` expression that used to be here ran before that
# variable existed and raised NameError on a fresh kernel.

In [None]:
def _load_kim_2021_sample(gsm, sample_id):
    """Read the 10x matrix folder `kim_2021/<gsm>` under data_dir and run
    `metadata_assignment` on the object in place."""
    adata = sc.read_10x_mtx(f'{data_dir}/kim_2021/{gsm}')
    metadata_assignment(adata, 'Kim', 2021, sample_id)
    return adata


# Samples are identified as '0'..'3' in accession order.
adata_kim_GSM4816776 = _load_kim_2021_sample('GSM4816776', '0')
adata_kim_GSM4816777 = _load_kim_2021_sample('GSM4816777', '1')
adata_kim_GSM4816778 = _load_kim_2021_sample('GSM4816778', '2')
adata_kim_GSM4816779 = _load_kim_2021_sample('GSM4816779', '3')

In [None]:
# Merge the four Kim 2021 samples into one object, labelling cells by sample id, and save it.
adata_kim = adata_kim_GSM4816776.concatenate(
    adata_kim_GSM4816777, adata_kim_GSM4816778, adata_kim_GSM4816779,
    batch_key="Internal sample identifier",
    batch_categories=['0', '1', '2', '3'],
)
adata_kim.write_h5ad(f'{kim_dir}/kim_2021_ctrl.h5')

## Liu et al. 2021 (human)

In [None]:
# Working directory for the Liu et al. 2021 files (created if missing).
liu_dir = f'{data_dir}/liu_2021'
os.makedirs(liu_dir, exist_ok=True)

In [None]:
# Expression table, transposed after reading, plus the per-cell metadata table.
csv_liu = pd.read_csv(f'{liu_dir}/liu_2021.csv', index_col=0).T
metadata_liu = pd.read_csv(f'{liu_dir}/liu_2021_metadata.csv', index_col=0)
# Replace '-' with '.' in the metadata index
# (presumably to match the cell names used in the expression table; TODO confirm).
metadata_liu.index = [cell_id.replace('-', '.') for cell_id in metadata_liu.index]

In [None]:
# Wrap the expression table in an AnnData object and attach the metadata table as .obs.
# NOTE(review): this assumes metadata_liu has one row per row of csv_liu, in the same order — confirm.
adata_liu = sc.AnnData(csv_liu)
adata_liu.obs = metadata_liu

In [None]:
# Split the Liu 2021 object per donor/condition batch, annotate each subset with the study
# metadata, merge them back together and finally separate controls from keloid (CASE) samples.
list_donors = [f'{patient}{condition}'
               for patient in ('K007', 'K009', 'K013', 'K012')
               for condition in ('CASE', 'CTRL')]

list_adatas = []
for donor in list_donors:
    adata_donor = metadata_assignment(adata_liu[adata_liu.obs['batch'] == donor].copy(),
                                      'Liu', 2021, donor, do_return=True, do_sparse=False)
    list_adatas.append(adata_donor)

adata_liu_all = list_adatas[0].concatenate(*list_adatas[1:],
                                           batch_key="Internal sample identifier",
                                           batch_categories=list_donors)

adata_liu_2021_ctrl = adata_liu_all[adata_liu_all.obs['Group'] == 'CTRL']
adata_liu_2021_keloid = adata_liu_all[adata_liu_all.obs['Group'] == 'CASE']

In [None]:
# Display a summary of the control subset.
adata_liu_2021_ctrl

In [None]:
# Display a summary of the keloid (CASE) subset.
adata_liu_2021_keloid

In [None]:
# Save the keloid and control subsets as separate files in liu_dir.
adata_liu_2021_keloid.write_h5ad(f'{liu_dir}/adata_liu_2021_keloid.h5')
adata_liu_2021_ctrl.write_h5ad(f'{liu_dir}/adata_liu_2021_ctrl.h5')

## Mariottoni et al. 2021 (human)

In [None]:
# Working directory for the Mariottoni et al. 2021 files (created if missing).
mariottoni_2021_dir = f'{data_dir}/mariottoni_2021'
os.makedirs(mariottoni_2021_dir, exist_ok=True)

In [None]:
for gsm, name in zip(['GSM5352392', 'GSM5352393', 'GSM5352394', 'GSM5352395'], ['HS_1', 'HS_2', 'HS_3', 'HC']):
    !cd {mariottoni_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5352nnn/{gsm}/suppl/{gsm}%5F{name}%2Ebarcodes%2Etsv%2Egz
    !cd {mariottoni_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5352nnn/{gsm}/suppl/{gsm}%5F{name}%2Efeatures%2Etsv%2Egz
    !cd {mariottoni_2021_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5352nnn/{gsm}/suppl/{gsm}%5F{name}%2Ematrix%2Emtx%2Egz

    !mkdir {mariottoni_2021_dir}/{gsm}
    !mv {mariottoni_2021_dir}/{gsm}_{name}.barcodes.tsv.gz {mariottoni_2021_dir}/{gsm}/barcodes.tsv.gz
    !mv {mariottoni_2021_dir}/{gsm}_{name}.features.tsv.gz {mariottoni_2021_dir}/{gsm}/features.tsv.gz
    !mv {mariottoni_2021_dir}/{gsm}_{name}.matrix.mtx.gz {mariottoni_2021_dir}/{gsm}/matrix.mtx.gz

In [None]:
# Read each Mariottoni 2021 sample from its 10x folder, deduplicate gene names
# and attach the study metadata in place.

# HS samples
mariottoni_2021_HS_1 = sc.read_10x_mtx(f'{mariottoni_2021_dir}/GSM5352392')
mariottoni_2021_HS_1.var_names_make_unique()
metadata_assignment(mariottoni_2021_HS_1, 'Mariottoni', 2021, 'HS_1')

mariottoni_2021_HS_2 = sc.read_10x_mtx(f'{mariottoni_2021_dir}/GSM5352393')
mariottoni_2021_HS_2.var_names_make_unique()
metadata_assignment(mariottoni_2021_HS_2, 'Mariottoni', 2021, 'HS_2')

mariottoni_2021_HS_3 = sc.read_10x_mtx(f'{mariottoni_2021_dir}/GSM5352394')
mariottoni_2021_HS_3.var_names_make_unique()
metadata_assignment(mariottoni_2021_HS_3, 'Mariottoni', 2021, 'HS_3')

# HC sample
mariottoni_2021_HC = sc.read_10x_mtx(f'{mariottoni_2021_dir}/GSM5352395')
mariottoni_2021_HC.var_names_make_unique()
metadata_assignment(mariottoni_2021_HC, 'Mariottoni', 2021, 'HC')

In [None]:
# Merge the three HS samples into one object and save it; the single HC sample is saved as the control file.
mariottoni_2021_HS = mariottoni_2021_HS_1.concatenate(
    mariottoni_2021_HS_2, mariottoni_2021_HS_3,
    batch_key="Internal sample identifier",
    batch_categories=['HS_1', 'HS_2', 'HS_3'],
)
mariottoni_2021_HS.write_h5ad(f'{mariottoni_2021_dir}/mariottoni_2021_HS.h5')

mariottoni_2021_HC.write_h5ad(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl.h5')

## Mirizio et al. 2020 (human)

In [None]:
# Working directory for the Mirizio et al. 2020 files (created if missing).
mirizio_dir = f'{data_dir}/mirizio_2020'
os.makedirs(mirizio_dir, exist_ok=True)

In [None]:
# Download the four SRA runs of sample Cryo P1 into mirizio_dir as gzipped, split FASTQ files.
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955136 --gzip --split-files -t 8 # Cryo P1 222
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955137 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955138 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955139 --gzip --split-files -t 8

In [None]:
# Rename the Cryo P1 runs to the <sample>_L00<lane>_R<read>_001 pattern:
# the position of the run gives the lane number, file `_2` becomes R1 and file `_3` becomes R2.
for lane, srr in enumerate(['SRR12955136', 'SRR12955137', 'SRR12955138', 'SRR12955139'], start=1):
    for sra_part, read in (('2', 'R1'), ('3', 'R2')):
        os.rename(f'{mirizio_dir}/{srr}_{sra_part}.fastq.gz',
                  f'{mirizio_dir}/Cryo_P1_L00{lane}_{read}_001.fastq.gz')

In [None]:
# Download the four SRA runs of sample RPMI P1 into mirizio_dir as gzipped, split FASTQ files.
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955140 --gzip --split-files -t 8 # RPMI P1 223
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955141 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955142 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955143 --gzip --split-files -t 8

In [None]:
# Rename the RPMI P1 runs to the <sample>_L00<lane>_R<read>_001 pattern:
# the position of the run gives the lane number, file `_2` becomes R1 and file `_3` becomes R2.
for lane, srr in enumerate(['SRR12955140', 'SRR12955141', 'SRR12955142', 'SRR12955143'], start=1):
    for sra_part, read in (('2', 'R1'), ('3', 'R2')):
        os.rename(f'{mirizio_dir}/{srr}_{sra_part}.fastq.gz',
                  f'{mirizio_dir}/RPMI_P1_L00{lane}_{read}_001.fastq.gz')

In [None]:
# Download the four SRA runs of sample Cryo P2 into mirizio_dir as gzipped, split FASTQ files.
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955144 --gzip --split-files -t 8 # Cryo P2 267
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955145 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955146 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955147 --gzip --split-files -t 8

In [None]:
# Rename the Cryo P2 runs to untrimmed_<sample>_L00<lane>_R<read>_001 (these reads are trimmed in a later cell):
# the position of the run gives the lane number, file `_2` becomes R1 and file `_3` becomes R2.
for lane, srr in enumerate(['SRR12955144', 'SRR12955145', 'SRR12955146', 'SRR12955147'], start=1):
    for sra_part, read in (('2', 'R1'), ('3', 'R2')):
        os.rename(f'{mirizio_dir}/{srr}_{sra_part}.fastq.gz',
                  f'{mirizio_dir}/untrimmed_Cryo_P2_L00{lane}_{read}_001.fastq.gz')

In [None]:
# Download the four SRA runs of sample RPMI P2 into mirizio_dir as gzipped, split FASTQ files.
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955148 --gzip --split-files -t 8 # RPMI P2 268
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955149 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955150 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955151 --gzip --split-files -t 8

In [None]:
# Rename the RPMI P2 runs to untrimmed_<sample>_L00<lane>_R<read>_001 (these reads are trimmed in a later cell):
# the position of the run gives the lane number, file `_2` becomes R1 and file `_3` becomes R2.
for lane, srr in enumerate(['SRR12955148', 'SRR12955149', 'SRR12955150', 'SRR12955151'], start=1):
    for sra_part, read in (('2', 'R1'), ('3', 'R2')):
        os.rename(f'{mirizio_dir}/{srr}_{sra_part}.fastq.gz',
                  f'{mirizio_dir}/untrimmed_RPMI_P2_L00{lane}_{read}_001.fastq.gz')

In [None]:
# Download the four SRA runs of sample Cryo P3 into mirizio_dir as gzipped, split FASTQ files.
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955152 --gzip --split-files -t 8 # Cryo P3 272
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955153 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955154 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955155 --gzip --split-files -t 8

In [None]:
# Rename the Cryo P3 runs to untrimmed_<sample>_L00<lane>_R<read>_001 (these reads are trimmed in a later cell):
# the position of the run gives the lane number, file `_2` becomes R1 and file `_3` becomes R2.
for lane, srr in enumerate(['SRR12955152', 'SRR12955153', 'SRR12955154', 'SRR12955155'], start=1):
    for sra_part, read in (('2', 'R1'), ('3', 'R2')):
        os.rename(f'{mirizio_dir}/{srr}_{sra_part}.fastq.gz',
                  f'{mirizio_dir}/untrimmed_Cryo_P3_L00{lane}_{read}_001.fastq.gz')

In [None]:
# Download the four SRA runs of sample RPMI P3 into mirizio_dir as gzipped, split FASTQ files.
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955156 --gzip --split-files -t 8 # RPMI P3 273
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955157 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955158 --gzip --split-files -t 8
!cd {mirizio_dir} && parallel-fastq-dump -s SRR12955159 --gzip --split-files -t 8

In [None]:
# Rename the RPMI P3 runs to untrimmed_<sample>_L00<lane>_R<read>_001 (these reads are trimmed in a later cell):
# the position of the run gives the lane number, file `_2` becomes R1 and file `_3` becomes R2.
for lane, srr in enumerate(['SRR12955156', 'SRR12955157', 'SRR12955158', 'SRR12955159'], start=1):
    for sra_part, read in (('2', 'R1'), ('3', 'R2')):
        os.rename(f'{mirizio_dir}/{srr}_{sra_part}.fastq.gz',
                  f'{mirizio_dir}/untrimmed_RPMI_P3_L00{lane}_{read}_001.fastq.gz')

In [None]:
# Sample table passed to `loompy fromfq` as metadata.tab:
# one row per Mirizio sample, each declared as 10xv2 with 1000 target cells.
df = pd.DataFrame({
    'name': [f'{medium}_P{patient}' for medium in ('Cryo', 'RPMI') for patient in (1, 2, 3)],
    'technology': ['10xv2'] * 6,
    'targetnumcells': [1000] * 6,
})
df.to_csv(f'{mirizio_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Delete the `_1` FASTQ parts: only the `_2` and `_3` parts were renamed to R1/R2 above.
!rm -rf {mirizio_dir}/*_1.fastq.gz

In [None]:
# NOTE(review): this cell builds a loom file for a Kim 2020 sample (Kim_2020_HC5) inside kim_dir,
# although it sits in the Mirizio 2020 section — it looks like a leftover copy; confirm whether it belongs here.
!cd {kim_dir} && loompy fromfq Kim_2020_HC5.loom Kim_2020_HC5 {human_gencode_dir} metadata.tab \
Kim_2020_HC5_L001_R1_001.fastq.gz Kim_2020_HC5_L001_R2_001.fastq.gz 

In [None]:
# We will trim the bases to get the first 26 bp in R1 and the first 98 in R2.
# `seqtk trimfq -e N` removes N bases from the right end of each read, so the values below
# assume 150 bp raw reads (150 - 124 = 26, 150 - 52 = 98) — TODO confirm read length.
# seqtk writes plain-text FASTQ to stdout; pipe it through gzip so that the content of the
# output files matches their .fastq.gz extension.
for name in ['Cryo_P2', 'Cryo_P3', 'RPMI_P2', 'RPMI_P3']:
    for lane in ['1', '2', '3', '4']:
        os.system(f'seqtk trimfq -e 124 {mirizio_dir}/untrimmed_{name}_L00{lane}_R1_001.fastq.gz | gzip -c > {mirizio_dir}/{name}_L00{lane}_R1_001.fastq.gz')
        os.system(f'seqtk trimfq -e 52  {mirizio_dir}/untrimmed_{name}_L00{lane}_R2_001.fastq.gz | gzip -c > {mirizio_dir}/{name}_L00{lane}_R2_001.fastq.gz')

In [None]:
# Delete the untrimmed FASTQ files now that trimmed copies have been written.
!rm -rf {mirizio_dir}/untrimmed_*.fastq.gz

In [None]:
!cd {mirizio_dir} && loompy fromfq Cryo_P1.loom Cryo_P1 {human_gencode_dir} metadata.tab \
Cryo_P1_L001_R1_001.fastq.gz Cryo_P1_L001_R2_001.fastq.gz Cryo_P1_L002_R1_001.fastq.gz Cryo_P1_L002_R2_001.fastq.gz Cryo_P1_L003_R1_001.fastq.gz Cryo_P1_L003_R2_001.fastq.gz Cryo_P1_L003_R1_001.fastq.gz Cryo_P1_L003_R2_001.fastq.gz 

In [None]:
!cd {mirizio_dir} && loompy fromfq Cryo_P2.loom Cryo_P2 {human_gencode_dir} metadata.tab \
Cryo_P2_L001_R1_001.fastq.gz Cryo_P2_L001_R2_001.fastq.gz Cryo_P2_L002_R1_001.fastq.gz Cryo_P2_L002_R2_001.fastq.gz Cryo_P2_L003_R1_001.fastq.gz Cryo_P2_L003_R2_001.fastq.gz Cryo_P2_L003_R1_001.fastq.gz Cryo_P2_L003_R2_001.fastq.gz 

In [None]:
!cd {mirizio_dir} && loompy fromfq Cryo_P3.loom Cryo_P3 {human_gencode_dir} metadata.tab \
Cryo_P3_L001_R1_001.fastq.gz Cryo_P3_L001_R2_001.fastq.gz Cryo_P3_L002_R1_001.fastq.gz Cryo_P3_L002_R2_001.fastq.gz Cryo_P3_L003_R1_001.fastq.gz Cryo_P3_L003_R2_001.fastq.gz Cryo_P3_L003_R1_001.fastq.gz Cryo_P3_L003_R2_001.fastq.gz 

In [None]:
!cd {mirizio_dir} && loompy fromfq RPMI_P1.loom RPMI_P1 {human_gencode_dir} metadata.tab \
RPMI_P1_L001_R1_001.fastq.gz RPMI_P1_L001_R2_001.fastq.gz RPMI_P1_L002_R1_001.fastq.gz RPMI_P1_L002_R2_001.fastq.gz RPMI_P1_L003_R1_001.fastq.gz RPMI_P1_L003_R2_001.fastq.gz RPMI_P1_L003_R1_001.fastq.gz RPMI_P1_L003_R2_001.fastq.gz 

In [None]:
!cd {mirizio_dir} && loompy fromfq RPMI_P2.loom RPMI_P2 {human_gencode_dir} metadata.tab \
RPMI_P2_L001_R1_001.fastq.gz RPMI_P2_L001_R2_001.fastq.gz RPMI_P2_L002_R1_001.fastq.gz RPMI_P2_L002_R2_001.fastq.gz RPMI_P2_L003_R1_001.fastq.gz RPMI_P2_L003_R2_001.fastq.gz RPMI_P2_L003_R1_001.fastq.gz RPMI_P2_L003_R2_001.fastq.gz 

In [None]:
!cd {mirizio_dir} && loompy fromfq RPMI_P3.loom RPMI_P3 {human_gencode_dir} metadata.tab \
RPMI_P3_L001_R1_001.fastq.gz RPMI_P3_L001_R2_001.fastq.gz RPMI_P3_L002_R1_001.fastq.gz RPMI_P3_L002_R2_001.fastq.gz RPMI_P3_L003_R1_001.fastq.gz RPMI_P3_L003_R2_001.fastq.gz RPMI_P3_L003_R1_001.fastq.gz RPMI_P3_L003_R2_001.fastq.gz 

In [None]:
# Read each Mirizio 2020 loom file, deduplicate gene names and attach the study metadata in place.
# The RPMI samples are registered under the ids 'RPMI_1'..'RPMI_3' (no 'P'), unlike the Cryo ones.

# Cryo samples
adata_mirizio_Cryo_P1 = sc.read_loom(f'{mirizio_dir}/Cryo_P1.loom')
adata_mirizio_Cryo_P1.var_names_make_unique()
metadata_assignment(adata_mirizio_Cryo_P1, 'Mirizio', 2020, 'Cryo_P1')

adata_mirizio_Cryo_P2 = sc.read_loom(f'{mirizio_dir}/Cryo_P2.loom')
adata_mirizio_Cryo_P2.var_names_make_unique()
metadata_assignment(adata_mirizio_Cryo_P2, 'Mirizio', 2020, 'Cryo_P2')

adata_mirizio_Cryo_P3 = sc.read_loom(f'{mirizio_dir}/Cryo_P3.loom')
adata_mirizio_Cryo_P3.var_names_make_unique()
metadata_assignment(adata_mirizio_Cryo_P3, 'Mirizio', 2020, 'Cryo_P3')

# RPMI samples
adata_mirizio_RPMI_P1 = sc.read_loom(f'{mirizio_dir}/RPMI_P1.loom')
adata_mirizio_RPMI_P1.var_names_make_unique()
metadata_assignment(adata_mirizio_RPMI_P1, 'Mirizio', 2020, 'RPMI_1')

adata_mirizio_RPMI_P2 = sc.read_loom(f'{mirizio_dir}/RPMI_P2.loom')
adata_mirizio_RPMI_P2.var_names_make_unique()
metadata_assignment(adata_mirizio_RPMI_P2, 'Mirizio', 2020, 'RPMI_2')

adata_mirizio_RPMI_P3 = sc.read_loom(f'{mirizio_dir}/RPMI_P3.loom')
adata_mirizio_RPMI_P3.var_names_make_unique()
metadata_assignment(adata_mirizio_RPMI_P3, 'Mirizio', 2020, 'RPMI_3')

In [None]:
# Merge the six Mirizio samples into one object, labelling cells by sample id.
adata_mirizio = adata_mirizio_Cryo_P1.concatenate(
    adata_mirizio_Cryo_P2, adata_mirizio_Cryo_P3,
    adata_mirizio_RPMI_P1, adata_mirizio_RPMI_P2, adata_mirizio_RPMI_P3,
    batch_key="Internal sample identifier",
    batch_categories=['Cryo_P1', 'Cryo_P2', 'Cryo_P3', 'RPMI_1', 'RPMI_2', 'RPMI_3'],
)

In [None]:
# Save the merged Mirizio object.
adata_mirizio.write_h5ad(f'{mirizio_dir}/adata_mirizio_2020_scleroderma.h5')

## Philippeos et al. 2018 (human)

In [None]:
# Working directory for the Philippeos et al. 2018 files (created if missing).
phil_dir = f'{data_dir}/Philippeos_2018'
os.makedirs(phil_dir, exist_ok=True)

In [None]:
# Download the GSE109822 CD3145 table (gzipped CSV) into phil_dir.
!wget -P {phil_dir} https://ftp.ncbi.nlm.nih.gov/geo/series/GSE109nnn/GSE109822/suppl/GSE109822%5FCD3145%2Ecsv%2Egz

In [None]:
# Download the GSE109822 CD90 table (gzipped CSV) into phil_dir.
!wget -P {phil_dir} https://ftp.ncbi.nlm.nih.gov/geo/series/GSE109nnn/GSE109822/suppl/GSE109822%5FCD90%2Ecsv%2Egz

In [None]:
# Decompress every .gz file in phil_dir, overwriting existing outputs (-f).
!gunzip {phil_dir}/*.gz -f

## Popescu et al. 2019 (human)

### Direct h5ad download

In [None]:
# Download the processed h5ad file from Zenodo into popescu_dir as popescu_2019.h5ad.
# NOTE(review): `popescu_dir` is not defined anywhere in this section — presumably it is set elsewhere;
# confirm, otherwise define it (and create the directory) before this cell.
!aria2c -x 16 https://zenodo.org/record/4536165/files/fetal_submission.h5ad?download=1 -d {popescu_dir} -o popescu_2019.h5ad

### FASTQ processing

In [None]:
# Download the ArrayExpress E-MTAB-7407 SDRF sample table into popescu_dir as acctable.txt.
!aria2c -x 16 https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-7407/E-MTAB-7407.sdrf.txt -d {popescu_dir} -o acctable.txt

In [None]:
# Load the tab-separated sample table and keep only the rows whose organism part is skin.
popescu_metadata = pd.read_csv(f'{popescu_dir}/acctable.txt', sep='\t')
is_skin = popescu_metadata['Characteristics[organism part]'] == 'skin'
popescu_metadata_skin = popescu_metadata[is_skin].reset_index(drop=True)

In [None]:
# Sample table passed to `loompy fromfq` as metadata.tab. Each name is
# <row index>_<Source Name>_<individual>_<facs sorting>, with '/' and ' ' in the FACS label replaced by '-'.
df = pd.DataFrame({
    'name': [
        f"{row}_{source}_{individual}_{facs_label.replace('/', '-').replace(' ', '-')}"
        for row, (source, individual, facs_label) in enumerate(zip(
            popescu_metadata_skin['Source Name'].values,
            popescu_metadata_skin['Characteristics[individual]'].values,
            popescu_metadata_skin['Characteristics[facs sorting]'].values,
        ))
    ],
    'technology': ['10xv2'] * len(popescu_metadata_skin),
    'targetnumcells': [1000] * len(popescu_metadata_skin),
})

df.to_csv(f'{popescu_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Print the metadata table that was just written, as a visual check.
!cd  {popescu_dir} && cat metadata.tab

In [None]:
# For every skin sample: download its two FASTQ files, build the loom file with loompy and
# delete the FASTQ files again. Samples whose loom file already exists are skipped.
print(len(popescu_metadata_skin))
for idx, (name, indv, facs, f1, f2) in tqdm(enumerate(zip(
        popescu_metadata_skin['Source Name'].values,
        popescu_metadata_skin['Characteristics[individual]'].values,
        popescu_metadata_skin['Characteristics[facs sorting]'].values,
        popescu_metadata_skin['Comment[FASTQ_URI]'].values,
        popescu_metadata_skin['Comment[FASTQ_URI].1'].values))):

    # Same naming scheme as the 'name' column of metadata.tab.
    facs = facs.replace('/', '-').replace(' ', '-')
    str_file = f'{idx}_{name}_{indv}_{facs}'

    if os.path.exists(f'{popescu_dir}/Popescu_2019_{str_file}.loom'):
        print(f'Popescu_2019_{str_file}.loom EXISTS!')
        continue

    # f1 is saved as the R1 file and f2 as the R2 file.
    os.system(f'cd {popescu_dir} && aria2c -x 16 --file-allocation=none {f1} -d {popescu_dir} -o {str_file}_S1_L001_R1_001.fastq.gz')
    os.system(f'cd {popescu_dir} && aria2c -x 16 --file-allocation=none {f2} -d {popescu_dir} -o {str_file}_S1_L001_R2_001.fastq.gz')

    os.system(f'cd {popescu_dir} && loompy fromfq Popescu_2019_{str_file}.loom {str_file} {human_gencode_dir} metadata.tab {str_file}_S1_L001_R1_001.fastq.gz {str_file}_S1_L001_R2_001.fastq.gz ')

    os.system(f'rm {popescu_dir}/{str_file}_S1_L001_R1_001.fastq.gz')
    os.system(f'rm {popescu_dir}/{str_file}_S1_L001_R2_001.fastq.gz')
    

## Reynolds et al. 2021 (human)

In [None]:
# Refer to https://github.com/alexmascension/revisit_reynolds_fb to see the main processing files.

In [None]:
# Read the Reynolds healthy-fibroblast h5ad file (backup_url points to the Zenodo copy)
# and drop its .uns slot.
reynolds_dir = f'{data_dir}/reynolds_2021'

reynolds_2021_ctrl_fb = sc.read(
    f'{reynolds_dir}/adata_reynolds_healthy_fb.h5ad',
    backup_url='https://zenodo.org/record/4708700/files/adata_reynolds_healthy_fb.h5ad?download=1',
)

del reynolds_2021_ctrl_fb.uns

In [None]:
# Use the original sample_id as internal sample identifier, annotate every sample separately
# with the study metadata and merge the annotated samples back into one object.
reynolds_2021_ctrl_fb.obs['Internal sample identifier'] = reynolds_2021_ctrl_fb.obs['sample_id']

list_adatas_control = []
for cat in reynolds_2021_ctrl_fb.obs['Internal sample identifier'].cat.categories:
    adata = metadata_assignment(
        reynolds_2021_ctrl_fb[reynolds_2021_ctrl_fb.obs['Internal sample identifier'] == cat],
        'Reynolds', 2021, cat, do_return=True,
    )
    list_adatas_control.append(adata)

# The batch categories are read from the not-yet-overwritten object on the right-hand side.
reynolds_2021_ctrl_fb = list_adatas_control[0].concatenate(
    *list_adatas_control[1:],
    batch_categories=reynolds_2021_ctrl_fb.obs['Internal sample identifier'].cat.categories,
    batch_key="Internal sample identifier",
)

In [None]:
# Save the annotated Reynolds control fibroblasts.
reynolds_2021_ctrl_fb.write_h5ad(f'{reynolds_dir}/reynolds_2021_ctrl_fb.h5')

## Solé-Boldo et al. 2020 (human)

### Young samples

In [None]:
# Working directory for the Solé-Boldo et al. 2020 files (created if missing).
sole_dir = f'{data_dir}/Sole-Boldo_2020'
os.makedirs(sole_dir, exist_ok=True)

In [None]:
# Download run SRR9036396 (renamed to young sample y1 below) as gzipped, split FASTQ files.
!cd {sole_dir} && fastq-dump SRR9036396 --gzip --split-files

In [None]:
# Download run SRR9036397 (renamed to young sample y2 below) as gzipped, split FASTQ files.
!cd {sole_dir} && fastq-dump SRR9036397 --gzip --split-files

In [None]:
# Sample table passed to `loompy fromfq` as metadata.tab for the two young samples.
df = pd.DataFrame({
    'name': ['SB2020_y1', 'SB2020_y2'],
    'technology': ['10xv2'] * 2,
    'targetnumcells': [1000] * 2,
})
df.to_csv(f'{sole_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Rename the split FASTQ files to <sample>_L00<n>_R<read>_001: part `_1` becomes R1 and part `_2` becomes R2.
!mv {sole_dir}/SRR9036396_1.fastq.gz {sole_dir}/SB2020_y1_L001_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036396_2.fastq.gz {sole_dir}/SB2020_y1_L001_R2_001.fastq.gz 
!mv {sole_dir}/SRR9036397_1.fastq.gz {sole_dir}/SB2020_y2_L002_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036397_2.fastq.gz {sole_dir}/SB2020_y2_L002_R2_001.fastq.gz 

In [None]:
# Build one loom file per young sample from its R1/R2 FASTQ pair.
!cd {sole_dir} && loompy fromfq SB2020_y1.loom SB2020_y1 {human_gencode_dir} metadata.tab SB2020_y1_L001_R1_001.fastq.gz SB2020_y1_L001_R2_001.fastq.gz 

!cd {sole_dir} && loompy fromfq SB2020_y2.loom SB2020_y2 {human_gencode_dir} metadata.tab SB2020_y2_L002_R1_001.fastq.gz SB2020_y2_L002_R2_001.fastq.gz

In [None]:
# Read the two young-donor loom files, deduplicate gene names, attach the study metadata in place,
# then merge both samples and save the result.
adata_sole_young_1 = sc.read_loom(f'{sole_dir}/SB2020_y1.loom')
adata_sole_young_1.var_names_make_unique()
metadata_assignment(adata_sole_young_1, 'Solé-Boldo', 2020, 'y1')

adata_sole_young_2 = sc.read_loom(f'{sole_dir}/SB2020_y2.loom')
adata_sole_young_2.var_names_make_unique()
metadata_assignment(adata_sole_young_2, 'Solé-Boldo', 2020, 'y2')

adata_sole_young = adata_sole_young_1.concatenate(
    adata_sole_young_2,
    batch_key="Internal sample identifier",
    batch_categories=['y1', 'y2'],
)
adata_sole_young.write_h5ad(f'{sole_dir}/adata_sole_2020_young.h5')

### Old samples

In [None]:
# Download run SRR9036398 (renamed to old sample o1 below) as gzipped, split FASTQ files.
!cd {sole_dir} && fastq-dump SRR9036398 --gzip --split-files

In [None]:
# Download run SRR9036399 (renamed to old sample o2 below) as gzipped, split FASTQ files.
!cd {sole_dir} && fastq-dump SRR9036399 --gzip --split-files

In [None]:
# Download run SRR9036400 (renamed to old sample o3 below) as gzipped, split FASTQ files.
!cd {sole_dir} && fastq-dump SRR9036400 --gzip --split-files

In [None]:
# Sample table passed to `loompy fromfq` as metadata.tab for the three old samples.
# It is written to the same path as the table of the young samples, so it replaces that file.
df = pd.DataFrame({
    'name': ['SB2020_o1', 'SB2020_o2', 'SB2020_o3'],
    'technology': ['10xv2'] * 3,
    'targetnumcells': [1000] * 3,
})
df.to_csv(f'{sole_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Rename the split FASTQ files to <sample>_L00<n>_R<read>_001: part `_1` becomes R1 and part `_2` becomes R2.
!mv {sole_dir}/SRR9036398_1.fastq.gz {sole_dir}/SB2020_o1_L001_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036398_2.fastq.gz {sole_dir}/SB2020_o1_L001_R2_001.fastq.gz 
!mv {sole_dir}/SRR9036399_1.fastq.gz {sole_dir}/SB2020_o2_L002_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036399_2.fastq.gz {sole_dir}/SB2020_o2_L002_R2_001.fastq.gz 
!mv {sole_dir}/SRR9036400_1.fastq.gz {sole_dir}/SB2020_o3_L003_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036400_2.fastq.gz {sole_dir}/SB2020_o3_L003_R2_001.fastq.gz 

In [None]:
# Build one loom file per old sample from its R1/R2 FASTQ pair.
!cd {sole_dir} && loompy fromfq SB2020_o1.loom SB2020_o1 {human_gencode_dir} metadata.tab SB2020_o1_L001_R1_001.fastq.gz SB2020_o1_L001_R2_001.fastq.gz

!cd {sole_dir} && loompy fromfq SB2020_o2.loom SB2020_o2 {human_gencode_dir} metadata.tab SB2020_o2_L002_R1_001.fastq.gz SB2020_o2_L002_R2_001.fastq.gz

!cd {sole_dir} && loompy fromfq SB2020_o3.loom SB2020_o3 {human_gencode_dir} metadata.tab SB2020_o3_L003_R1_001.fastq.gz SB2020_o3_L003_R2_001.fastq.gz 

In [None]:
# Read the three old-donor loom files, deduplicate gene names, attach the study metadata in place,
# then merge the samples and save the result.
adata_sole_old_1 = sc.read_loom(f'{sole_dir}/SB2020_o1.loom')
adata_sole_old_1.var_names_make_unique()
metadata_assignment(adata_sole_old_1, 'Solé-Boldo', 2020, 'o1')

adata_sole_old_2 = sc.read_loom(f'{sole_dir}/SB2020_o2.loom')
adata_sole_old_2.var_names_make_unique()
metadata_assignment(adata_sole_old_2, 'Solé-Boldo', 2020, 'o2')

adata_sole_old_3 = sc.read_loom(f'{sole_dir}/SB2020_o3.loom')
adata_sole_old_3.var_names_make_unique()
metadata_assignment(adata_sole_old_3, 'Solé-Boldo', 2020, 'o3')

adata_sole_old = adata_sole_old_1.concatenate(
    adata_sole_old_2, adata_sole_old_3,
    batch_key="Internal sample identifier",
    batch_categories=['o1', 'o2', 'o3'],
)
adata_sole_old.write_h5ad(f'{sole_dir}/adata_sole_2020_old.h5')

## Tabib et al. 2018 (human)

In [None]:
# Working directory for the Tabib et al. 2018 files (created if missing).
tabib_2018_dir = f'{data_dir}/Tabib_2018'
os.makedirs(tabib_2018_dir, exist_ok=True)

In [None]:
!wget -P {tabib_dir} https://dom.pitt.edu/wp-content/uploads/2018/10/Skin_6Control_rawUMI.zip

In [None]:
!wget -P {tabib_dir} https://dom.pitt.edu/wp-content/uploads/2018/10/Skin_6Control_Metadata.zip

In [None]:
!unzip -o {tabib_dir}/Skin_6Control_rawUMI.zip -d {tabib_dir}

In [None]:
!unzip -o {tabib_dir}/Skin_6Control_Metadata.zip -d {tabib_dir}

In [None]:
# Read the raw UMI table and transpose it.
tabib_2018_ctrl = sc.read_csv(f'{tabib_2018_dir}/Skin_6Control_rawUMI.csv').T

In [None]:
# The sample identifier is the part of each observation name before the first underscore.
tabib_2018_ctrl.obs['Internal sample identifier'] = [
    obs_name.split('_', 1)[0] for obs_name in tabib_2018_ctrl.obs_names
]

In [None]:
# Display the sample identifiers derived from the observation names.
tabib_2018_ctrl.obs['Internal sample identifier']

In [None]:
# Annotate every control donor separately with the study metadata and merge the annotated
# donors back into a single object.
list_donors = [f'SC{donor_number}control' for donor_number in (1, 4, 18, 32, 33, 34)]

list_adatas = []
for donor in list_donors:
    adata_donor = metadata_assignment(
        tabib_2018_ctrl[tabib_2018_ctrl.obs['Internal sample identifier'] == donor],
        'Tabib', 2018, donor, do_return=True,
    )
    list_adatas.append(adata_donor)

tabib_2018_ctrl = list_adatas[0].concatenate(
    *list_adatas[1:],
    batch_key="Internal sample identifier",
    batch_categories=list_donors,
)

In [None]:
# Keep only the part of each observation name before the first '-'.
tabib_2018_ctrl.obs_names = [obs_name.partition('-')[0] for obs_name in tabib_2018_ctrl.obs_names]

In [None]:
# Save the annotated Tabib 2018 controls.
tabib_2018_ctrl.write_h5ad(f'{tabib_2018_dir}/adata_tabib_2018_ctrl.h5')

## Tabib et al. 2021 (human)

In [None]:
# Working directory for the Tabib et al. 2021 files (created if missing).
tabib_2021_dir = f'{data_dir}/Tabib_2021'
os.makedirs(tabib_2021_dir, exist_ok=True)

In [None]:
# (sample name, group, GEO accession) triplets of the Tabib 2021 samples, split by group.
GSM_list_control = [
    (sample, 'CONTROL', accession)
    for sample, accession in [
        ('SC1', 'GSM4115868'), ('SC4', 'GSM4115870'), ('SC18', 'GSM4115872'),
        ('SC32', 'GSM4115874'), ('SC33', 'GSM4115875'), ('SC34', 'GSM4115876'),
        ('SC50', 'GSM4115878'), ('SC68', 'GSM4115880'), ('SC124', 'GSM4115885'),
        ('SC125', 'GSM4115886'),
    ]
]
GSM_list_SSC = [
    (sample, 'SSC', accession)
    for sample, accession in [
        ('SC2', 'GSM4115869'), ('SC5', 'GSM4115871'), ('SC19', 'GSM4115873'),
        ('SC49', 'GSM4115877'), ('SC60', 'GSM4115879'), ('SC69', 'GSM4115881'),
        ('SC70', 'GSM4115882'), ('SC86', 'GSM4115883'), ('SC119', 'GSM4115884'),
        ('SC185', 'GSM4115887'), ('SC188', 'GSM4115888'), ('SC189', 'GSM4115889'),
    ]
]

In [None]:
# Download the raw feature-barcode matrix (.h5) of every control and SSC sample into tabib_2021_dir.
for name, _, gsm in [*GSM_list_control, *GSM_list_SSC]:
    os.system(
        f"wget -P {tabib_2021_dir}  "
        f"ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4115nnn/{gsm}/suppl/{gsm}%5F{name}raw%5Ffeature%5Fbc%5Fmatrix%2Eh5"
    )

In [None]:
# Control samples: read each raw matrix, keep cells with at least 150 counts, attach the study
# metadata, merge all samples and keep genes with at least 10 counts in the merged object.
list_adatas_control = []

for name, _, gsm in GSM_list_control:
    adata = sc.read_10x_h5(f"{tabib_2021_dir}/{gsm}_{name}raw_feature_bc_matrix.h5")
    adata.var_names_make_unique()
    sc.pp.filter_cells(adata, min_counts=150)
    list_adatas_control.append(metadata_assignment(adata, 'Tabib', 2021, name, do_return=True))

adata_tabib_2021_ctrl = list_adatas_control[0].concatenate(
    *list_adatas_control[1:],
    batch_categories=[sample for sample, _group, _accession in GSM_list_control],
    batch_key="Internal sample identifier",
)
sc.pp.filter_genes(adata_tabib_2021_ctrl, min_counts=10)

In [None]:
# SSC samples: read each raw matrix, keep cells with at least 150 counts, attach the study
# metadata, merge all samples and keep genes with at least 10 counts in the merged object.
list_adatas_ssc = []

for name, _, gsm in GSM_list_SSC:
    adata = sc.read_10x_h5(f"{tabib_2021_dir}/{gsm}_{name}raw_feature_bc_matrix.h5")
    adata.var_names_make_unique()
    sc.pp.filter_cells(adata, min_counts=150)
    list_adatas_ssc.append(metadata_assignment(adata, 'Tabib', 2021, name, do_return=True))

adata_tabib_2021_ssc = list_adatas_ssc[0].concatenate(
    *list_adatas_ssc[1:],
    batch_categories=[sample for sample, _group, _accession in GSM_list_SSC],
    batch_key="Internal sample identifier",
)
sc.pp.filter_genes(adata_tabib_2021_ssc, min_counts=10)

In [None]:
# Save the merged Tabib 2021 control samples.
adata_tabib_2021_ctrl.write_h5ad(f'{tabib_2021_dir}/adata_tabib_2021_ctrl.h5')

In [None]:
# Save the merged Tabib 2021 SSC samples.
adata_tabib_2021_ssc.write_h5ad(f'{tabib_2021_dir}/adata_tabib_2021_ssc.h5')

## Tabula Sapiens Consortium 2021 (human)

To access these FASTQ files, you must request permission from the CZ Tabula Sapiens team (tabula-sapiens@czbiohub.org) by submitting a data request.

In [None]:
# Working directory for the Tabula Sapiens Consortium 2021 files (created if missing).
tsc_dir = f'{data_dir}/Tabula_Sapiens_Consortium_2021'
os.makedirs(tsc_dir, exist_ok=True)

In [None]:
# Sample table passed to `loompy fromfq` as metadata.tab: four skin samples, declared as 10xv3.
df = pd.DataFrame({
    'name': [
        'TSP10_Skin_NA_10X_1_1_S5',
        'TSP10_Skin_NA_10X_1_2_S6',
        'TSP14_Skin_Chest_10X_1_1_S18',
        'TSP14_Skin_Abdomen_10X_1_1_S17',
    ],
    'technology': ['10xv3'] * 4,
    'targetnumcells': [1000] * 4,
})
df.to_csv(f'{tsc_dir}/metadata.tab', sep='\t', index=None)

In [None]:
# Build TSP10_S5.loom from the R1/R2 FASTQ pairs of lanes L001-L004 of TSP10_Skin_NA_10X_1_1.
!cd {tsc_dir} && loompy fromfq TSP10_S5.loom TSP10_Skin_NA_10X_1_1_S5 {human_gencode_dir} metadata.tab \
Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L001_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L001_R2_001.fastq.gz \
Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L002_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L002_R2_001.fastq.gz \
Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L003_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L003_R2_001.fastq.gz \
Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L004_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_1/TSP10_Skin_NA_10X_1_1_S5_L004_R2_001.fastq.gz

In [None]:
# Build TSP10_S6.loom from the R1/R2 FASTQ pairs of lanes L001-L004 of TSP10_Skin_NA_10X_1_2.
!cd {tsc_dir} && loompy fromfq TSP10_S6.loom TSP10_Skin_NA_10X_1_2_S6 {human_gencode_dir} metadata.tab \
Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L001_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L001_R2_001.fastq.gz \
Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L002_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L002_R2_001.fastq.gz \
Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L003_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L003_R2_001.fastq.gz \
Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L004_R1_001.fastq.gz Pilot_10/TSP10_Skin_NA_10X_1_2/TSP10_Skin_NA_10X_1_2_S6_L004_R2_001.fastq.gz

In [None]:
# Build TSP14_S17.loom from the single R1/R2 FASTQ pair of TSP14_Skin_Abdomen_10X_1_1.
!cd {tsc_dir} && loompy fromfq TSP14_S17.loom TSP14_Skin_Abdomen_10X_1_1_S17 {human_gencode_dir} metadata.tab \
Pilot_14/TSP14_Skin_Abdomen_10X_1_1/TSP14_Skin_Abdomen_10X_1_1_S17_R1_001.fastq.gz Pilot_14/TSP14_Skin_Abdomen_10X_1_1/TSP14_Skin_Abdomen_10X_1_1_S17_R2_001.fastq.gz

In [None]:
# Build TSP14_S18.loom from the single R1/R2 FASTQ pair of TSP14_Skin_Chest_10X_1_1.
!cd {tsc_dir} && loompy fromfq TSP14_S18.loom TSP14_Skin_Chest_10X_1_1_S18 {human_gencode_dir} metadata.tab \
Pilot_14/TSP14_Skin_Chest_10X_1_1/TSP14_Skin_Chest_10X_1_1_S18_R1_001.fastq.gz Pilot_14/TSP14_Skin_Chest_10X_1_1/TSP14_Skin_Chest_10X_1_1_S18_R2_001.fastq.gz

In [None]:
# Read each Tabula Sapiens loom file, deduplicate gene names and attach the study metadata in place.

# Donor TSP10
adata_tsc_2021_ctrl_TSP10_S5 = sc.read_loom(tsc_dir + "/TSP10_S5.loom")
adata_tsc_2021_ctrl_TSP10_S5.var_names_make_unique()
metadata_assignment(adata_tsc_2021_ctrl_TSP10_S5, 'Tabula Sapiens', 2021, "T10_S5")

adata_tsc_2021_ctrl_TSP10_S6 = sc.read_loom(tsc_dir + "/TSP10_S6.loom")
adata_tsc_2021_ctrl_TSP10_S6.var_names_make_unique()
metadata_assignment(adata_tsc_2021_ctrl_TSP10_S6, 'Tabula Sapiens', 2021, "T10_S6")

# Donor TSP14
adata_tsc_2021_ctrl_TSP14_S17 = sc.read_loom(tsc_dir + "/TSP14_S17.loom")
adata_tsc_2021_ctrl_TSP14_S17.var_names_make_unique()
metadata_assignment(adata_tsc_2021_ctrl_TSP14_S17, 'Tabula Sapiens', 2021, "T14_S17")

adata_tsc_2021_ctrl_TSP14_S18 = sc.read_loom(tsc_dir + "/TSP14_S18.loom")
adata_tsc_2021_ctrl_TSP14_S18.var_names_make_unique()
metadata_assignment(adata_tsc_2021_ctrl_TSP14_S18, 'Tabula Sapiens', 2021, "T14_S18")

In [None]:
# Merge the four Tabula Sapiens samples into one object, labelling cells by sample id, and save it.
adata_tsc_2021_ctrl = adata_tsc_2021_ctrl_TSP10_S5.concatenate(
    adata_tsc_2021_ctrl_TSP10_S6, adata_tsc_2021_ctrl_TSP14_S17, adata_tsc_2021_ctrl_TSP14_S18,
    batch_categories=['T10_S5', 'T10_S6', 'T14_S17', 'T14_S18'],
    batch_key="Internal sample identifier",
)
adata_tsc_2021_ctrl.write_h5ad(f'{tsc_dir}/adata_tsc_2021_ctrl.h5')

## The Human Protein Atlas 2021 (human)

The data are extracted from Solé-Boldo (GSE130973).

## Theocharidis et al. 2020 (human)

In [None]:
# Working directory for the Theocharidis et al. 2020 files (created if missing).
# The path ends with '/', so later cells append file names to it directly.
theo_dir = f'{data_dir}/Theocharidis_2020/'
os.makedirs(theo_dir, exist_ok=True)

In [None]:
# The file was obtained by personal request (https://www.dropbox.com/scl/fo/x4106l4nd2s8rrec4mboh/AACGrmqWuvbhXZaNnxOdZh9ja?dl=0)

In [None]:
# Healthy samples: read the four 10x folders, attach the study metadata in place, merge the
# samples, drop genes without any count and save the result.
adata_theo_healthy_1 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/Healthy/H1_080717')
metadata_assignment(adata_theo_healthy_1, 'Theocarditis', 2020, "H1")

adata_theo_healthy_2 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/Healthy/H2_091117')
metadata_assignment(adata_theo_healthy_2, 'Theocarditis', 2020, "H2")

adata_theo_healthy_3 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/Healthy/H3_091117')
metadata_assignment(adata_theo_healthy_3, 'Theocarditis', 2020, "H3")

adata_theo_healthy_4 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/Healthy/H4_100317')
metadata_assignment(adata_theo_healthy_4, 'Theocarditis', 2020, "H4")

adata_theo_healthy = adata_theo_healthy_1.concatenate(
    adata_theo_healthy_2, adata_theo_healthy_3, adata_theo_healthy_4,
    batch_categories=['H1', 'H2', 'H3', 'H4'],
)

sc.pp.filter_genes(adata_theo_healthy, min_counts=1)
adata_theo_healthy.write_h5ad(f'{theo_dir}/adata_theo_ctrl.h5')

In [None]:
# DM samples: read the four 10x folders, attach the study metadata in place, merge the samples,
# drop genes without any count and save the result.
adata_theo_dm_noDFU_1 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/DM/DM1_091117')
metadata_assignment(adata_theo_dm_noDFU_1, 'Theocarditis', 2020, "DM1")

adata_theo_dm_noDFU_2 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/DM/DM2_100317')
metadata_assignment(adata_theo_dm_noDFU_2, 'Theocarditis', 2020, "DM2")

adata_theo_dm_noDFU_3 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/DM/DM3_100317')
metadata_assignment(adata_theo_dm_noDFU_3, 'Theocarditis', 2020, "DM3")

adata_theo_dm_noDFU_4 = sc.read_10x_mtx(f'{theo_dir}Human samples raw_GT_Veves lab/DM/DM4_100317')
metadata_assignment(adata_theo_dm_noDFU_4, 'Theocarditis', 2020, "DM4")

adata_theo_dm_noDFU = adata_theo_dm_noDFU_1.concatenate(
    adata_theo_dm_noDFU_2, adata_theo_dm_noDFU_3, adata_theo_dm_noDFU_4,
    batch_categories=['DM1', 'DM2', 'DM3', 'DM4'],
)

sc.pp.filter_genes(adata_theo_dm_noDFU, min_counts=1)
adata_theo_dm_noDFU.write_h5ad(f'{theo_dir}/adata_theo_dm_noDFU.h5')

In [None]:
# DFU samples: read the four 10x folders, attach the study metadata in place, merge the samples,
# drop genes without any count and save the result.
adata_theo_dm_DFU_1 = sc.read_10x_mtx(theo_dir + 'Human samples raw_GT_Veves lab/DFU/DFU1_091117')
metadata_assignment(adata_theo_dm_DFU_1, 'Theocarditis', 2020, "DFU1")
adata_theo_dm_DFU_2 = sc.read_10x_mtx(theo_dir + 'Human samples raw_GT_Veves lab/DFU/DFU2_091117')
metadata_assignment(adata_theo_dm_DFU_2, 'Theocarditis', 2020, "DFU2")
adata_theo_dm_DFU_3 = sc.read_10x_mtx(theo_dir + 'Human samples raw_GT_Veves lab/DFU/DFU3_031418')
metadata_assignment(adata_theo_dm_DFU_3, 'Theocarditis', 2020, "DFU3")
adata_theo_dm_DFU_4 = sc.read_10x_mtx(theo_dir + 'Human samples raw_GT_Veves lab/DFU/DFU4_031418')
metadata_assignment(adata_theo_dm_DFU_4, 'Theocarditis', 2020, "DFU4")

# Fixed: the DFU objects loaded above are concatenated here. Previously the four noDFU (DM)
# objects were passed, so the saved "DFU" file contained the DM samples under DFU labels.
adata_theo_dm_DFU = sc.AnnData.concatenate(adata_theo_dm_DFU_1, adata_theo_dm_DFU_2, adata_theo_dm_DFU_3, adata_theo_dm_DFU_4, 
                                             batch_categories=['DFU1', 'DFU2', 'DFU3', 'DFU4'])

sc.pp.filter_genes(adata_theo_dm_DFU, min_counts=1)
adata_theo_dm_DFU.write_h5ad(theo_dir + '/adata_theo_dm_DFU.h5')

## Theocharidis et al. 2021 (human)

In [None]:
# Folder holding every file of the 2021 dataset; the trailing slash lets later
# cells append file names directly.
theo_dir_2021 = f"{data_dir}Theocharidis_2021/"
os.makedirs(name=theo_dir_2021, exist_ok=True)

In [None]:
# Sample label -> GEO sample accession, in the order the samples are processed.
# list_g and list_GSM are derived from it so both always stay aligned.
sample_to_gsm = {
    'G1': 'GSM5050521', 'G10': 'GSM5050534', 'G14': 'GSM5050538', 'G16': 'GSM5050540',
    'G18': 'GSM5050542', 'G24': 'GSM5050548', 'G28': 'GSM5050552', 'G29': 'GSM5050553',
    'G31': 'GSM5050555', 'G32': 'GSM5050556', 'G36': 'GSM5050560', 'G40': 'GSM5050564',
    'G43': 'GSM5050567', 'G44': 'GSM5050568', 'G50': 'GSM5050574', 'G1A': 'GSM5050522',
    'G2A': 'GSM5050524', 'G3': 'GSM5050525', 'G3A': 'GSM5050526', 'G4A': 'GSM5050528',
    'G5': 'GSM5050529', 'G38': 'GSM5050562', 'G41': 'GSM5050565', 'G46': 'GSM5050570',
    'G48': 'GSM5050572', 'G2': 'GSM5050523', 'G4': 'GSM5050527', 'G7': 'GSM5050531',
    'G8': 'GSM5050532', 'G12': 'GSM5050536', 'G13': 'GSM5050537', 'G15': 'GSM5050539',
    'G17': 'GSM5050541', 'G23': 'GSM5050547', 'G42': 'GSM5050566', 'G45': 'GSM5050569',
    'G49': 'GSM5050573', 'G6': 'GSM5050530', 'G9': 'GSM5050533', 'G11': 'GSM5050535',
    'G33': 'GSM5050557', 'G34': 'GSM5050558', 'G35': 'GSM5050559', 'G39': 'GSM5050563',
}

# dicts preserve insertion order, so both lists keep the order written above.
list_g = list(sample_to_gsm.keys())
list_GSM = list(sample_to_gsm.values())


In [None]:
# Download and decompress the count matrix of every sample into theo_dir_2021.
# - Samples whose .csv is already present are skipped, so the cell can be re-run
#   without downloading all the files again.
# - wget and gunzip are chained with && and the exit status is checked, so a
#   failed download stops here instead of surfacing later as a missing file.
for g, GSM in zip(list_g, list_GSM):
    csv_path = os.path.join(theo_dir_2021, f"{g}.csv")
    if os.path.exists(csv_path):
        continue

    url = f"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5050nnn/{GSM}/suppl/{GSM}%5F{g}counts%2Ecsv%2Egz"
    status = os.system(f"cd {theo_dir_2021} && wget {url} -O {g}.csv.gz && gunzip {g}.csv.gz")
    if status != 0:
        raise RuntimeError(f"Download or decompression failed for sample {g} ({GSM}); exit status {status}")

In [None]:
# Read every per-sample CSV into an AnnData object and collect them in list_g order.
list_adatas = []

for g in tqdm(list_g):
    # The matrix is transposed right after reading - presumably the CSVs are
    # stored genes x cells; TODO confirm against one of the files.
    adata = sc.read(theo_dir_2021 + f'{g}.csv').transpose()
    # Store the counts as CSR. csr_matrix is taken from the public scipy.sparse
    # namespace; the `scipy.sparse.csr` submodule path is deprecated.
    adata.X = spr.csr_matrix(adata.X)
    metadata_assignment(adata, 'Theocarditis', 2021, g)
    list_adatas.append(adata)

In [None]:
# Merge all samples into one object (cells labelled with their sample name
# through batch_categories) and write the result next to the downloaded files.
theo_2021_output = theo_dir_2021 + 'adata_theo_2021.h5'
adata_theo_2021 = sc.AnnData.concatenate(
    *list_adatas,
    batch_categories=list_g,
)
adata_theo_2021.write_h5ad(theo_2021_output)

## Vorstandlechner et al. 2020 (human)

In [None]:
# Folder for the Vorstandlechner 2020 dataset.
vors_dir = data_dir + '/Vorstandlechner_2020'
# Create this dataset's own folder; the previous version created `sole_dir`
# again (copy-paste slip), leaving vors_dir missing on a fresh run.
os.makedirs(vors_dir, exist_ok=True)

In [None]:
# The file (skin_vorstandlechner.loom) was obtained by personal request from Vorstandlechner; place it in vors_dir before running the next cell.

In [None]:
# Read the loom file; cache=True is passed so scanpy can reuse a cached copy.
vors_loom_path = vors_dir + '/skin_vorstandlechner.loom'
vors_2020_ctrl = sc.read(vors_loom_path, cache=True)

In [None]:
# Each cell name carries its sample id as the second '-'-separated field;
# store it as a categorical obs column.
sample_id_column = 'Internal sample identifier'
barcode_sample_ids = [barcode.split('-')[1] for barcode in vors_2020_ctrl.obs_names]
vors_2020_ctrl.obs[sample_id_column] = barcode_sample_ids
vors_2020_ctrl.obs[sample_id_column] = vors_2020_ctrl.obs[sample_id_column].astype('category')

In [None]:
# Split the object by sample, attach the per-sample metadata, and merge it back.
vors_batches = ['1', '2', '3']

# metadata_assignment is called with do_return=True and its return value is
# what gets collected for each sample subset.
list_adatas = [
    metadata_assignment(
        vors_2020_ctrl[vors_2020_ctrl.obs['Internal sample identifier'] == batch],
        'Vorstandlechner', 2020, batch, do_return=True,
    )
    for batch in vors_batches
]

vors_2020_ctrl = sc.AnnData.concatenate(
    *list_adatas,
    batch_key="Internal sample identifier",
    batch_categories=vors_batches,
)
vors_2020_ctrl.write_h5ad(vors_dir + '/adata_vors_2020_ctrl.h5')

## Xu et al. 2021

In [None]:
# Folder for the Xu 2021 dataset.
xu_2021_dir = data_dir + '/xu_2021'
# Create it up front: the download cells `cd` into this folder and fail on a
# fresh run if it does not exist.
os.makedirs(xu_2021_dir, exist_ok=True)

In [None]:
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-01.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-02.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-03.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-04.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-05.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-06.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-07.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-08.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-09.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-10.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-11.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-12.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-13.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-14.tar.gz --no-check-certificate
!cd {xu_2021_dir} && wget https://download.cncb.ac.cn/OMIX/OMIX691/OMIX691-20-15.tar.gz --no-check-certificate

In [None]:
!cd {xu_2021_dir} && cat *.tar.gz | tar zxvf - -i

In [None]:
# Sample folder names for the two groups (H* and P*).
list_names_healthy = ['H01', 'H02', 'H03', 'H04', 'H05']
list_names_disease = ['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'P07', 'P08', 'P09', 'P10']
list_adatas_healthy, list_adatas_disease = [], []

# Same loading steps for both groups: read the filtered 10x matrix, make gene
# names unique, attach metadata, and append to the group's list.
xu_groups = (
    (list_names_healthy, list_adatas_healthy),
    (list_names_disease, list_adatas_disease),
)
for group_names, group_adatas in xu_groups:
    for name in group_names:
        adata = sc.read_10x_mtx(xu_2021_dir + f'/{name}/outs/filtered_gene_bc_matrices/GRCh38')
        adata.var_names_make_unique()
        metadata_assignment(adata, 'Xu', 2021, name)
        group_adatas.append(adata)

In [None]:
# Merge each group into one object; the sample name of every cell is stored in
# the 'Internal sample identifier' obs column.
adata_xu_2021_healthy = sc.AnnData.concatenate(
    *list_adatas_healthy,
    batch_key='Internal sample identifier',
    batch_categories=list_names_healthy,
)
adata_xu_2021_disease = sc.AnnData.concatenate(
    *list_adatas_disease,
    batch_key='Internal sample identifier',
    batch_categories=list_names_disease,
)

In [None]:
# Save both merged objects inside the dataset folder.
xu_outputs = (
    (adata_xu_2021_healthy, '/xu_2021_healthy.h5'),
    (adata_xu_2021_disease, '/xu_2021_disease.h5'),
)
for xu_adata, xu_file_name in xu_outputs:
    xu_adata.write_h5ad(xu_2021_dir + xu_file_name)