# Data extraction

In this notebook we are going to extract the raw or processed data from all datasets, to later analyze it in different notebooks.

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import mygene
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from tqdm.notebook import tqdm
import triku as tk
pd.set_option('display.max_columns', None)

In [None]:
from cellassign import assign_cats

In [None]:
# Palette for UMAP gene expression plots.
# Sample matplotlib's 'magma' at 80 evenly spaced positions, replace the very
# first colour by a light grey (RGBA 0.88, 0.88, 0.88, 1) and build the final
# colormap from the first 65 samples only.
base_cmap = plt.get_cmap('magma')
magma_colors = [base_cmap(position) for position in np.linspace(0, 1, 80)]
magma_colors[0] = (0.88, 0.88, 0.88, 1)
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma_colors[:65])

In [None]:
# Root folder for every dataset: "<current working directory>/data/"
# (the trailing slash is part of the value).
data_dir = f"{os.getcwd()}/data/"

In [None]:
# Machine-specific locations (absolute paths on the original workstation).
# program_dir is the folder holding external resources; mouse_gencode_dir is
# the index folder passed to `loompy fromfq` further down in the notebook.
program_dir = "/media/seth/SETH_DATA/SETH_Alex/Programs/"
mouse_gencode_dir = program_dir + "mouse_GRCm38_gencode.v31"

## Abbasi et al. 2020

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
abbasi_dir = f"{data_dir}/abassi_2020"
os.makedirs(abbasi_dir, exist_ok=True)

In [None]:
# Download the three supplementary files of GEO sample GSM2910020 (matrix,
# barcodes, genes) into their own sub-folder. Each file is saved under a fixed
# name via `-O`; note the genes file is stored as features.tsv.gz.
os.makedirs(f"{abbasi_dir}/GSM2910020", exist_ok=True)
!cd {abbasi_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2910nnn/GSM2910020/suppl/GSM2910020_sample_4_matrix.mtx.gz -O GSM2910020/matrix.mtx.gz
!cd {abbasi_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2910nnn/GSM2910020/suppl/GSM2910020_sample_4_barcodes.tsv.gz -O GSM2910020/barcodes.tsv.gz
!cd {abbasi_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2910nnn/GSM2910020/suppl/GSM2910020_sample_4_genes.tsv.gz -O GSM2910020/features.tsv.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_abassi_2020 = sc.read_mtx(f"{abbasi_dir}/GSM2910020/matrix.mtx.gz").T

# Both tables are header-less TSVs: barcodes are taken from column 0, variable
# names from column 1 (presumably gene symbols -- confirm against the file).
barcode_table = pd.read_csv(f"{abbasi_dir}/GSM2910020/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{abbasi_dir}/GSM2910020/features.tsv.gz", sep='\t', header=None)
adata_abassi_2020.obs_names = barcode_table[0].values
adata_abassi_2020.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_abassi_2020.var_names_make_unique()

In [None]:
# Persist the assembled object next to the raw downloads.
adata_abassi_2020.write_h5ad(os.path.join(abbasi_dir, "adata_abassi_2020.h5"))

## Buechler et al. 2021

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
buechler_dir = f"{data_dir}/buechler_2021"
os.makedirs(buechler_dir, exist_ok=True)

In [None]:
# Fetch the four FASTQ files of E-MTAB-10315 (R1/R2 for lanes L001 and L002)
# into the dataset folder; wget keeps the remote file names.
!cd {buechler_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L001_R2_001.fastq.gz
!cd {buechler_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L001_R1_001.fastq.gz
    
!cd {buechler_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L002_R1_001.fastq.gz
!cd {buechler_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-10315/LIB5436740_SAM24390211_S4_L002_R2_001.fastq.gz

In [None]:
# One-row sample sheet consumed by `loompy fromfq` (columns: name, technology,
# targetnumcells), written as a tab-separated file without the index column.
df = pd.DataFrame([
    {'name': 'buechler_2021', 'technology': '10xv3', 'targetnumcells': 5000},
])
df.to_csv(f"{buechler_dir}/metadata.tab", sep='\t', index=False)

In [None]:
# Build buechler_2021.loom from the FASTQ files with `loompy fromfq`.
# Arguments, in order: output loom, sample name (must match the `name` column
# of metadata.tab), index folder, metadata file, then the R1/R2 FASTQ files of
# both lanes.
!cd {buechler_dir} && loompy fromfq buechler_2021.loom buechler_2021 {mouse_gencode_dir} metadata.tab \
LIB5436740_SAM24390211_S4_L001_R1_001.fastq.gz LIB5436740_SAM24390211_S4_L001_R2_001.fastq.gz \
LIB5436740_SAM24390211_S4_L002_R1_001.fastq.gz LIB5436740_SAM24390211_S4_L002_R2_001.fastq.gz 

## Efremova, Mirjana (Panglao DB) 2018

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
efremova_dir = f"{data_dir}/efremova_2018"
os.makedirs(efremova_dir, exist_ok=True)

In [None]:
# Fetch the R1/R2 FASTQ pairs of the two E-MTAB-7417 runs (3421STDY7639028 and
# 3421STDY7639029) into the dataset folder; wget keeps the remote file names.
!cd {efremova_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639028_S1_L001_R1_001.fastq.gz
!cd {efremova_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639028_S1_L001_R2_001.fastq.gz
    
!cd {efremova_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639029_S1_L001_R1_001.fastq.gz
!cd {efremova_dir} && wget ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-7417/3421STDY7639029_S1_L001_R2_001.fastq.gz

In [None]:
# Two-row sample sheet consumed by `loompy fromfq`; both samples share the same
# technology and target cell number. Written tab-separated, without the index.
df = pd.DataFrame({'name': ['efremova_2018_S1', 'efremova_2018_S2']}).assign(
    technology='10xv2',
    targetnumcells=5000,
)
df.to_csv(f"{efremova_dir}/metadata.tab", sep='\t', index=False)

In [None]:
# Build efremova_2018_S1.loom from run 3421STDY7639028 (sample name must match
# the `name` column of metadata.tab).
!cd {efremova_dir} && loompy fromfq efremova_2018_S1.loom efremova_2018_S1 {mouse_gencode_dir} metadata.tab \
3421STDY7639028_S1_L001_R1_001.fastq.gz 3421STDY7639028_S1_L001_R2_001.fastq.gz

In [None]:
# Build efremova_2018_S2.loom from run 3421STDY7639029 (sample name must match
# the `name` column of metadata.tab).
!cd {efremova_dir} && loompy fromfq efremova_2018_S2.loom efremova_2018_S2 {mouse_gencode_dir} metadata.tab \
3421STDY7639029_S1_L001_R1_001.fastq.gz 3421STDY7639029_S1_L001_R2_001.fastq.gz

In [None]:
# Load the two loom files generated by `loompy fromfq`.
adata_efremova_2018_S1 = sc.read(f"{efremova_dir}/efremova_2018_S1.loom")
adata_efremova_2018_S2 = sc.read(f"{efremova_dir}/efremova_2018_S2.loom")

# Variable names must be unique in each object before they are concatenated.
adata_efremova_2018_S1.var_names_make_unique()
adata_efremova_2018_S2.var_names_make_unique()

# Stack both samples into one object and store it alongside the loom files.
adata_efremova_2018 = adata_efremova_2018_S1.concatenate(adata_efremova_2018_S2)
adata_efremova_2018.write_h5ad(os.path.join(efremova_dir, "efremova_2018.h5"))

## Haensel et al. 2020

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
# NOTE(review): the folder is named "haensel_2021" while the section header
# says 2020 -- kept as is because other notebooks may rely on the path.
haensel_dir = f"{data_dir}/haensel_2021"
os.makedirs(haensel_dir, exist_ok=True)

In [None]:
# Download matrix, barcodes and genes files of GEO sample GSM4230076
# (Un-Wounded_1) into its own sub-folder, saved under fixed names via `-O`
# (the genes file is stored as features.tsv.gz).
os.makedirs(f"{haensel_dir}/GSM4230076", exist_ok=True)
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230076/suppl/GSM4230076_Un-Wounded_1_scRNA-Seq.mtx.gz -O GSM4230076/matrix.mtx.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230076/suppl/GSM4230076_barcodes_Un-Wounded_1_scRNA-Seq.tsv.gz  -O GSM4230076/barcodes.tsv.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230076/suppl/GSM4230076_genes_Un-Wounded_1_scRNA-Seq.tsv.gz -O GSM4230076/features.tsv.gz

In [None]:
# Download matrix, barcodes and genes files of GEO sample GSM4230077
# (Un-Wounded_2) into its own sub-folder, saved under fixed names via `-O`
# (the genes file is stored as features.tsv.gz).
os.makedirs(f"{haensel_dir}/GSM4230077", exist_ok=True)
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230077/suppl/GSM4230077_Un-Wounded_2_scRNA-Seq.mtx.gz -O GSM4230077/matrix.mtx.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230077/suppl/GSM4230077_barcodes_Un-Wounded_2_scRNA-Seq.tsv.gz -O GSM4230077/barcodes.tsv.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230077/suppl/GSM4230077_genes_Un-Wounded_2_scRNA-Seq.tsv.gz -O GSM4230077/features.tsv.gz

In [None]:
# Download matrix, barcodes and genes files of GEO sample GSM4230078
# (Wounded_1) into its own sub-folder, saved under fixed names via `-O`
# (the genes file is stored as features.tsv.gz).
os.makedirs(f"{haensel_dir}/GSM4230078", exist_ok=True)
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230078/suppl/GSM4230078_Wounded_1_scRNA-Seq.mtx.gz -O GSM4230078/matrix.mtx.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230078/suppl/GSM4230078_barcodes_Wounded_1_scRNA-Seq.tsv.gz -O GSM4230078/barcodes.tsv.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230078/suppl/GSM4230078_genes_Wounded_1_scRNA-Seq.tsv.gz -O GSM4230078/features.tsv.gz

In [None]:
# Download matrix, barcodes and genes files of GEO sample GSM4230079
# (Wounded_2) into its own sub-folder, saved under fixed names via `-O`
# (the genes file is stored as features.tsv.gz).
os.makedirs(f"{haensel_dir}/GSM4230079", exist_ok=True)
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230079/suppl/GSM4230079_Wounded_2_scRNA-Seq.mtx.gz -O GSM4230079/matrix.mtx.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230079/suppl/GSM4230079_barcodes_Wounded_2_scRNA-Seq.tsv.gz -O GSM4230079/barcodes.tsv.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230079/suppl/GSM4230079_genes_Wounded_2_scRNA-Seq.tsv.gz -O GSM4230079/features.tsv.gz

In [None]:
# Download matrix, barcodes and genes files of GEO sample GSM4230080
# (Wounded_3) into its own sub-folder, saved under fixed names via `-O`
# (the genes file is stored as features.tsv.gz).
os.makedirs(f"{haensel_dir}/GSM4230080", exist_ok=True)
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230080/suppl/GSM4230080_Wounded_3_scRNA-Seq.mtx.gz -O GSM4230080/matrix.mtx.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230080/suppl/GSM4230080_barcodes_Wounded_3_scRNA-Seq.tsv.gz -O GSM4230080/barcodes.tsv.gz
!cd {haensel_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4230nnn/GSM4230080/suppl/GSM4230080_genes_Wounded_3_scRNA-Seq.tsv.gz -O GSM4230080/features.tsv.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4230076 = sc.read_mtx(f"{haensel_dir}/GSM4230076/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{haensel_dir}/GSM4230076/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{haensel_dir}/GSM4230076/features.tsv.gz", sep='\t', header=None)
adata_GSM4230076.obs_names = barcode_table[0].values
adata_GSM4230076.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4230076.var_names_make_unique()

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4230077 = sc.read_mtx(f"{haensel_dir}/GSM4230077/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{haensel_dir}/GSM4230077/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{haensel_dir}/GSM4230077/features.tsv.gz", sep='\t', header=None)
adata_GSM4230077.obs_names = barcode_table[0].values
adata_GSM4230077.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4230077.var_names_make_unique()

In [None]:
# Merge the two un-wounded samples (GSM4230076, GSM4230077) and save the result.
adata_haensel = adata_GSM4230076.concatenate(adata_GSM4230077)
adata_haensel.write_h5ad(os.path.join(haensel_dir, "adata_haensel.h5"))

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4230078 = sc.read_mtx(f"{haensel_dir}/GSM4230078/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{haensel_dir}/GSM4230078/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{haensel_dir}/GSM4230078/features.tsv.gz", sep='\t', header=None)
adata_GSM4230078.obs_names = barcode_table[0].values
adata_GSM4230078.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4230078.var_names_make_unique()

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4230079 = sc.read_mtx(f"{haensel_dir}/GSM4230079/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{haensel_dir}/GSM4230079/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{haensel_dir}/GSM4230079/features.tsv.gz", sep='\t', header=None)
adata_GSM4230079.obs_names = barcode_table[0].values
adata_GSM4230079.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4230079.var_names_make_unique()

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4230080 = sc.read_mtx(f"{haensel_dir}/GSM4230080/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{haensel_dir}/GSM4230080/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{haensel_dir}/GSM4230080/features.tsv.gz", sep='\t', header=None)
adata_GSM4230080.obs_names = barcode_table[0].values
adata_GSM4230080.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4230080.var_names_make_unique()

In [None]:
# Merge the three wounded samples (GSM4230078-80) and save the result.
adata_haensel_wounded = adata_GSM4230078.concatenate(adata_GSM4230079, adata_GSM4230080)
adata_haensel_wounded.write_h5ad(os.path.join(haensel_dir, "adata_haensel_wounded.h5"))

## Ma et al. 2020

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
ma_dir = f"{data_dir}/ma_2020"
os.makedirs(ma_dir, exist_ok=True)

In [None]:
# (Removed a stray `adata_GSM4331840` expression: no variable of that name is
# defined in this notebook -- the sample is loaded below as
# `adata_GSM4331840_M_Y` -- so the cell raised NameError on Restart & Run All.)

In [None]:
# Download barcodes, genes and matrix files of GEO sample GSM4331840
# (Skin-M-Y) into its own sub-folder, saved under fixed names via `-O`.
# The URLs are percent-encoded (%5F = "_", %2D = "-", %2E = ".").
os.makedirs(f"{ma_dir}/GSM4331840", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331840/suppl/GSM4331840%5FSkin%2DM%2DY%5Fbarcodes%2Etsv%2Egz -O GSM4331840/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331840/suppl/GSM4331840%5FSkin%2DM%2DY%5Fgenes%2Etsv%2Egz -O GSM4331840/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331840/suppl/GSM4331840%5FSkin%2DM%2DY%5Fmatrix%2Emtx%2Egz -O GSM4331840/matrix.mtx.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4331840_M_Y = sc.read_mtx(f"{ma_dir}/GSM4331840/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{ma_dir}/GSM4331840/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{ma_dir}/GSM4331840/features.tsv.gz", sep='\t', header=None)
adata_GSM4331840_M_Y.obs_names = barcode_table[0].values
adata_GSM4331840_M_Y.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4331840_M_Y.var_names_make_unique()

In [None]:
# Download barcodes, genes and matrix files of GEO sample GSM4331841
# (Skin-M-O) into its own sub-folder, saved under fixed names via `-O`.
# The URLs are percent-encoded (%5F = "_", %2D = "-", %2E = ".").
os.makedirs(f"{ma_dir}/GSM4331841", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331841/suppl/GSM4331841%5FSkin%2DM%2DO%5Fbarcodes%2Etsv%2Egz -O GSM4331841/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331841/suppl/GSM4331841%5FSkin%2DM%2DO%5Fgenes%2Etsv%2Egz -O GSM4331841/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331841/suppl/GSM4331841%5FSkin%2DM%2DO%5Fmatrix%2Emtx%2Egz -O GSM4331841/matrix.mtx.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4331841_M_O = sc.read_mtx(f"{ma_dir}/GSM4331841/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{ma_dir}/GSM4331841/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{ma_dir}/GSM4331841/features.tsv.gz", sep='\t', header=None)
adata_GSM4331841_M_O.obs_names = barcode_table[0].values
adata_GSM4331841_M_O.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4331841_M_O.var_names_make_unique()

In [None]:
# Download barcodes, genes and matrix files of GEO sample GSM4331842
# (Skin-M-CR) into its own sub-folder, saved under fixed names via `-O`.
# The URLs are percent-encoded (%5F = "_", %2D = "-", %2E = ".").
os.makedirs(f"{ma_dir}/GSM4331842", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331842/suppl/GSM4331842%5FSkin%2DM%2DCR%5Fbarcodes%2Etsv%2Egz -O GSM4331842/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331842/suppl/GSM4331842%5FSkin%2DM%2DCR%5Fgenes%2Etsv%2Egz -O GSM4331842/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331842/suppl/GSM4331842%5FSkin%2DM%2DCR%5Fmatrix%2Emtx%2Egz -O GSM4331842/matrix.mtx.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4331842_M_CR = sc.read_mtx(f"{ma_dir}/GSM4331842/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{ma_dir}/GSM4331842/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{ma_dir}/GSM4331842/features.tsv.gz", sep='\t', header=None)
adata_GSM4331842_M_CR.obs_names = barcode_table[0].values
adata_GSM4331842_M_CR.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4331842_M_CR.var_names_make_unique()

In [None]:
# Download barcodes, genes and matrix files of GEO sample GSM4331843
# (Skin-F-Y) into its own sub-folder, saved under fixed names via `-O`.
# The URLs are percent-encoded (%5F = "_", %2D = "-", %2E = ".").
os.makedirs(f"{ma_dir}/GSM4331843", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331843/suppl/GSM4331843%5FSkin%2DF%2DY%5Fbarcodes%2Etsv%2Egz -O GSM4331843/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331843/suppl/GSM4331843%5FSkin%2DF%2DY%5Fgenes%2Etsv%2Egz -O GSM4331843/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331843/suppl/GSM4331843%5FSkin%2DF%2DY%5Fmatrix%2Emtx%2Egz -O GSM4331843/matrix.mtx.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4331843_F_Y = sc.read_mtx(f"{ma_dir}/GSM4331843/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{ma_dir}/GSM4331843/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{ma_dir}/GSM4331843/features.tsv.gz", sep='\t', header=None)
adata_GSM4331843_F_Y.obs_names = barcode_table[0].values
adata_GSM4331843_F_Y.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4331843_F_Y.var_names_make_unique()

In [None]:
# Download barcodes, genes and matrix files of GEO sample GSM4331844
# (Skin-F-O) into its own sub-folder, saved under fixed names via `-O`.
# The URLs are percent-encoded (%5F = "_", %2D = "-", %2E = ".").
os.makedirs(f"{ma_dir}/GSM4331844", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331844/suppl/GSM4331844%5FSkin%2DF%2DO%5Fbarcodes%2Etsv%2Egz -O GSM4331844/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331844/suppl/GSM4331844%5FSkin%2DF%2DO%5Fgenes%2Etsv%2Egz -O GSM4331844/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331844/suppl/GSM4331844%5FSkin%2DF%2DO%5Fmatrix%2Emtx%2Egz -O GSM4331844/matrix.mtx.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4331844_F_O = sc.read_mtx(f"{ma_dir}/GSM4331844/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{ma_dir}/GSM4331844/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{ma_dir}/GSM4331844/features.tsv.gz", sep='\t', header=None)
adata_GSM4331844_F_O.obs_names = barcode_table[0].values
adata_GSM4331844_F_O.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4331844_F_O.var_names_make_unique()

In [None]:
# Download barcodes, genes and matrix files of GEO sample GSM4331845
# (Skin-F-CR) into its own sub-folder, saved under fixed names via `-O`.
# The URLs are percent-encoded (%5F = "_", %2D = "-", %2E = ".").
os.makedirs(f"{ma_dir}/GSM4331845", exist_ok=True)
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331845/suppl/GSM4331845%5FSkin%2DF%2DCR%5Fbarcodes%2Etsv%2Egz -O GSM4331845/barcodes.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331845/suppl/GSM4331845%5FSkin%2DF%2DCR%5Fgenes%2Etsv%2Egz -O GSM4331845/features.tsv.gz
!cd {ma_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4331nnn/GSM4331845/suppl/GSM4331845%5FSkin%2DF%2DCR%5Fmatrix%2Emtx%2Egz -O GSM4331845/matrix.mtx.gz

In [None]:
# Read the sparse matrix and transpose it so that the entries of features.tsv
# label the variables and the entries of barcodes.tsv label the observations.
adata_GSM4331845_F_CR = sc.read_mtx(f"{ma_dir}/GSM4331845/matrix.mtx.gz").T

# Header-less TSVs: barcodes come from column 0, variable names from column 1.
barcode_table = pd.read_csv(f"{ma_dir}/GSM4331845/barcodes.tsv.gz", sep='\t', header=None)
gene_table = pd.read_csv(f"{ma_dir}/GSM4331845/features.tsv.gz", sep='\t', header=None)
adata_GSM4331845_F_CR.obs_names = barcode_table[0].values
adata_GSM4331845_F_CR.var_names = gene_table[1].values

# Repeated variable names are disambiguated in place.
adata_GSM4331845_F_CR.var_names_make_unique()

In [None]:
# Pair the M and F samples that share a condition suffix (Y, O, CR) and
# concatenate each pair into a single object.
adata_ma_Y = adata_GSM4331840_M_Y.concatenate(adata_GSM4331843_F_Y)
adata_ma_O = adata_GSM4331841_M_O.concatenate(adata_GSM4331844_F_O)
adata_ma_CR = adata_GSM4331842_M_CR.concatenate(adata_GSM4331845_F_CR)

In [None]:
# Store one file per condition in the dataset folder.
adata_ma_Y.write_h5ad(os.path.join(ma_dir, "adata_ma_Y.h5"))
adata_ma_O.write_h5ad(os.path.join(ma_dir, "adata_ma_O.h5"))
adata_ma_CR.write_h5ad(os.path.join(ma_dir, "adata_ma_CR.h5"))

## Phan et al. 2020

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
phan_dir = f"{data_dir}/phan_2020"
os.makedirs(phan_dir, exist_ok=True)

In [None]:
# Download the gzipped loom files of the three P21 samples
# (GSM4647788, GSM4647789, GSM4647790); wget keeps the remote file names.
!cd {phan_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4647nnn/GSM4647788/suppl/GSM4647788_P21_1.loom.gz
!cd {phan_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4647nnn/GSM4647789/suppl/GSM4647789_P21_2.loom.gz
!cd {phan_dir} && wget https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4647nnn/GSM4647790/suppl/GSM4647790_P21_3.loom.gz

In [None]:
!cd {phan_dir} && gunzip *

In [None]:
# Load the three decompressed loom files ...
adata_phan_2020_1 = sc.read(f"{phan_dir}/GSM4647788_P21_1.loom")
adata_phan_2020_2 = sc.read(f"{phan_dir}/GSM4647789_P21_2.loom")
adata_phan_2020_3 = sc.read(f"{phan_dir}/GSM4647790_P21_3.loom")

# ... and disambiguate repeated variable names in each of them (in place).
for phan_sample in (adata_phan_2020_1, adata_phan_2020_2, adata_phan_2020_3):
    phan_sample.var_names_make_unique()

In [None]:
# Stack the three P21 samples into a single object.
adata_phan_2020 = adata_phan_2020_1.concatenate(adata_phan_2020_2, adata_phan_2020_3)

In [None]:
# Persist the merged object in the dataset folder.
adata_phan_2020.write_h5ad(os.path.join(phan_dir, "adata_phan_2020.h5"))

## Shook et al. 2020

In [None]:
# Folder that receives every file of this dataset; created on first run and
# left untouched when it already exists.
shook_dir = f"{data_dir}/shook_2020"
os.makedirs(shook_dir, exist_ok=True)

In [None]:
# SRA run accession -> sample name used for the loom files and metadata.tab.
# Runs are listed per condition; samples are numbered S1..S5 in list order.
# Note that SRR10480642 does not appear in either list.
non_wounded_runs = ['SRR10480641', 'SRR10480643', 'SRR10480644', 'SRR10480645', 'SRR10480646']
wounded_runs = ['SRR10480636', 'SRR10480637', 'SRR10480638', 'SRR10480639', 'SRR10480640']

dict_names = {}
for sample_idx, run_id in enumerate(non_wounded_runs, start=1):
    dict_names[run_id] = f"Non_Wounded_S{sample_idx}"
for sample_idx, run_id in enumerate(wounded_runs, start=1):
    dict_names[run_id] = f"Wounded_S{sample_idx}"

In [None]:
# Sample sheet consumed by `loompy fromfq`: one row per sample name, all with
# the same technology and target cell number. Written tab-separated, without
# the index column.
df = pd.DataFrame({'name': list(dict_names.values())}).assign(
    technology='10xv3',
    targetnumcells=5000,
)
df.to_csv(f"{shook_dir}/metadata.tab", sep='\t', index=False)

In [None]:
# Download every SRA run listed in dict_names as gzipped, split FASTQ files
# (8 threads per run). `name` is not used in this loop; only the accession is.
for SRR, name in dict_names.items():
    !cd {shook_dir} && parallel-fastq-dump -s {SRR} --gzip --split-files -t 8 

In [None]:
# Build one loom file per sample from the split FASTQ files of its run
# ({SRR}_1 / {SRR}_2). The sample name must match the `name` column of
# metadata.tab.
for SRR, name in dict_names.items():
    !cd {shook_dir} && loompy fromfq {name}.loom {name} {mouse_gencode_dir} metadata.tab {shook_dir}/{SRR}_1.fastq.gz {shook_dir}/{SRR}_2.fastq.gz

In [None]:
# Load the loom file of each of the five non-wounded samples and disambiguate
# repeated variable names (in place) so the objects can be concatenated.
adata_Non_Wounded_S1 = sc.read(f"{shook_dir}/Non_Wounded_S1.loom")
adata_Non_Wounded_S1.var_names_make_unique()

# Fixed: this sample was previously read from Non_Wounded_S3.loom, which
# duplicated sample 3 and silently dropped sample 2 from the merged object.
adata_Non_Wounded_S2 = sc.read(f"{shook_dir}/Non_Wounded_S2.loom")
adata_Non_Wounded_S2.var_names_make_unique()

adata_Non_Wounded_S3 = sc.read(f"{shook_dir}/Non_Wounded_S3.loom")
adata_Non_Wounded_S3.var_names_make_unique()

adata_Non_Wounded_S4 = sc.read(f"{shook_dir}/Non_Wounded_S4.loom")
adata_Non_Wounded_S4.var_names_make_unique()

adata_Non_Wounded_S5 = sc.read(f"{shook_dir}/Non_Wounded_S5.loom")
adata_Non_Wounded_S5.var_names_make_unique()

In [None]:
# Merge the five non-wounded samples (in S1..S5 order) and save the result.
adata_shook_NW = adata_Non_Wounded_S1.concatenate(
    adata_Non_Wounded_S2,
    adata_Non_Wounded_S3,
    adata_Non_Wounded_S4,
    adata_Non_Wounded_S5,
)
adata_shook_NW.write_h5ad(os.path.join(shook_dir, "adata_shook_NW.h5"))