# Data extraction

In this notebook we extract the raw or processed data from each dataset so that it can be analyzed later in separate notebooks.

In [None]:
import gzip
import os
import shutil
import subprocess

import pandas as pd
import ray
import scanpy as sc
from tqdm.notebook import tqdm

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# Every dataset folder below is created under the current working directory.
data_dir = os.getcwd()

In [None]:
# Index directory handed to the `loompy fromfq` calls in this notebook.
# Set the HUMAN_GENCODE_DIR environment variable to use your own copy; the
# original author's local path is kept as the fallback so existing setups still work.
human_gencode_dir = os.environ.get(
    "HUMAN_GENCODE_DIR",
    "/media/seth/SETH_DATA/SETH_Alex/Programs/human_GRCh38_gencode.v31.600",
)

## Tabib et al. 2018

In [None]:
# Folder that receives the Tabib et al. 2018 downloads.
tabib_dir = data_dir + '/Tabib_2018'
os.makedirs(tabib_dir, exist_ok=True)

In [None]:
# Archive named Skin_6Control_rawUMI.zip, saved into tabib_dir (-P).
!wget -P {tabib_dir} https://dom.pitt.edu/wp-content/uploads/2018/10/Skin_6Control_rawUMI.zip

In [None]:
# Archive named Skin_6Control_Metadata.zip, saved into tabib_dir (-P).
!wget -P {tabib_dir} https://dom.pitt.edu/wp-content/uploads/2018/10/Skin_6Control_Metadata.zip

In [None]:
# -o overwrites files left by a previous extraction without prompting.
!unzip -o {tabib_dir}/Skin_6Control_rawUMI.zip -d {tabib_dir}

In [None]:
!unzip -o {tabib_dir}/Skin_6Control_Metadata.zip -d {tabib_dir}

## Philippeos et al. 2018

In [None]:
# Folder that receives the Philippeos et al. 2018 downloads.
phil_dir = data_dir + '/Philippeos_2018'
os.makedirs(phil_dir, exist_ok=True)

In [None]:
# GEO series GSE109822 supplementary file (URL-encoded name: GSE109822_CD3145.csv.gz).
!wget -P {phil_dir} https://ftp.ncbi.nlm.nih.gov/geo/series/GSE109nnn/GSE109822/suppl/GSE109822%5FCD3145%2Ecsv%2Egz

In [None]:
# GEO series GSE109822 supplementary file (URL-encoded name: GSE109822_CD90.csv.gz).
!wget -P {phil_dir} https://ftp.ncbi.nlm.nih.gov/geo/series/GSE109nnn/GSE109822/suppl/GSE109822%5FCD90%2Ecsv%2Egz

In [None]:
# Decompress every .gz in the folder; -f forces overwriting existing outputs.
!gunzip {phil_dir}/*.gz -f

## Solé-Boldo et al. 2020

In [None]:
# Folder that receives the Solé-Boldo et al. 2020 FASTQ files and loom output.
sole_dir = data_dir + '/Sole-Boldo_2020'
os.makedirs(sole_dir, exist_ok=True)

In [None]:
# --split-files writes one FASTQ per read (the renames below expect _1 and _2).
!cd {sole_dir} && fastq-dump SRR9036396 --gzip --split-files

In [None]:
!cd {sole_dir} && fastq-dump SRR9036397 --gzip --split-files

In [None]:
# One-row sample sheet read by `loompy fromfq`; 'name' matches the sample id used in that call.
df = pd.DataFrame({'name': ['SB2020'], 'technology': ['10xv2'], 'targetnumcells': [1000]})
df.to_csv(sole_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# Rename the two runs to the <sample>_L00x_R{1,2}_001.fastq.gz names used in the loompy call.
!mv {sole_dir}/SRR9036396_1.fastq.gz {sole_dir}/SB2020_L001_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036396_2.fastq.gz {sole_dir}/SB2020_L001_R2_001.fastq.gz 
!mv {sole_dir}/SRR9036397_1.fastq.gz {sole_dir}/SB2020_L002_R1_001.fastq.gz 
!mv {sole_dir}/SRR9036397_2.fastq.gz {sole_dir}/SB2020_L002_R2_001.fastq.gz 

In [None]:
# Arguments: output loom, sample id, index directory, sample sheet, then the R1/R2 files per lane.
!cd {sole_dir} && loompy fromfq SB2020.loom SB2020 {human_gencode_dir} metadata.tab \
SB2020_L002_R1_001.fastq.gz SB2020_L002_R2_001.fastq.gz SB2020_L001_R1_001.fastq.gz SB2020_L001_R2_001.fastq.gz 

## Vorstandlechner et al. 2020

In [None]:
# Folder for the Vorstandlechner et al. 2020 file (obtained by personal request).
vors_dir = data_dir + '/Vorstandlechner_2020'
# Create vors_dir itself; the previous call passed sole_dir, so this folder was never made.
os.makedirs(vors_dir, exist_ok=True)

In [None]:
# The file was obtained by personal request from Vorstandlechner

## He et al. 2020

### Raw data and metadata extraction (young samples)

In [None]:
# Folder that receives the He et al. 2020 downloads and loom output.
he_dir = data_dir + '/He_2020'
os.makedirs(he_dir, exist_ok=True)

In [None]:
# WARNING: destructive. Wipes everything previously stored in he_dir so the
# section starts from a clean folder. The folder is re-created immediately,
# because the following cells write accession.txt / metadata.tab into it.
!rm -rf {he_dir}
os.makedirs(he_dir, exist_ok=True)

In [None]:
# Run accessions processed in this sub-section. The leading/trailing newlines are
# kept because a later cell slices SRA_list.split('\n')[1:-1].
SRA_list = """
SRR11396171
SRR11396175
SRR11396162
SRR11396164
SRR11396166
SRR11396167
SRR11396168
SRR11396170
"""

# Write one accession per line without the blank first line, so the
# `cat accession.txt | parallel ...` cell is not fed an empty argument.
with open(he_dir + '/accession.txt', 'w') as f:
    f.write(SRA_list.strip() + '\n')

# Sample sheet for `loompy fromfq`: a single sample (He2020), hence a single row.
# All columns must have the same length, otherwise pandas raises ValueError.
df = pd.DataFrame({'name': ['He2020'], 'technology': ['10xv2'], 'targetnumcells': [5000]})

df.to_csv(he_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# IPython expands {name} inside `!` commands, so GNU parallel's `{}` placeholder
# has to be written `{{}}` to reach the shell as a literal `{}`. With a bare `{}`
# the expansion of the whole line (including {he_dir}) is abandoned.
!cd {he_dir} && cat accession.txt | parallel -j 8 "prefetch {{}}"

In [None]:
def adapt_fastq(filename_dir, filename_root, idx, r1_length=26):
    """Split a single-file FASTQ into R1/R2 lane files and gzip all three files.

    Reads ``{filename_dir}/{filename_root}.fastq`` and writes
    ``He2020_L00{idx}_R1_001.fastq`` and ``He2020_L00{idx}_R2_001.fastq`` in the
    same directory. Within every 4-line record, the header line and the ``+``
    separator line are copied unchanged to both outputs; the sequence and
    quality lines are cut at ``r1_length``: the first ``r1_length`` characters
    go to R1 and the remainder to R2. Finally the input and both outputs are
    compressed to ``.gz`` and the uncompressed files are removed.

    Parameters
    ----------
    filename_dir : str
        Directory holding the input FASTQ; outputs are written there too.
    filename_root : str
        Input file name without the ``.fastq`` extension.
    idx : int
        Lane number interpolated into the output names as ``L00{idx}``.
    r1_length : int, default 26
        Number of leading characters assigned to R1.
        NOTE(review): 26 presumably corresponds to the barcode + UMI length of
        the '10xv2' chemistry declared in metadata.tab -- confirm.

    Raises
    ------
    OSError
        If the input cannot be read or an output cannot be written.
    """
    src_path = f'{filename_dir}/{filename_root}.fastq'
    r1_path = f'{filename_dir}/He2020_L00{idx}_R1_001.fastq'
    r2_path = f'{filename_dir}/He2020_L00{idx}_R2_001.fastq'

    print(src_path, r1_path)

    # Context managers close all three handles even if a write fails.
    with open(src_path, 'r') as filein, \
            open(r1_path, 'w') as fileR1, \
            open(r2_path, 'w') as fileR2:
        # Iterating the file stops cleanly at EOF, so no spurious empty
        # line is appended after the last record.
        for line_no, line in enumerate(filein, start=1):
            text = line.rstrip('\n')
            if line_no % 4 in (1, 3):
                # Header ('@...') and separator ('+') lines: same in both mates.
                fileR1.write(text + '\n')
                fileR2.write(text + '\n')
            else:
                # Sequence and quality lines: split at the same offset so the
                # bases and their qualities stay aligned.
                fileR1.write(text[:r1_length] + '\n')
                fileR2.write(text[r1_length:] + '\n')

    # Compress with the standard library (errors raise instead of being
    # silently ignored) and drop the uncompressed file, like `gzip <file>`.
    for path in (src_path, r1_path, r2_path):
        with open(path, 'rb') as raw, gzip.open(path + '.gz', 'wb', compresslevel=6) as packed:
            shutil.copyfileobj(raw, packed)
        os.remove(path)

In [None]:
# Turn the converter into a ray task so several accessions are processed at once.
adapt_fastq_remote = ray.remote(adapt_fastq)

ray.init(ignore_reinit_error=True, num_cpus=2)

# Drop the empty first/last entries produced by the newlines that wrap SRA_list.
accessions = SRA_list.split('\n')[1:-1]

# Lanes are numbered from 1 in the order the accessions are listed.
ret = [
    adapt_fastq_remote.remote(f'{he_dir}', f'{accession}', lane)
    for lane, accession in enumerate(accessions, start=1)
]
# Block until every task has finished.
ray.get(ret)

ray.shutdown()

In [None]:
# Arguments: output loom, sample id, index directory, sample sheet, then R1/R2 for lanes 1-8.
# The final line carries no trailing backslash: a line continuation with nothing
# after it leaves a dangling `\` at the end of the command.
!cd {he_dir} && loompy fromfq He2020.loom He2020 {human_gencode_dir} metadata.tab \
He2020_L001_R1_001.fastq.gz He2020_L001_R2_001.fastq.gz He2020_L002_R1_001.fastq.gz He2020_L002_R2_001.fastq.gz \
He2020_L003_R1_001.fastq.gz He2020_L003_R2_001.fastq.gz He2020_L004_R1_001.fastq.gz He2020_L004_R2_001.fastq.gz \
He2020_L005_R1_001.fastq.gz He2020_L005_R2_001.fastq.gz He2020_L006_R1_001.fastq.gz He2020_L006_R2_001.fastq.gz \
He2020_L007_R1_001.fastq.gz He2020_L007_R2_001.fastq.gz He2020_L008_R1_001.fastq.gz He2020_L008_R2_001.fastq.gz

### Raw data and metadata extraction (old samples)

In [None]:
# Run accessions for the second He et al. batch. The leading/trailing newlines are
# kept because a later cell slices SRA_list.split('\n')[1:-1].
SRA_list = """
SRR11396159
SRR11396160
SRR11396163
SRR11396165
"""

# The prefetch cell reads `accession_inj.txt`, so the list is written under that
# name, one accession per line and without the blank first line.
with open(he_dir + '/accession_inj.txt', 'w') as f:
    f.write(SRA_list.strip() + '\n')

# One-row sample sheet for the He2020_inj sample.
df = pd.DataFrame({'name': ['He2020_inj'], 'technology': ['10xv2'], 'targetnumcells': [5000]})

df.to_csv(he_dir + '/metadata_inj.tab', sep='\t', index=None)

In [None]:
# IPython expands {name} inside `!` commands, so GNU parallel's `{}` placeholder
# has to be written `{{}}` to reach the shell as a literal `{}`. With a bare `{}`
# the expansion of the whole line (including {he_dir}) is abandoned.
!cd {he_dir} && cat accession_inj.txt | parallel -j 8 "prefetch {{}}"

In [None]:
# Turn the converter into a ray task so several accessions are processed at once.
adapt_fastq_remote = ray.remote(adapt_fastq)

ray.init(ignore_reinit_error=True, num_cpus=2)

# Drop the empty first/last entries produced by the newlines that wrap SRA_list.
accessions = SRA_list.split('\n')[1:-1]

# Lane numbers start at 10 here, giving the L0010... file names used by the loompy call.
ret = [
    adapt_fastq_remote.remote(f'{he_dir}', f'{accession}', lane)
    for lane, accession in enumerate(accessions, start=10)
]
# Block until every task has finished.
ray.get(ret)

ray.shutdown()

In [None]:
# The sample sheet for He2020_inj is written as `metadata_inj.tab` by the cell that
# defines this batch, so that is the file passed here (not `metadata_old.tab`).
!cd {he_dir} && loompy fromfq He2020_inj.loom He2020_inj {human_gencode_dir} metadata_inj.tab \
He2020_L0010_R1_001.fastq.gz He2020_L0010_R2_001.fastq.gz He2020_L0011_R1_001.fastq.gz He2020_L0011_R2_001.fastq.gz \
He2020_L0013_R1_001.fastq.gz He2020_L0013_R2_001.fastq.gz He2020_L0012_R1_001.fastq.gz He2020_L0012_R2_001.fastq.gz

## Kim et al. 2020

In [None]:
# Folder for the Kim et al. 2020 runs; the SRR9307698 run is downloaded into injury/.
kim_dir = data_dir + '/Kim_2020'
os.makedirs(kim_dir, exist_ok=True)
os.makedirs(kim_dir + '/injury', exist_ok=True)

In [None]:
# --split-files writes one FASTQ per read; the renames below use the _2 and _3 files.
!cd {kim_dir} && fastq-dump SRR9307706 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307707 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307708 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307709 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307710 --gzip --split-files

In [None]:
!cd {kim_dir} && fastq-dump SRR9307711 --gzip --split-files

In [None]:
!cd {kim_dir}/injury && fastq-dump SRR9307698 --gzip --split-files

In [None]:
# Sample sheet shared by all seven `loompy fromfq` calls: one row per sample name.
df = pd.DataFrame({'name': ['Kim_2020_HC1', 'Kim_2020_HC2', 'Kim_2020_HC3', 
                            'Kim_2020_HC4', 'Kim_2020_HC5', 'Kim_2020_HC6', 
                            'Kim_2020_inj'], 'technology': ['10xv2'] * 7, 
                   'targetnumcells': [1000] * 7})
df.to_csv(kim_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# For every run, the _2 file becomes R1 and the _3 file becomes R2 of the named sample.
# The injury run is moved out of injury/ into kim_dir at the same time.
!mv {kim_dir}/injury/SRR9307698_2.fastq.gz {kim_dir}/Kim_2020_inj_L001_R1_001.fastq.gz 
!mv {kim_dir}/injury/SRR9307698_3.fastq.gz {kim_dir}/Kim_2020_inj_L001_R2_001.fastq.gz 

!mv {kim_dir}/SRR9307706_2.fastq.gz {kim_dir}/Kim_2020_HC1_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307706_3.fastq.gz {kim_dir}/Kim_2020_HC1_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307707_2.fastq.gz {kim_dir}/Kim_2020_HC2_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307707_3.fastq.gz {kim_dir}/Kim_2020_HC2_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307708_2.fastq.gz {kim_dir}/Kim_2020_HC3_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307708_3.fastq.gz {kim_dir}/Kim_2020_HC3_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307709_2.fastq.gz {kim_dir}/Kim_2020_HC4_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307709_3.fastq.gz {kim_dir}/Kim_2020_HC4_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307710_2.fastq.gz {kim_dir}/Kim_2020_HC5_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307710_3.fastq.gz {kim_dir}/Kim_2020_HC5_L001_R2_001.fastq.gz 
!mv {kim_dir}/SRR9307711_2.fastq.gz {kim_dir}/Kim_2020_HC6_L001_R1_001.fastq.gz 
!mv {kim_dir}/SRR9307711_3.fastq.gz {kim_dir}/Kim_2020_HC6_L001_R2_001.fastq.gz 

In [None]:
# Delete the unused _1 files. The glob only covers kim_dir itself.
# NOTE(review): injury/SRR9307698_1.fastq.gz is not matched by this pattern -- confirm whether it should be removed too.
!rm -rf {kim_dir}/*_1.fastq.gz

In [None]:
# One `loompy fromfq` run per Kim et al. sample. The index directory comes from
# {human_gencode_dir}, like every other loompy call in this notebook, instead of
# repeating the absolute path in each cell.
!cd {kim_dir} && loompy fromfq Kim_2020_HC1.loom Kim_2020_HC1 {human_gencode_dir} metadata.tab \
Kim_2020_HC1_L001_R1_001.fastq.gz Kim_2020_HC1_L001_R2_001.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC2.loom Kim_2020_HC2 {human_gencode_dir} metadata.tab \
Kim_2020_HC2_L001_R1_001.fastq.gz Kim_2020_HC2_L001_R2_001.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC3.loom Kim_2020_HC3 {human_gencode_dir} metadata.tab \
Kim_2020_HC3_L001_R1_001.fastq.gz Kim_2020_HC3_L001_R2_001.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC4.loom Kim_2020_HC4 {human_gencode_dir} metadata.tab \
Kim_2020_HC4_L001_R1_001.fastq.gz Kim_2020_HC4_L001_R2_001.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC5.loom Kim_2020_HC5 {human_gencode_dir} metadata.tab \
Kim_2020_HC5_L001_R1_001.fastq.gz Kim_2020_HC5_L001_R2_001.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_HC6.loom Kim_2020_HC6 {human_gencode_dir} metadata.tab \
Kim_2020_HC6_L001_R1_001.fastq.gz Kim_2020_HC6_L001_R2_001.fastq.gz

In [None]:
!cd {kim_dir} && loompy fromfq Kim_2020_inj.loom Kim_2020_inj {human_gencode_dir} metadata.tab \
Kim_2020_inj_L001_R1_001.fastq.gz Kim_2020_inj_L001_R2_001.fastq.gz

In [None]:
# Read the six HC loom files first, then de-duplicate gene names on each object.
kim_hc_adatas = [
    sc.read_loom(kim_dir + '/Kim_2020_HC1.loom'),
    sc.read_loom(kim_dir + '/Kim_2020_HC2.loom'),
    sc.read_loom(kim_dir + '/Kim_2020_HC3.loom'),
    sc.read_loom(kim_dir + '/Kim_2020_HC4.loom'),
    sc.read_loom(kim_dir + '/Kim_2020_HC5.loom'),
    sc.read_loom(kim_dir + '/Kim_2020_HC6.loom'),
]
for _kim_hc_adata in kim_hc_adatas:
    _kim_hc_adata.var_names_make_unique()

# Keep the per-sample names available for later cells.
(adata_kim_HC1, adata_kim_HC2, adata_kim_HC3,
 adata_kim_HC4, adata_kim_HC5, adata_kim_HC6) = kim_hc_adatas

In [None]:
# Merge the six HC samples into one object (the inj sample is not included).
adata_kim = adata_kim_HC1.concatenate(
    adata_kim_HC2,
    adata_kim_HC3,
    adata_kim_HC4,
    adata_kim_HC5,
    adata_kim_HC6,
)

In [None]:
adata_kim.write_h5ad(kim_dir + '/Kim_2020.h5ad')

## Gaydosik et al. 2020

In [None]:
# Folder that receives the Gaydosik et al. 2020 count tables.
gaydosik_dir = data_dir + '/gaydosik_2020'
os.makedirs(gaydosik_dir, exist_ok=True)

In [None]:
# GEO samples GSM3679033-GSM3679037 (loaded below as the CTCL objects).
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679033/suppl/GSM3679033%5FLabeled%5FSC67%5F050517%5FSK%5FMF2%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679034/suppl/GSM3679034%5FLabeled%5FSC82%5F060617%5FSK%5FMF5%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679035/suppl/GSM3679035%5FSC157dataframe%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679036/suppl/GSM3679036%5FSC158dataframe%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679037/suppl/GSM3679037%5FSC205dataframe%2Ecsv%2Egz

# GEO samples GSM3679038-GSM3679041 (loaded below as the HC objects).
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679038/suppl/GSM3679038%5FLabeled%5FSC50%5F011917%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679039/suppl/GSM3679039%5FLabeled%5FSC68%5F051517%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679040/suppl/GSM3679040%5FLabeled%5FSC124%5F080317%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz
!cd {gaydosik_dir} && wget ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM3679nnn/GSM3679041/suppl/GSM3679041%5FLabeled%5FSC125%5F080317%5FSK%5FNOR%5FGRCh38raw%2Ecsv%2Egz

In [None]:
# Decompress every downloaded .gz in place (no -f: existing outputs are not overwritten).
!cd {gaydosik_dir} &&  gunzip *.gz

In [None]:
# CTCL tables: each CSV is read and then transposed before concatenation.
adata_CTCL2 = sc.read(gaydosik_dir + '/GSM3679033_Labeled_SC67_050517_SK_MF2_GRCh38raw.csv').T
adata_CTCL5 = sc.read(gaydosik_dir + '/GSM3679034_Labeled_SC82_060617_SK_MF5_GRCh38raw.csv').T
adata_CTCL6 = sc.read(gaydosik_dir + '/GSM3679035_SC157dataframe.csv').T
adata_CTCL8 = sc.read(gaydosik_dir + '/GSM3679036_SC158dataframe.csv').T
adata_CTCL12 = sc.read(gaydosik_dir + '/GSM3679037_SC205dataframe.csv').T

In [None]:
# HC tables, read and transposed the same way.
adata_HC1 = sc.read(gaydosik_dir + '/GSM3679038_Labeled_SC50_011917_SK_NOR_GRCh38raw.csv').T
adata_HC2 = sc.read(gaydosik_dir + '/GSM3679039_Labeled_SC68_051517_SK_NOR_GRCh38raw.csv').T
adata_HC3 = sc.read(gaydosik_dir + '/GSM3679040_Labeled_SC124_080317_SK_NOR_GRCh38raw.csv').T
adata_HC4 = sc.read(gaydosik_dir + '/GSM3679041_Labeled_SC125_080317_SK_NOR_GRCh38raw.csv').T

In [None]:
# Merge each group; the originating sample is recorded in the 'sample' column.
ctcl_labels = ['CTCL2', 'CTCL5', 'CTCL6', 'CTCL8', 'CTCL12']
adata_CTCL = adata_CTCL2.concatenate(
    adata_CTCL5, adata_CTCL6, adata_CTCL8, adata_CTCL12,
    batch_key='sample',
    batch_categories=ctcl_labels,
)

hc_labels = ['HC1', 'HC2', 'HC3', 'HC4']
adata_HC = adata_HC1.concatenate(
    adata_HC2, adata_HC3, adata_HC4,
    batch_key='sample',
    batch_categories=hc_labels,
)

In [None]:
adata_CTCL.write_h5ad(gaydosik_dir + '/gaydosik_2020_CTCL.h5ad')
adata_HC.write_h5ad(gaydosik_dir + '/gaydosik_2020_HC.h5ad')

## Reynolds et al. 2020

In [None]:
# Folder that receives the Reynolds et al. 2020 files.
reynolds_dir = data_dir + '/reynolds_2020'
os.makedirs(reynolds_dir, exist_ok=True)

### Direct h5ad download

In [None]:
# -x 16 allows up to 16 connections; the file is saved as reynolds_2020.h5ad in reynolds_dir.
!aria2c -x 16 https://zenodo.org/record/4536165/files/submission_210120.h5ad?download=1 -d {reynolds_dir} -o reynolds_2020.h5ad

### FASTQ processing

In [None]:
# ArrayExpress E-MTAB-8142 SDRF table, saved as acctable.txt.
!aria2c -x 16 https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-8142/E-MTAB-8142.sdrf.txt -d {reynolds_dir} -o acctable.txt

In [None]:
# Tab-separated table; the columns used later include 'Source Name',
# 'Characteristics[...]' and the two 'Comment[FASTQ_URI]' columns.
reynolds_metadata = pd.read_csv(reynolds_dir + '/acctable.txt', sep='\t')

In [None]:
# Display only: rows whose FACS sorting value is 'fibroblasts' (result is not stored).
reynolds_metadata[reynolds_metadata['Characteristics[FACS sorting]'] == 'fibroblasts']

In [None]:
# Display only: the full table.
reynolds_metadata

In [None]:
# Sample names are "<row position>_<source>_<individual>_<site>_<FACS gate>",
# with '/' and ' ' in the FACS gate replaced by '-'.
reynolds_sample_names = [
    f"{i}_{source}_{indv}_{site}_{facs.replace('/', '-').replace(' ', '-')}"
    for i, (source, indv, site, facs) in enumerate(zip(
        reynolds_metadata['Source Name'].values,
        reynolds_metadata['Characteristics[individual]'].values,
        reynolds_metadata['Characteristics[sampling site]'].values,
        reynolds_metadata['Characteristics[FACS sorting]'].values,
    ))
]
n_reynolds_samples = len(reynolds_metadata)

# Sample sheet read by the `loompy fromfq` calls: one row per metadata row.
df = pd.DataFrame({
    'name': reynolds_sample_names,
    'technology': ['10xv2'] * n_reynolds_samples,
    'targetnumcells': [1000] * n_reynolds_samples,
})

df.to_csv(reynolds_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
print(len(reynolds_metadata))

# `reynolds_metadata_sub` is not defined by any earlier cell, so a fresh kernel
# stopped here with a NameError. Default to every row; to process a subset,
# filter here WITHOUT resetting the index, because the index value is part of
# each sample name written to metadata.tab.
reynolds_metadata_sub = reynolds_metadata

# Give up on a sample after this many download + quantification attempts instead
# of retrying forever when a URL or loompy keeps failing.
max_attempts = 3

for idx, name, indv, site, facs, f1, f2 in zip(reynolds_metadata_sub.index, 
                                               reynolds_metadata_sub['Source Name'].values, 
                                    reynolds_metadata_sub['Characteristics[individual]'].values,
                                    reynolds_metadata_sub['Characteristics[sampling site]'].values,
                                    reynolds_metadata_sub['Characteristics[FACS sorting]'].values,
                                    reynolds_metadata_sub['Comment[FASTQ_URI]'].values, 
                                    reynolds_metadata_sub['Comment[FASTQ_URI].1'].values):
    
    facs = facs.replace('/', '-').replace(' ', '-')
    str_file = f'{idx}_{name}_{indv}_{site}_{facs}'

    loom_path = f'{reynolds_dir}/reynolds_2020_{str_file}.loom'
    fastq_paths = (f'{reynolds_dir}/{str_file}_S1_L001_R1_001.fastq.gz',
                   f'{reynolds_dir}/{str_file}_S1_L001_R2_001.fastq.gz')

    if os.path.exists(loom_path):
        print(f'reynolds_2020_{str_file}.loom EXISTS!')

    for _attempt in range(max_attempts):
        if os.path.exists(loom_path):
            break
        print(idx, name, indv, site, facs, f1, f2)

        os.system(f'cd {reynolds_dir} && aria2c -x 16 --file-allocation=none {f1} -d {reynolds_dir} -o {str_file}_S1_L001_R1_001.fastq.gz')
        os.system(f'cd {reynolds_dir} && aria2c -x 16 --file-allocation=none {f2} -d {reynolds_dir} -o {str_file}_S1_L001_R2_001.fastq.gz')

        os.system(f'cd {reynolds_dir} && loompy fromfq reynolds_2020_{str_file}.loom {str_file} {human_gencode_dir} metadata.tab {str_file}_S1_L001_R1_001.fastq.gz {str_file}_S1_L001_R2_001.fastq.gz ')

    if not os.path.exists(loom_path):
        # Keep the FASTQ files so the failure can be inspected or resumed.
        print(f'reynolds_2020_{str_file}.loom could not be built after {max_attempts} attempts')
        continue

    # The loom exists: the FASTQ files are no longer needed.
    for fastq_path in fastq_paths:
        if os.path.exists(fastq_path):
            os.remove(fastq_path)
    

## Popescu et al. 2019

### Direct h5ad download

In [None]:
# `popescu_dir` is used by every cell of this section but was never defined.
# Create it next to the other dataset folders, mirroring the Reynolds section.
popescu_dir = data_dir + '/popescu_2019'
os.makedirs(popescu_dir, exist_ok=True)

# -x 16 allows up to 16 connections; the file is saved as popescu_2019.h5ad in popescu_dir.
!aria2c -x 16 https://zenodo.org/record/4536165/files/fetal_submission.h5ad?download=1 -d {popescu_dir} -o popescu_2019.h5ad

### FASTQ processing

In [None]:
# ArrayExpress E-MTAB-7407 SDRF table, saved as acctable.txt.
!aria2c -x 16 https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-7407/E-MTAB-7407.sdrf.txt -d {popescu_dir} -o acctable.txt

In [None]:
popescu_metadata = pd.read_csv(popescu_dir + '/acctable.txt', sep='\t')
# Keep the skin rows only and renumber them from 0; the new position is part of each sample name.
is_skin_row = popescu_metadata['Characteristics[organism part]'] == 'skin'
popescu_metadata_skin = popescu_metadata[is_skin_row].reset_index(drop=True)

In [None]:
# Sample names are "<row position>_<source>_<individual>_<FACS gate>",
# with '/' and ' ' in the FACS gate replaced by '-'.
popescu_sample_names = [
    f"{i}_{source}_{indv}_{facs.replace('/', '-').replace(' ', '-')}"
    for i, (source, indv, facs) in enumerate(zip(
        popescu_metadata_skin['Source Name'].values,
        popescu_metadata_skin['Characteristics[individual]'].values,
        popescu_metadata_skin['Characteristics[facs sorting]'].values,
    ))
]
n_popescu_samples = len(popescu_metadata_skin)

df = pd.DataFrame({
    'name': popescu_sample_names,
    'technology': ['10xv2'] * n_popescu_samples,
    'targetnumcells': [1000] * n_popescu_samples,
})

df.to_csv(popescu_dir + '/metadata.tab', sep='\t', index=None)

In [None]:
# Print the sample sheet that was just written.
!cd  {popescu_dir} && cat metadata.tab

In [None]:
print(len(popescu_metadata_skin))

# One tuple per skin sample: position, identifiers and the two FASTQ URLs.
popescu_rows = zip(range(len(popescu_metadata_skin)),
                   popescu_metadata_skin['Source Name'].values,
                   popescu_metadata_skin['Characteristics[individual]'].values,
                   popescu_metadata_skin['Characteristics[facs sorting]'].values,
                   popescu_metadata_skin['Comment[FASTQ_URI]'].values,
                   popescu_metadata_skin['Comment[FASTQ_URI].1'].values)

for idx, name, indv, facs, f1, f2 in tqdm(popescu_rows):

    facs = facs.replace('/', '-').replace(' ', '-')
    str_file = f'{idx}_{name}_{indv}_{facs}'

    # Samples that already have a loom file are skipped.
    if os.path.exists(f'{popescu_dir}/Popescu_2019_{str_file}.loom'):
        print(f'Popescu_2019_{str_file}.loom EXISTS!')
        continue

    # Download both mates, quantify them, then delete the FASTQ files.
    os.system(f'cd {popescu_dir} && aria2c -x 16 --file-allocation=none {f1} -d {popescu_dir} -o {str_file}_S1_L001_R1_001.fastq.gz')
    os.system(f'cd {popescu_dir} && aria2c -x 16 --file-allocation=none {f2} -d {popescu_dir} -o {str_file}_S1_L001_R2_001.fastq.gz')

    os.system(f'cd {popescu_dir} && loompy fromfq Popescu_2019_{str_file}.loom {str_file} {human_gencode_dir} metadata.tab {str_file}_S1_L001_R1_001.fastq.gz {str_file}_S1_L001_R2_001.fastq.gz ')

    os.system(f'rm {popescu_dir}/{str_file}_S1_L001_R1_001.fastq.gz')
    os.system(f'rm {popescu_dir}/{str_file}_S1_L001_R2_001.fastq.gz')
    

## McCarthy et al. 2020

In [None]:
# Folder for the McCarthy et al. 2020 files.
mccarthy_dir = data_dir + '/McCarthy_2020'
os.makedirs(mccarthy_dir, exist_ok=True)
os.makedirs(mccarthy_dir+'/outfiles', exist_ok=True)
# The per-cell count tables are written to and read from `outfiles_gencode`
# further down, so that folder has to exist as well.
os.makedirs(mccarthy_dir+'/outfiles_gencode', exist_ok=True)

In [None]:
# SDRF table for E-MTAB-7167, expected to be present in mccarthy_dir already
# (no cell in this notebook downloads it). One row is kept per ENA run, and the
# run accession becomes the index.
# NOTE(review): if the table has one row per FASTQ file, dropping duplicate runs
# keeps only one URL per run -- confirm both mates end up in 'Comment[FASTQ_URI]'.
df_meta = (
    pd.read_csv(mccarthy_dir + '/E-MTAB-7167.sdrf.txt', sep='\t')
    .drop_duplicates('Comment[ENA_RUN]')
    .set_index('Comment[ENA_RUN]')
)

In [None]:
# FASTQ URLs of the retained rows.
list_ftp = df_meta['Comment[FASTQ_URI]'].values

In [None]:
# List the folder once and test membership against a set, instead of calling
# os.listdir() again for every URL.
already_downloaded = set(os.listdir(mccarthy_dir))
# URLs whose file name is not present in mccarthy_dir yet.
list_ftp_names = [i for i in list_ftp if i.split('/')[-1] not in already_downloaded]

In [None]:
for i in tqdm(list_ftp_names):
    os.system(f'wget -P {mccarthy_dir} {i}')

In [None]:
# File names of all URLs, first occurrence kept, original order preserved.
all_ftp_names = []
for ftp_url in list_ftp:
    ftp_file_name = ftp_url.split('/')[-1]
    if ftp_file_name not in all_ftp_names:
        all_ftp_names.append(ftp_file_name)

In [None]:
# Run prefixes (text before the first '_') for which both the _1 and the _2
# file are listed; order of first appearance is preserved.
all_ftp_roots = []
for ftp_file_name in all_ftp_names:
    ftp_root = ftp_file_name.split('_')[0]
    has_both_mates = (ftp_root + '_1.fastq.gz' in all_ftp_names
                      and ftp_root + '_2.fastq.gz' in all_ftp_names)
    if has_both_mates and ftp_root not in all_ftp_roots:
        all_ftp_roots.append(ftp_root)

In [None]:
# Tag every paired run with a batch number, 20 runs per batch, so that nextflow
# can be launched on one batch at a time. Files are renamed to
# <root>_<batch>_<read>.fastq.gz, which is the layout the nextflow cell globs for
# (ERR*_<batch>_{1,2}.fastq.gz); the read number must stay last.
for position, root in enumerate(all_ftp_roots):
    batch = position // 20
    for read in (1, 2):
        src = f'{mccarthy_dir}/{root}_{read}.fastq.gz'
        dst = f'{mccarthy_dir}/{root}_{batch}_{read}.fastq.gz'
        # Skip files that were already renamed (or never downloaded) so the
        # cell can be re-run safely.
        if os.path.exists(src):
            os.replace(src, dst)

In [None]:
len(all_ftp_roots)

In [None]:
# Number of 20-run batches that exist (ceiling division), instead of a fixed 200.
n_batches = -(-len(all_ftp_roots) // 20)

for i_div in tqdm(range(n_batches)):

    # Run the nf-core/smartseq2 pipeline on the FASTQ pairs of this batch only.
    # Arguments are passed as a list so paths containing spaces stay intact.
    # NOTE(review): --star_index points at 'STAR_index_gencode' while --gtf points
    # at 'STAR_idx_gencode' -- confirm that both folders really exist.
    subprocess.run([
        'nextflow', 'run', 'nf-core/smartseq2', '-profile', 'docker',
        '--reads', f'{mccarthy_dir}/ERR*_{i_div}_{{1,2}}.fastq.gz',
        '--genome', 'GRCh38', '-r', 'dev', '--skip_tracer', '--skip_bracer',
        '-c', f'{mccarthy_dir}/conf.config',
        '--outdir', f'{mccarthy_dir}/results',
        '--skip_rsem',
        '--star_index', f'{data_dir}/STAR_index_gencode',
        '--gtf', f'{data_dir}/STAR_idx_gencode/genes.gtf',
    ])

    # Collect every *.count.txt under the results folder. The search is anchored
    # at mccarthy_dir, so it does not depend on the current working directory.
    list_files = [
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(f'{mccarthy_dir}/results')
        for file_name in file_names
        if file_name.endswith('.count.txt')
    ]

    # Keep only the first and last column of each table (after skipping the
    # first line) and store it under the same file name.
    for file in list_files:
        df = pd.read_csv(file, sep='\t', skiprows=1).iloc[:, [0, -1]]
        df.to_csv(mccarthy_dir+'/outfiles_gencode/'+os.path.basename(file), index=None, header=None, sep='\t')

    # Remove nextflow's work folder and this batch's results before the next batch.
    subprocess.run(['rm', '-rf', f'{data_dir}/work'])
    subprocess.run(['rm', '-rf', f'{mccarthy_dir}/results'])

In [None]:
# Gather one column per count file and build the table in a single step, rather
# than inserting columns one at a time (which fragments the DataFrame).
count_columns = {}
gene_index = None
for file in tqdm(os.listdir(mccarthy_dir+'/outfiles_gencode/')):
    df = pd.read_csv(mccarthy_dir+'/outfiles_gencode/'+file, header=None, sep='\t', index_col=0)
    if gene_index is None:
        # Rows of the final table follow the first file read.
        gene_index = df.index

    count_columns[file] = df[1]

# Every column is aligned to the first file's index; df_full stays None when
# the folder holds no files.
df_full = pd.DataFrame(count_columns, index=gene_index) if count_columns else None

In [None]:
# df_full has one column per count file; transposing turns files into observations.
adata_mccarthy = sc.AnnData(df_full).T

# Observation names are the file names without the '.count.txt' suffix.
mccarthy_obs_names = []
for obs_name in adata_mccarthy.obs_names:
    mccarthy_obs_names.append(obs_name.replace('.count.txt', ''))
adata_mccarthy.obs_names = mccarthy_obs_names

In [None]:
# Drop genes that are not detected in any cell.
sc.pp.filter_genes(adata_mccarthy, min_cells=1)

In [None]:
adata_mccarthy.write_loom(mccarthy_dir + '/mccarthy_2020.loom')

## Gao et al. 2021

In [None]:
# Folder that receives the Gao et al. 2021 loom file.
gao_dir = data_dir + '/gao_2021'
os.makedirs(gao_dir, exist_ok=True)

### Direct loom download

In [None]:
# GEO series GSE162183 supplementary loom (URL-encoded name:
# GSE162183_Raw_gene_counts_matrix_LoomFile.loom.gz), saved as gao_2021.loom.gz.
!aria2c -x 16 https://ftp.ncbi.nlm.nih.gov/geo/series/GSE162nnn/GSE162183/suppl/GSE162183%5FRaw%5Fgene%5Fcounts%5Fmatrix%5FLoomFile%2Eloom%2Egz -d {gao_dir} -o gao_2021.loom.gz

In [None]:
# Decompress in place, leaving gao_2021.loom.
!gunzip {gao_dir}/gao_2021.loom.gz