# Data extraction of Reynolds dataset

In this notebook we download the processed data (`.h5ad`) and the raw FASTQ files of the Reynolds dataset, and convert the raw reads into `.loom` files.

In [None]:
# Uncomment the two lines below to download and install the required Python packages.
# !wget https://raw.githubusercontent.com/alexmascension/revisit_reynolds_fb/master/requirements.txt
# !pip install -r requirements.txt

In [None]:
import os
import subprocess

import pandas as pd
import scanpy as sc
from tqdm.notebook import tqdm

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Everything is downloaded below the directory the notebook is run from.
data_dir = os.path.abspath(os.curdir)

In [None]:
# Directory that is passed to `loompy fromfq` as the reference index
# (a GRCh38 / GENCODE v31 build, judging by the folder name).
# Set the HUMAN_GENCODE_DIR environment variable to point at your own copy;
# the original author's local path is kept as the fallback so existing setups
# keep working unchanged.
human_gencode_dir = os.environ.get(
    "HUMAN_GENCODE_DIR",
    "/media/seth/SETH_DATA/SETH_Alex/Programs/human_GRCh38_gencode.v31.600",
)

In [None]:
# All files of this dataset live in a `reynolds_2020` folder under `data_dir`.
reynolds_dir = f'{data_dir}/reynolds_2020'
# Create the folder on first run; do nothing if it is already there.
os.makedirs(reynolds_dir, exist_ok=True)

### Direct h5ad download

In [None]:
!aria2c -x 16 https://zenodo.org/record/4536165/files/submission_210120.h5ad?download=1 -d {reynolds_dir} -o reynolds_2020.h5ad

### FASTQ processing

In [None]:
!aria2c -x 16 https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-8142/E-MTAB-8142.sdrf.txt -d {reynolds_dir} -o acctable.txt

In [None]:
# Load the tab-separated SDRF table downloaded in the previous cell.
reynolds_metadata = pd.read_table(f'{reynolds_dir}/acctable.txt')

**We will only download the fibroblast samples from healthy donors. If you want to download other samples, redefine `reynolds_metadata_sub` so that it contains the rows of your preference.**

In [None]:
# Keep only the rows whose FACS gate is 'fibroblasts'.
# NOTE(review): the filter looks at the FACS gate only -- confirm that this
# also restricts the selection to healthy donors, as the text above states.
reynolds_metadata_sub = reynolds_metadata.loc[
    reynolds_metadata['Characteristics[FACS sorting]'].eq('fibroblasts')
]

In [None]:
# Display the selected rows to check the subset before downloading anything.
reynolds_metadata_sub

In [None]:
# Sample sheet that is later handed to `loompy fromfq` as `metadata.tab`:
# one row per selected sample. Each name has the form
# "<row label>_<Source Name>_<individual>_<sampling site>_<FACS gate>",
# with '/' and ' ' in the FACS gate replaced by '-'.
df = pd.DataFrame({
    'name': [
        f"{row_label}_{source}_{individual}_{site}_{gate.replace('/', '-').replace(' ', '-')}"
        for row_label, source, individual, site, gate in zip(
            reynolds_metadata_sub.index,
            reynolds_metadata_sub['Source Name'].values,
            reynolds_metadata_sub['Characteristics[individual]'].values,
            reynolds_metadata_sub['Characteristics[sampling site]'].values,
            reynolds_metadata_sub['Characteristics[FACS sorting]'].values,
        )
    ],
    # The same technology tag and target cell number are used for every sample.
    'technology': ['10xv2'] * len(reynolds_metadata_sub),
    'targetnumcells': [1000] * len(reynolds_metadata_sub),
})

# Written without the DataFrame index, tab-separated.
df.to_csv(f'{reynolds_dir}/metadata.tab', sep='\t', index=False)

In [None]:
# Display the sample sheet that was just written to `metadata.tab`.
df

In [None]:
# For every selected sample: download the two FASTQ files, build the loom file
# with `loompy fromfq`, and delete the FASTQ files once the loom file exists.
#
# Differences from a plain "retry until the file appears" loop:
#   * the number of attempts is bounded, so a permanent failure (bad URL,
#     missing executable, no network) cannot hang the notebook forever;
#   * commands are run without a shell and with `check=True`, so paths or URLs
#     containing spaces/special characters are safe and failures are reported;
#   * FASTQ files are removed with `os.remove`, and only a missing file is
#     silently ignored.
max_attempts = 3  # upper bound on download + conversion attempts per sample

for idx, name, indv, site, facs, f1, f2 in zip(
        reynolds_metadata_sub.index,
        reynolds_metadata_sub['Source Name'].values,
        reynolds_metadata_sub['Characteristics[individual]'].values,
        reynolds_metadata_sub['Characteristics[sampling site]'].values,
        reynolds_metadata_sub['Characteristics[FACS sorting]'].values,
        reynolds_metadata_sub['Comment[FASTQ_URI]'].values,
        reynolds_metadata_sub['Comment[FASTQ_URI].1'].values):

    # Must match the sample names written to metadata.tab.
    facs = facs.replace('/', '-').replace(' ', '-')
    str_file = f'{idx}_{name}_{indv}_{site}_{facs}'
    print(str_file)

    loom_name = f'reynolds_2020_{str_file}.loom'
    loom_path = f'{reynolds_dir}/{loom_name}'
    fastq_names = [f'{str_file}_S1_L001_R1_001.fastq.gz',
                   f'{str_file}_S1_L001_R2_001.fastq.gz']

    attempt = 0
    while not os.path.exists(loom_path) and attempt < max_attempts:
        attempt += 1
        print(idx, name, indv, site, facs, f1, f2)
        try:
            for url, fastq_name in zip((f1, f2), fastq_names):
                subprocess.run(
                    ['aria2c', '-x', '16', '--file-allocation=none', str(url),
                     '-d', reynolds_dir, '-o', fastq_name],
                    cwd=reynolds_dir, check=True)

            # Relative file names are resolved against `cwd`, i.e. reynolds_dir.
            subprocess.run(
                ['loompy', 'fromfq', loom_name, str_file, human_gencode_dir,
                 'metadata.tab', *fastq_names],
                cwd=reynolds_dir, check=True)
        except (subprocess.CalledProcessError, OSError) as err:
            # CalledProcessError: the tool exited with a non-zero status.
            # OSError: the executable could not be started at all.
            print(f'Attempt {attempt}/{max_attempts} failed for {str_file}: {err}')

    if not os.path.exists(loom_path):
        # Keep whatever was downloaded so the problem can be inspected.
        print(f'{loom_name} could not be created after {max_attempts} attempts; '
              f'FASTQ files are kept.')
        continue

    print(f'reynolds_2020_{str_file}.loom EXISTS!')
    for fastq_name in fastq_names:
        try:
            os.remove(f'{reynolds_dir}/{fastq_name}')
        except FileNotFoundError:
            # Already cleaned up (e.g. the loom file came from a previous run).
            pass