# Environment preparation and FASTQ file download

In this notebook we prepare the environment needed to preprocess and analyse the FASTQ files. To do that, we perform the following steps:
* Environment preparation
  * Download and install CellRanger / kallisto and genome files
* Dataset download and pre-processing
  * Download fastq files
  * Run CellRanger pipeline
  * Run loompy/kallisto pipeline


In [None]:
import os
import ray
import pandas as pd

In [None]:
# Number of CPUs passed to `ray.init(num_cpus=...)` for the parallel FASTQ download.
n_cpus = 8

## Environment preparation

In [None]:
# Folder (inside the current working directory) that holds the mapping tools and
# reference files; it is created on first use.
dir_mapper = f"{os.getcwd()}/mapper"
if not os.path.isdir(dir_mapper):
    os.makedirs(dir_mapper, exist_ok=True)

### Download and install CellRanger and genome files
We will use CellRanger together with the reference data provided by 10x Genomics.

In [None]:
# Download the CellRanger 6.0.2 tarball into `dir_mapper` and unpack it there.
# NOTE(review): this is a signed, time-limited link - its `Expires=1625015730` query
# parameter looks like a Unix timestamp from mid-2021, so the download presumably fails
# once that time has passed. If it does, request a fresh link from the 10x Genomics
# download page and substitute it below - TODO confirm.
!cd {dir_mapper} && wget -O cellranger-6.0.2.tar.gz "https://cf.10xgenomics.com/releases/cell-exp/cellranger-6.0.2.tar.gz?Expires=1625015730&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZi4xMHhnZW5vbWljcy5jb20vcmVsZWFzZXMvY2VsbC1leHAvY2VsbHJhbmdlci02LjAuMi50YXIuZ3oiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE2MjUwMTU3MzB9fX1dfQ__&Signature=As8f3wksvQDwXVFXBQK7WbblsRUNPAY8dGyYLA6f4DdBQbeb0ld4lCny0AXRYzdHfgNFllVgBtpSfzTgbibPdKNZNbOtiBxv3m3REIJ1sHIw12G0NC2hjxsFRHhta0pFAKWuORxaXHeVIyBxTh1mm0vXcFH3VPvv~haLreqEzYZqbYM0v4ikSum6c5YYVVrVxKom6P4cHQin0T49LRyFYjG83qpu3gaCq86YMwdLbNfn9T35fcmhu3XVpBhFcoZ4hSu0WgNffJ9ENLJibCraZ9q4Fw017pZxd5WT7K6DZ2Wx7EpyvpZv8IRU6as-W1uaQkTnBuY7XPjDFXTBQhL95A__&Key-Pair-Id=APKAI7S6A5RYOXBWRPDA" && tar xzf cellranger-6.0.2.tar.gz

In [None]:
!cd {dir_mapper} && wget https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz && tar xzf refdata-gex-GRCh38-2020-A.tar.gz

### Download kallisto genome files

In [None]:
!cd {dir_mapper} && wget https://storage.googleapis.com/linnarsson-lab-www-blobs/human_GRCh38_gencode.v31.tar.gz && tar xzf human_GRCh38_gencode.v31.tar.gz

## Download fastq files

In [None]:
# Root folder for the downloaded FASTQ files (one sub-folder per sample is created
# by the download step).
fastq_dir = f'{os.getcwd()}/data/FASTQ'

In [None]:
# Map every FASTQ file name to its download URL in Zenodo record 5041684.
# File names follow `<sample>_<lane>_<read>_001.fastq.gz`. Every sample has four lanes
# except SI-GA-D10_Mp15, which only has the first two. Entries are generated in the
# order sample -> lane -> read (R1 before R2).
dict_links_fastq = {
    fastq_name: f'https://zenodo.org/record/5041684/files/{fastq_name}'
    for sample_id, n_lanes in [
        ('SI-GA-C10_Mp11', 4),
        ('SI-GA-D12_Ap11', 4),
        ('SI-GA-E10_Mp13', 4),
        ('SI-GA-G7_Ap13', 4),
        ('SI-GA-D10_Mp15', 2),
        ('SI-GA-D8_Ap15', 4),
    ]
    for lane in ('S1_L001', 'S2_L002', 'S3_L001', 'S4_L002')[:n_lanes]
    for read in ('R1', 'R2')
    for fastq_name in [f'{sample_id}_{lane}_{read}_001.fastq.gz']
}

In [None]:
@ray.remote
def wget_link(name, url):
    """Download one FASTQ file into its per-sample folder under `fastq_dir`.

    The sample folder is the first two underscore-separated fields of `name`
    (e.g. `SI-GA-C10_Mp11`). Files that already exist are skipped.

    The file is first written to a temporary `<name>.part` in `fastq_dir` and only
    moved into the sample folder after `wget` succeeds, so an interrupted or failed
    download never leaves a truncated file that a later run would treat as
    "already downloaded".

    Parameters
    ----------
    name : str
        Target file name, e.g. `SI-GA-C10_Mp11_S1_L001_R1_001.fastq.gz`.
    url : str
        URL to fetch the file from.

    Raises
    ------
    subprocess.CalledProcessError
        If `wget` exits with a non-zero status.
    FileNotFoundError
        If the `wget` executable is not available.
    """
    import subprocess

    prefix = '_'.join(name.split('_')[:2])
    sample_dir = f'{fastq_dir}/{prefix}'
    target = f'{sample_dir}/{name}'

    if os.path.exists(target):
        print(f'{name} already downloaded')
        return

    print(f'Downloading {name}')
    os.makedirs(sample_dir, exist_ok=True)

    # Keep the partial file outside the sample folder so that tools which scan that
    # folder only ever see complete FASTQ files.
    partial = f'{fastq_dir}/{name}.part'
    try:
        # Argument list (no shell) so the URL and paths are never re-parsed by a shell.
        subprocess.run(['wget', '-nv', '-O', partial, url], check=True)
        os.replace(partial, target)
    finally:
        # Only still present when the download failed or was interrupted.
        if os.path.exists(partial):
            os.remove(partial)

In [None]:
# `wget_link` is decorated with `@ray.remote`, so it cannot be invoked as a plain
# function (Ray raises a TypeError and asks for `.remote()` instead). The downloads are
# dispatched through Ray in the next cell; here we only sanity-check that every URL in
# the download table points to the file it is registered under.
for name_i, url_i in dict_links_fastq.items():
    assert url_i.rsplit('/', 1)[-1] == name_i, f'{name_i}: URL points to a different file ({url_i})'

In [None]:
# Start a local Ray runtime with `n_cpus` CPUs and download all FASTQ files in parallel.
# `ignore_reinit_error=True` lets the cell be re-run even if a runtime is still alive
# from an interrupted run.
ray.init(num_cpus=n_cpus, ignore_reinit_error=True)

try:
    # One remote task per file; `ray.get` blocks until all have finished and re-raises
    # the first task failure.
    ray_get = ray.get([wget_link.remote(name_i, url_i) for name_i, url_i in dict_links_fastq.items()])
finally:
    # Always release the Ray runtime, including when a download failed.
    ray.shutdown()

### Run CellRanger pipeline

In [None]:
# Output folder for the CellRanger runs; created on first use.
dir_CR = f'{os.getcwd()}/data/CR'
if not os.path.isdir(dir_CR):
    os.makedirs(dir_CR, exist_ok=True)

In [None]:
# Ap11
!cd {dir_CR} && {dir_mapper}/cellranger-6.0.2/cellranger count --id=SI-GA-D12_Ap11  --expect-cells 1500 --transcriptome={dir_mapper}/refdata-gex-GRCh38-2020-A --fastqs {fastq_dir}/SI-GA-D12_Ap11

In [None]:
# Mp11
!cd {dir_CR} && {dir_mapper}/cellranger-6.0.2/cellranger count --id=SI-GA-C10_Mp11  --expect-cells 1500 --transcriptome={dir_mapper}/refdata-gex-GRCh38-2020-A --fastqs {fastq_dir}/SI-GA-C10_Mp11

In [None]:
# Ap13
!cd {dir_CR} && {dir_mapper}/cellranger-6.0.2/cellranger count --id=SI-GA-G7_Ap13  --expect-cells 1500 --transcriptome={dir_mapper}/refdata-gex-GRCh38-2020-A --fastqs {fastq_dir}/SI-GA-G7_Ap13

In [None]:
# Mp13
!cd {dir_CR} && {dir_mapper}/cellranger-6.0.2/cellranger count --id=SI-GA-E10_Mp13  --expect-cells 1500 --transcriptome={dir_mapper}/refdata-gex-GRCh38-2020-A --fastqs {fastq_dir}/SI-GA-E10_Mp13

In [None]:
# Ap15
!cd {dir_CR} && {dir_mapper}/cellranger-6.0.2/cellranger count --id=SI-GA-D8_Ap15  --expect-cells 1500 --transcriptome={dir_mapper}/refdata-gex-GRCh38-2020-A --fastqs {fastq_dir}/SI-GA-D8_Ap15

In [None]:
# Mp15
!cd {dir_CR} && {dir_mapper}/cellranger-6.0.2/cellranger count --id=SI-GA-D10_Mp15  --expect-cells 1500 --transcriptome={dir_mapper}/refdata-gex-GRCh38-2020-A --fastqs {fastq_dir}/SI-GA-D10_Mp15

### Run loompy/kallisto pipeline

In [None]:
# Output folder for the loompy/kallisto runs; created on first use.
dir_kallisto = f'{os.getcwd()}/data/kallisto'
if not os.path.isdir(dir_kallisto):
    os.makedirs(dir_kallisto, exist_ok=True)

In [None]:
# Per-sample metadata table handed to `loompy fromfq`: one row per sample, all sharing
# the same technology label and target cell number (scalars are broadcast to every row).
df = pd.DataFrame({
    'name': [
        'SI-GA-D12_Ap11',
        'SI-GA-C10_Mp11',
        'SI-GA-G7_Ap13',
        'SI-GA-E10_Mp13',
        'SI-GA-D8_Ap15',
        'SI-GA-D10_Mp15',
    ],
    'technology': '10xv3',
    'targetnumcells': 1500,
})

# Tab-separated, without the DataFrame index column.
df.to_csv(f'{dir_kallisto}/metadata.tab', sep='\t', index=False)

In [None]:
sample = 'SI-GA-D12_Ap11'
!loompy fromfq {dir_kallisto}/{sample}.loom {sample} {dir_mapper}/human_GRCh38_gencode.v31.600 {dir_kallisto}/metadata.tab {' '.join([f'{fastq_dir}/{sample}/{i}' for i in os.listdir(f'{fastq_dir}/{sample}')])}

In [None]:
sample = 'SI-GA-C10_Mp11'
!loompy fromfq {dir_kallisto}/{sample}.loom {sample} {dir_mapper}/human_GRCh38_gencode.v31.600 {dir_kallisto}/metadata.tab {' '.join([f'{fastq_dir}/{sample}/{i}' for i in os.listdir(f'{fastq_dir}/{sample}')])}

In [None]:
sample = 'SI-GA-G7_Ap13'
!loompy fromfq {dir_kallisto}/{sample}.loom {sample} {dir_mapper}/human_GRCh38_gencode.v31.600 {dir_kallisto}/metadata.tab {' '.join([f'{fastq_dir}/{sample}/{i}' for i in os.listdir(f'{fastq_dir}/{sample}')])}

In [None]:
sample = 'SI-GA-E10_Mp13'
!loompy fromfq {dir_kallisto}/{sample}.loom {sample} {dir_mapper}/human_GRCh38_gencode.v31.600 {dir_kallisto}/metadata.tab {' '.join([f'{fastq_dir}/{sample}/{i}' for i in os.listdir(f'{fastq_dir}/{sample}')])}

In [None]:
sample = 'SI-GA-D8_Ap15'
!loompy fromfq {dir_kallisto}/{sample}.loom {sample} {dir_mapper}/human_GRCh38_gencode.v31.600 {dir_kallisto}/metadata.tab {' '.join([f'{fastq_dir}/{sample}/{i}' for i in os.listdir(f'{fastq_dir}/{sample}')])}

In [None]:
sample = 'SI-GA-D10_Mp15'
!loompy fromfq {dir_kallisto}/{sample}.loom {sample} {dir_mapper}/human_GRCh38_gencode.v31.600 {dir_kallisto}/metadata.tab {' '.join([f'{fastq_dir}/{sample}/{i}' for i in os.listdir(f'{fastq_dir}/{sample}')])}