# CRAM to FASTQ

## Prepare environment

In [2]:
!pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable
Collecting python-dotenv
  Downloading python_dotenv-0.21.1-py3-none-any.whl.metadata (21 kB)
Downloading python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
[0mSuccessfully installed python-dotenv-0.21.1


In [56]:
# Register the IPython extension shipped with python-dotenv; it provides the %dotenv magic used in the next cell
%load_ext dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [57]:
# Load the variables of the local .env file into the environment.
# ICA_API_KEY (checked further down and used by every icav2 call) is expected to come from it.
%dotenv

In [58]:
import os
import pandas as pd
import subprocess

In [59]:
# Confirm that ICA_API_KEY is present in the environment.
# Only its presence is reported; the key itself is never printed.
if 'ICA_API_KEY' in os.environ:
    print("ICA_API_KEY exists")
else:
    print("ICA_API_KEY missing")

ICA_API_KEY exists


In [60]:
# Resolve the ICA project ID from the project name
project_name = 'sg10k-dragen-reanalysis'
# List the projects, keep the line(s) containing the project name and
# take the first tab-separated field of that line (the ID).
project_id = subprocess.check_output(
    f"icav2 -k $ICA_API_KEY projects list | grep {project_name} | cut -f 1 ",
    shell=True,
    text=True,
).strip()
print(f"project: {project_name} [{project_id}]")

project: sg10k-dragen-reanalysis [bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4]


In [61]:
# Resolve the pipeline ID from the pipeline name
pipeline_name = 'cram_to_fastq_v2'
# List the pipelines of the project, keep the line(s) containing the pipeline
# name and take the first tab-separated field of that line (the ID).
pipeline_id = subprocess.check_output(
    f"icav2 -k $ICA_API_KEY --project-id {project_id} projectpipelines list | grep {pipeline_name} | cut -f 1 ",
    shell=True,
    text=True,
).strip()
print(f"pipeline: {pipeline_name} [{pipeline_id}]")

pipeline: cram_to_fastq_v2 [9b0b0046-aea7-4927-ae6d-4b995de871a5]


In [62]:
# Resolve the ID of the folder that receives the analysis outputs
output_name = 'cram_to_fastq_output'
# Query the project data by name, keep the line(s) containing the folder name
# and take the fourth tab-separated field of that line (the ID).
output_id = subprocess.check_output(
    f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {output_name} | grep {output_name} | cut -f 4 ",
    shell=True,
    text=True,
).strip()
print(f"output folder: {output_name} [{output_id}]")

output folder: cram_to_fastq_output [fol.ad1f7f2373fa478514e808dcfde680bc]


In [63]:
# Resolve the ID of the reference FASTA file
ref_name = 'hg38.fa'
# Query the project data by file name, keep the line(s) containing the name
# and take the fourth tab-separated field of that line (the ID).
ref_id = subprocess.check_output(
    f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {ref_name} | grep {ref_name} | cut -f 4 ",
    shell=True,
    text=True,
).strip()
print(f"ref: {ref_name} [{ref_id}]")

ref: hg38.fa [fil.b353222e04a44a69ad5308dc227c4360]


## Launch one sample

In [64]:
# Sample to process in the single-sample run
sample_name = 'WHH430'

# Resolve the ID of the sample's CRAM file (<sample_name>.bqsr.cram):
# fourth tab-separated field of the matching project-data line.
cram_id = subprocess.check_output(
    f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}.bqsr.cram | grep {sample_name} | cut -f 4",
    shell=True,
    text=True,
).strip()

print(f"sample: {sample_name} - cram_id: {cram_id}")

sample: WHH430 - cram_id: fil.1bd99f3c419640a1582808dce7f6b5a3


In [66]:
# Start the Nextflow pipeline for the single sample selected above.
# The command is assembled from adjacent f-string pieces; $ICA_API_KEY is left
# for the shell to expand, so the key does not appear in the printed command.
cmd = (
    f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id} "
    f"--project-id {project_id} "
    f"--storage-size small "
    f"--user-reference {sample_name} "
    f"--user-tag {sample_name} "
    f"--user-tag {pipeline_name} "
    f"--output-parent-folder {output_id} "
    f"--input ref_fasta:{ref_id} "
    f"--input input_cram:{cram_id} "
)
print(f"cmd: {cmd}")
# The CLI response is kept in cmd_out for inspection; it is not printed by default.
cmd_out = subprocess.check_output(cmd, shell=True, text=True).strip()
print(f"Analysis {pipeline_name} launched on sample {sample_name}")

cmd: icav2 -k $ICA_API_KEY projectpipelines start nextflow 9b0b0046-aea7-4927-ae6d-4b995de871a5 --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4 --storage-size small --user-reference WHH430 --user-tag WHH430 --user-tag cram_to_fastq_v2 --output-parent-folder fol.ad1f7f2373fa478514e808dcfde680bc --input ref_fasta:fil.b353222e04a44a69ad5308dc227c4360 --input input_cram:fil.1bd99f3c419640a1582808dce7f6b5a3 
Analysis cram_to_fastq_v2 launched on sample WHH430


## Launch batch of samples

In [37]:
# Dump the list of ICA sample objects of the project into a local text file
os.system(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples list > projectsamples-list.txt")

# Parse the tab-separated listing:
#  - the file's own header row is replaced by the column names below,
#  - the last line of the file is dropped (skipfooter),
#  - every value is read as a string with surrounding whitespace stripped.
cols = ['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS']
df_sample = pd.read_csv(
    'projectsamples-list.txt',
    sep='\t',
    names=cols,
    header=0,
    index_col=False,
    skipfooter=1,
    engine='python',
    converters=dict.fromkeys(cols, str.strip),
)
print(f'n={len(df_sample):,}')
print(df_sample.columns)

n=1,543
Index(['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS'], dtype='object')


In [None]:
# df_sample.iloc[0]: done using Flow UI
# df_sample.iloc[1]: done using Bench 1 sample
# df_sample.iloc[2:500]: batch 1 ~14h ~35/h
# df_sample.iloc[500:800]: batch 2 ~8h30 ~35/h
# df_sample.iloc[800:1100]: batch 3 ~8h30 ~35/h
# df_sample.iloc[1100:]: batch 4

In [None]:
# Launch the pipeline once per sample of the current batch (rows 1100 onward).
# One line per sample is appended to cram_to_fastq.log and echoed to the cell output;
# the CLI output of every launch is appended to ica.log.
# The log is written from Python rather than through a shell `echo`, so the command
# text is stored literally: $ICA_API_KEY is NOT expanded into the log file and the
# ">> ica.log" part of the command is not re-interpreted as a redirection.
with open('cram_to_fastq.log', 'a') as log_file:
    for i, row in df_sample.iloc[1100:].iterrows():
        # Get sample name
        sample_name = row['NAME']
        # Get CRAM file ID (fourth tab-separated field of the matching project-data line)
        cram_id = subprocess.check_output(
            f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}.bqsr.cram | grep {sample_name} | cut -f 4",
            shell=True,
            text=True,
        ).strip()

        if len(cram_id.split()) != 1:
            # No CRAM found, or several matches: launching would run a malformed command
            log = f"[{i}] {sample_name} :skipped: expected exactly one CRAM ID, got {cram_id!r}"
        else:
            # Build the launch command; the reference FASTA ID is the notebook's `ref_id`
            cmd = (
                f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id} "
                f"--project-id {project_id} "
                f"--storage-size small "
                f"--user-reference {sample_name} "
                f"--user-tag {sample_name} "
                f"--output-parent-folder {output_id} "
                f"--input ref_fasta:{ref_id} "
                f"--input input_cram:{cram_id} "
                f">> ica.log"
            )
            # Launch analysis; os.system returns the command's status (0 on success)
            exit_status = os.system(cmd)
            log = f"[{i}] {sample_name} :{exit_status}: {cmd}"

        # Flush after every sample so the log stays usable if the long-running loop is interrupted
        log_file.write(log + '\n')
        log_file.flush()
        print(log)

In [None]:
# Check if fastq exists
# For each sample, query the data attached to the ICA sample object for files whose
# name fuzzily matches 'R1.fastq' and append one line per sample to R1.fastq-list.log.
# The log is written from Python rather than through a shell `echo`, so a multi-line
# result or shell metacharacters in it can never be executed or glob-expanded.
with open('R1.fastq-list.log', 'a') as log_file:
    for i, row in df_sample.iterrows():
        # First tab-separated field of the line(s) mentioning the sample name
        r1 = subprocess.check_output(
            f"icav2 -k $ICA_API_KEY projectsamples listdata {row['ID']} --file-name 'R1.fastq' --match-mode FUZZY | grep {row['NAME']} | cut -f 1",
            shell=True,
            text=True,
        ).strip()
        # Collapse several matches onto one line so the log keeps one line per sample
        r1_flat = ' '.join(r1.split())
        log = f"[{i}] {row['NAME']}: {r1_flat}"
        log_file.write(log + '\n')
        log_file.flush()
print('done')