# CRAM to FASTQ

## Prepare environment

In [2]:
!pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable
Collecting python-dotenv
  Downloading python_dotenv-0.21.1-py3-none-any.whl.metadata (21 kB)
Downloading python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.1


In [3]:
%load_ext dotenv

In [5]:
%dotenv

In [8]:
import os
import pandas as pd
import subprocess

In [28]:
# Report whether ICA_API_KEY is present in the environment.
# Only its presence is printed, never the key itself.
if 'ICA_API_KEY' in os.environ:
    key_status = 'exists'
else:
    key_status = 'missing'
print(f"ICA_API_KEY {key_status}")

ICA_API_KEY exists


In [29]:
# Look up the ICA project ID: first tab-separated field of the project row
# matching 'sg10k-dragen-reanalysis'.
# The exit status of a shell pipeline is that of its last command (cut), so
# check_output does not raise when icav2 or grep fail. Validate the result
# explicitly instead of letting an empty or multi-line ID reach later cells.
project_id = subprocess.check_output("icav2 -k $ICA_API_KEY projects list | grep 'sg10k-dragen-reanalysis' | cut -f 1 ", shell=True, text=True).strip()
if not project_id or "\n" in project_id:
    raise RuntimeError(f"Expected exactly one project matching 'sg10k-dragen-reanalysis', got: {project_id!r}")
print(f"project_id: {project_id}")

project_id: bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4


In [30]:
# Get pipeline ID: first tab-separated field of the project pipeline row
# matching 'cram_to_fastq'.
# The pipeline's exit status comes from cut, so a failing icav2/grep is silent;
# reject an empty or multi-line result before it is used to launch analyses.
pipeline_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectpipelines list | grep 'cram_to_fastq' | cut -f 1 ", shell=True, text=True).strip()
if not pipeline_id or "\n" in pipeline_id:
    raise RuntimeError(f"Expected exactly one pipeline matching 'cram_to_fastq', got: {pipeline_id!r}")
print(f"pipeline_id: {pipeline_id}")

pipeline_id: 4a4b8bf2-f6b9-48a8-9e0c-a9b845fb6bc6


In [31]:
# Get output folder ID: fourth tab-separated field of the project data row
# matching 'cram_to_fastq_output'.
# The pipeline's exit status comes from cut, so a failing icav2/grep is silent;
# reject an empty or multi-line result before it is used as the output parent folder.
output_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name 'cram_to_fastq_output' | grep 'cram_to_fastq_output' | cut -f 4 ", shell=True, text=True).strip()
if not output_id or "\n" in output_id:
    raise RuntimeError(f"Expected exactly one data entry matching 'cram_to_fastq_output', got: {output_id!r}")
print(f"output_id: {output_id}")

output_id: fol.ad1f7f2373fa478514e808dcfde680bc


In [32]:
# Get reference FASTA file ID: fourth tab-separated field of the project data
# row matching 'hg38.fa'.
# grep 'hg38.fa' would also match sibling files whose names contain that
# pattern, and the pipeline's exit status comes from cut, so a failing
# icav2/grep is silent. Require exactly one non-empty ID.
fasta_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name 'hg38.fa' | grep 'hg38.fa' | cut -f 4  ", shell=True, text=True).strip()
if not fasta_id or "\n" in fasta_id:
    raise RuntimeError(f"Expected exactly one data entry matching 'hg38.fa', got: {fasta_id!r}")
print(f"fasta_id: {fasta_id}")

fasta_id: fil.b353222e04a44a69ad5308dc227c4360


## Launch one sample

In [26]:
sample_name = 'WHB7277'
# Get CRAM file ID: fourth tab-separated field of the project data row for
# '<sample_name>.bqsr.cram'.
# The pipeline's exit status comes from cut, so a missing file does not raise;
# stop here rather than launching an analysis with an empty or ambiguous input.
cram_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}.bqsr.cram | grep {sample_name} | cut -f 4", shell=True, text=True).strip()
if not cram_id or "\n" in cram_id:
    raise RuntimeError(f"Expected exactly one CRAM file for sample {sample_name}, got: {cram_id!r}")

print(f"sample: {sample_name} - cram_id: {cram_id}")

sample: WHB7277 - cram_id: fil.5476d4e5adb74ba357a908dce7f6b5a3


In [36]:
# Submit the Nextflow pipeline for the single sample selected above, using the
# IDs resolved in the earlier cells, and print the CLI response.
print('Launching analysis: ')
launch_parts = [
    f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id}",
    f"--project-id {project_id}",
    "--storage-size small",
    f"--user-reference {sample_name}",
    f"--user-tag {sample_name}",
    f"--output-parent-folder {output_id}",
    f"--input ref_fasta:{fasta_id}",
    f"--input input_cram:{cram_id}",
    "",  # empty last element keeps the command's trailing space
]
cmd = " ".join(launch_parts)
print(cmd)
# check_output raises CalledProcessError if the CLI exits with a non-zero status.
raw_response = subprocess.check_output(cmd, shell=True, text=True)
cmd_out = raw_response.strip()
print(cmd_out)

Launching analysis: 
icav2 -k $ICA_API_KEY projectpipelines start nextflow 4a4b8bf2-f6b9-48a8-9e0c-a9b845fb6bc6 --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4 --storage-size small --user-reference WHB7277 --user-tag WHB7277 --output-parent-folder fol.ad1f7f2373fa478514e808dcfde680bc --input ref_fasta:fil.b353222e04a44a69ad5308dc227c4360 --input input_cram:fil.5476d4e5adb74ba357a908dce7f6b5a3 
analysisPriority                      MEDIUM
analysisStorage.description           1.2TB
analysisStorage.id                    6e1b6c8f-f913-48b2-9bd0-7fc13eda0fd0
analysisStorage.name                  Small
analysisStorage.ownerId               8ec463f6-1acb-341b-b321-043c39d8716a
analysisStorage.tenantId              f91bb1a0-c55f-4bce-8014-b2e60c0ec7d3
analysisStorage.tenantName            ica-cp-admin
analysisStorage.timeCreated           2021-11-05T10:28:20Z
analysisStorage.timeModified          2023-05-31T16:38:26Z
id                                    c5747094-6c06-4883-91ad-5d2dd6ff541d

## Launch batch of samples

In [37]:
# List ICA sample objects and save the table to projectsamples-list.txt.
# check=True raises CalledProcessError when the CLI fails, instead of silently
# parsing the empty or partial file that the shell redirect leaves behind.
subprocess.run(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples list > projectsamples-list.txt", shell=True, check=True)
# Read projectsamples-list
# - header=0 together with names= replaces the file's own header row with `cols`.
# - skipfooter=1 drops the last line of the file and requires engine='python'
#   (presumably a trailing non-data line in the CLI output; confirm).
# - the converters strip surrounding whitespace from every cell.
cols = ['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS']
df_sample = pd.read_csv('projectsamples-list.txt', sep='\t', header=0, index_col=False, engine='python', skipfooter=1,
                 names=cols, converters={col: str.strip for col in cols}
                )
print(f'n={len(df_sample):,}')
print(df_sample.columns)

n=1,543
Index(['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS'], dtype='object')


In [None]:
# df_sample.iloc[0]: done using Flow UI
# df_sample.iloc[1]: done using Bench 1 sample
# df_sample.iloc[2:500]: batch 1 ~14h ~35/h
# df_sample.iloc[500:800]: batch 2 ~8h30 ~35/h
# df_sample.iloc[800:1100]: batch 3

In [41]:
# Launch the pipeline for one batch of samples.
# Rows are selected by position in df_sample; adjust the bounds for each batch.
batch_start, batch_end = 800, 1100
for i, row in df_sample.iloc[batch_start:batch_end].iterrows():
    # Get sample name
    sample_name = row['NAME']
    # Get CRAM file ID (fourth tab-separated field of the matching project data row)
    cram_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}.bqsr.cram | grep {sample_name} | cut -f 4", shell=True, text=True).strip()
    if not cram_id or "\n" in cram_id:
        # The lookup pipeline's exit status comes from cut, so a missing or
        # ambiguous CRAM only shows up as an empty or multi-line result.
        # Skip the sample instead of launching with a bad input.
        log = f"[{i}] {sample_name} :skipped: expected exactly one CRAM file ID, got {cram_id!r}"
    else:
        # Launch analysis; the CLI response is appended to ica.log by the shell.
        cmd = " ".join([
            f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id}",
            f"--project-id {project_id}",
            "--storage-size small",
            f"--user-reference {sample_name}",
            f"--user-tag {sample_name}",
            f"--output-parent-folder {output_id}",
            f"--input ref_fasta:{fasta_id}",
            f"--input input_cram:{cram_id}",
            ">> ica.log",
        ])
        # returncode is the shell's exit code; a failed launch is logged and
        # the batch continues with the next sample.
        exit_code = subprocess.run(cmd, shell=True).returncode
        # Log string
        log = f"[{i}] {sample_name} :{exit_code}: {cmd}"
    # Append the log line from Python rather than via `echo` in a shell: the
    # shell would expand $ICA_API_KEY (leaking the secret into the log file),
    # treat the embedded '>> ica.log' as a redirect, and treat '[i]' as a glob.
    with open("cram_to_fastq.log", "a") as log_file:
        log_file.write(log + "\n")
    print(log)

[500] WHH923 :0: icav2 -k $ICA_API_KEY projectpipelines start nextflow 4a4b8bf2-f6b9-48a8-9e0c-a9b845fb6bc6     --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4     --storage-size small     --user-reference WHH923     --user-tag WHH923     --output-parent-folder fol.ad1f7f2373fa478514e808dcfde680bc     --input ref_fasta:fil.b353222e04a44a69ad5308dc227c4360     --input input_cram:fil.d7e237c4e72144eeed0708dce689ffeb     >> ica.log
[501] WHH924 :0: icav2 -k $ICA_API_KEY projectpipelines start nextflow 4a4b8bf2-f6b9-48a8-9e0c-a9b845fb6bc6     --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4     --storage-size small     --user-reference WHH924     --user-tag WHH924     --output-parent-folder fol.ad1f7f2373fa478514e808dcfde680bc     --input ref_fasta:fil.b353222e04a44a69ad5308dc227c4360     --input input_cram:fil.1beff210a141484ced0d08dce689ffeb     >> ica.log
[502] WHH925 :0: icav2 -k $ICA_API_KEY projectpipelines start nextflow 4a4b8bf2-f6b9-48a8-9e0c-a9b845fb6bc6     --project-id bf47e

In [None]:
# 