# Dragen

In [1]:
%load_ext dotenv

In [2]:
%dotenv

In [3]:
import os
import pandas as pd
import subprocess
import io

## Import global parameters

In [4]:
# Report whether ICA_API_KEY is present in the environment (its value is never printed)
if 'ICA_API_KEY' in os.environ:
    print("ICA_API_KEY exists")
else:
    print("ICA_API_KEY missing")

ICA_API_KEY exists


In [5]:
# Get project ID
# Finds the project by name in the `icav2 projects list` table and keeps the first
# tab-separated field of the matching row as its ID.
project_name = 'sg10k-dragen-reanalysis'
project_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY projects list | grep {project_name} | cut -f 1 ", shell=True, text=True).strip()
# grep matches substrings and the pipeline's exit status is that of `cut`, so zero or
# several matching rows would not raise here. Later cells interpolate this value into
# shell commands, so stop unless exactly one ID came back.
if len(project_id.split()) != 1:
    raise RuntimeError(f"Expected exactly one project matching '{project_name}', got: {project_id!r}")
print(f"project: {project_name} [{project_id}]")

project: sg10k-dragen-reanalysis [bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4]


In [6]:
# Get pipeline ID
# Finds the pipeline by name in the project's pipeline list and keeps the first
# tab-separated field of the matching row as its ID.
pipeline_name = 'GermlineWithQc378_SG100KpipelineV1'
pipeline_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectpipelines list | grep {pipeline_name} | cut -f 1 ", shell=True, text=True).strip()
# grep matches substrings and the pipeline's exit status is that of `cut`, so zero or
# several matching rows would not raise here. The ID is later interpolated into the
# launch command, so stop unless exactly one ID came back.
if len(pipeline_id.split()) != 1:
    raise RuntimeError(f"Expected exactly one pipeline matching '{pipeline_name}', got: {pipeline_id!r}")
print(f"pipeline: {pipeline_name} [{pipeline_id}]")

pipeline: GermlineWithQc378_SG100KpipelineV1 [b9b3cdde-1e0e-4079-830d-385770b86b72]


In [7]:
# Get pipeline parameters
# Prints the table of inputs the pipeline accepts (ID, code, required, multi-value);
# the codes are the names used with `--input` when launching an analysis.
# pipeline_id comes from the "Get pipeline ID" cell. The pipeline name is deliberately
# not re-declared here: it was unused in this cell and could drift from the cell that
# actually resolves pipeline_id.
params = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectpipelines input {pipeline_id}", shell=True, text=True).strip()
print(params)

[1mID      [0m 	[1mCODES       [0m 	[1mREQUIRED[0m 	[1mMULTI VALUE[0m  
32583051	autosome_bed	true    	false      	
32583048	cram        	false   	false      	
32583047	fastq_list  	false   	false      	
32583049	fastqs      	false   	true       	
32583050	ref_tar     	true    	false      	
No of items :  5


In [8]:
# Get output folder ID
# Looks the folder up by name in the project data listing and keeps the fourth
# tab-separated field of the matching row as its ID.
output_name = 'dragen_378_output'
output_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {output_name} | grep {output_name} | cut -f 4 ", shell=True, text=True).strip()
# grep matches substrings and the pipeline's exit status is that of `cut`, so zero or
# several matching rows would not raise here. The ID is later passed to
# --output-parent-folder, so stop unless exactly one ID came back.
if len(output_id.split()) != 1:
    raise RuntimeError(f"Expected exactly one data entry matching '{output_name}', got: {output_id!r}")
print(f"output folder: {output_name} [{output_id}]")

output folder: dragen_378_output [fol.d8ba95e57f51475fc06508dcfe056ca7]


In [9]:
# Get ref ID
# Looks the reference tarball up by name in the project data listing and keeps the
# fourth tab-separated field of the matching row as its ID.
ref_name = 'hg38_full_analysis_set_graph.tar'
ref_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {ref_name} | grep {ref_name} | cut -f 4 ", shell=True, text=True).strip()
# grep matches substrings and the pipeline's exit status is that of `cut`, so zero or
# several matching rows would not raise here. The ID is later passed as the ref_tar
# input, so stop unless exactly one ID came back.
if len(ref_id.split()) != 1:
    raise RuntimeError(f"Expected exactly one data entry matching '{ref_name}', got: {ref_id!r}")
print(f"ref: {ref_name} [{ref_id}]")

ref: hg38_full_analysis_set_graph.tar [fil.f9c8f899de7f4c418b4008db2043fa32]


In [10]:
# Get autosomes ID
# Looks the autosomes BED file up by name in the project data listing and keeps the
# fourth tab-separated field of the matching row as its ID.
autosomes_name = 'autosomes.bed'
autosomes_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {autosomes_name} | grep {autosomes_name} | cut -f 4 ", shell=True, text=True).strip()
# grep matches substrings and the pipeline's exit status is that of `cut`, so zero or
# several matching rows would not raise here. The ID is later passed as the
# autosome_bed input, so stop unless exactly one ID came back.
if len(autosomes_id.split()) != 1:
    raise RuntimeError(f"Expected exactly one data entry matching '{autosomes_name}', got: {autosomes_id!r}")
# Label says "autosomes" (it previously repeated the "ref" label of the cell above)
print(f"autosomes: {autosomes_name} [{autosomes_id}]")

ref: autosomes.bed [fil.9efff4fe5d9b43a9346508db2043fa32]


In [None]:
# List ICA sample objects
# Save the sample list in a txt file
# This one-time sample list will be used to loop over all the available samples
###
# os.system(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples list > projectsamples-list.txt")
# # Read projectsamples-list
# cols = ['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS']
# df_sample = pd.read_csv('projectsamples-list.txt', sep='\t', header=0, index_col=False, engine='python', skipfooter=1,
#                  names=cols, converters={col: str.strip for col in cols}
#                 )
# print(f'n={len(df_sample):,}')
# print(df_sample.columns)

## Single sample

In [67]:
# Sample to process: its name and its ICA sample ID
# (both values will be extracted from projectsamples-list)
sample_name, sample_id = 'WHH430', '59d306d4-fee1-4d3e-8f78-f2e8f3a13b16'

In [72]:
# List the fastq.gz files linked to the sample, keeping only rows that contain '/fastq_output/'
list_fastq_cmd = f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples listdata {sample_id} --file-name fastq.gz --match-mode FUZZY | grep '/fastq_output/' "
fastqs = subprocess.check_output(list_fastq_cmd, shell=True, text=True).strip()

# Parse the tab-separated rows: column names are supplied explicitly and every
# field is stripped of surrounding whitespace
cols = ['NAME', 'TYPE', 'STATUS', 'ID', 'PROJECT_NAME', 'PROJECT_ID', 'PATH']
strip_fields = {col: str.strip for col in cols}
df_fq = pd.read_csv(
    io.StringIO(fastqs),
    sep='\t',
    index_col=False,
    engine='python',
    names=cols,
    converters=strip_fields,
)

# Order the rows by file name (R1 before R2)
df_fq = df_fq.sort_values(by='NAME')

df_fq

Unnamed: 0,NAME,TYPE,STATUS,ID,PROJECT_NAME,PROJECT_ID,PATH
1,WHH430_R1.fastq.gz,FILE,AVAILABLE,fil.41ed7fd6b4774feca46508dd0879ac94,sg10k-dragen-reanalysis,bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4,/fastq_output/WHH430-96d4b6f2-6f82-4723-8144-d...
0,WHH430_R2.fastq.gz,FILE,AVAILABLE,fil.6881278d2cbd4c59a46b08dd0879ac94,sg10k-dragen-reanalysis,bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4,/fastq_output/WHH430-96d4b6f2-6f82-4723-8144-d...


In [73]:
# Get fastq IDs: comma-joined in df_fq's current row order (nothing is reversed here)
fastq_ids = ",".join(df_fq["ID"].tolist())
print(f"Sample {sample_name} - input fastq_ids {fastq_ids}")

Sample WHH430 - input fastq_ids fil.41ed7fd6b4774feca46508dd0879ac94,fil.6881278d2cbd4c59a46b08dd0879ac94


In [74]:
# Create FASTQ list file
###
# Create manifest: one header row plus a single read-group row for the sample.
# The file names are joined in df_fq's row order, so the frame is expected to list
# the Read1 file before the Read2 file.
manifest = f"""
RGID,RGSM,RGLB,Lane,Read1File,Read2File
{sample_name}_FC1,{sample_name},{sample_name}_LB1,1,{','.join(df_fq['NAME'])}
"""

print(manifest)

# Save manifest into a CSV file
###
# The target folder is created if it is missing, so the cell runs on a fresh checkout
###
manifest_path = f"03.Dragen/fastq_lists/{sample_name}_fastq_list.csv"
os.makedirs(os.path.dirname(manifest_path), exist_ok=True)
with open(manifest_path, "w") as f:
    # strip() removes the blank first/last lines that come from the triple-quoted literal
    f.write(manifest.strip())
print(f"[Manifest saved to {manifest_path}]")


RGID,RGSM,RGLB,Lane,Read1File,Read2File
WHH430_FC1,WHH430,WHH430_LB1,1,WHH430_R1.fastq.gz,WHH430_R2.fastq.gz

[Manifest saved to 03.Dragen/fastq_lists/WHH430_fastq_list.csv]


In [75]:
# Upload FASTQ list file into ICA (destination: /fastq_list/)
upload_cmd = f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata upload {manifest_path} /fastq_list/"
upload = subprocess.check_output(upload_cmd, shell=True, text=True).strip()

# Get file id: look the uploaded file up by name and keep the 4th tab-separated field
lookup_cmd = f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}_fastq_list.csv | grep fastq_list.csv | cut -f 4"
fastq_list_id = subprocess.check_output(lookup_cmd, shell=True, text=True).strip()

# Link fastq list to the sample; `link` holds the status returned by os.system
link_cmd = f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples link {sample_id} --data-id {fastq_list_id}"
link = os.system(link_cmd)

print(f"fastq list: {sample_name}_fastq_list.csv [{fastq_list_id}] - linked [{link}]")

fastq list: WHH430_fastq_list.csv [fil.594ecbd863b34cfac12908dcfe056ca7] - linked [0]


In [76]:
# Launch analysis
print('Launching analysis: ')

# The command is assembled from adjacent f-strings; each piece ends with a single
# space, so the result is one shell line (including a trailing space).
cmd = (
    f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id} "
    f"--project-id {project_id} "
    f"--storage-size small "
    f"--user-reference {sample_name} "
    f"--user-tag {sample_name} "
    f"--output-parent-folder {output_id} "
    f"--input fastq_list:{fastq_list_id} "
    f"--input fastqs:{fastq_ids} "
    f"--input ref_tar:{ref_id} "
    f"--input autosome_bed:{autosomes_id} "
    f"--parameters cyp2d6_enabled:true "
    f"--parameters prefix:{sample_name} "
)
print(cmd)

# check_output raises CalledProcessError if the CLI exits non-zero;
# the captured output is kept in cmd_out but not displayed
cmd_out = subprocess.check_output(cmd, shell=True, text=True).strip()
print(f"sample {sample_name} launched")

Launching analysis: 
icav2 -k $ICA_API_KEY projectpipelines start nextflow b9b3cdde-1e0e-4079-830d-385770b86b72 --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4 --storage-size small --user-reference WHH430 --user-tag WHH430 --output-parent-folder fol.d8ba95e57f51475fc06508dcfe056ca7 --input fastq_list:fil.594ecbd863b34cfac12908dcfe056ca7 --input fastqs:fil.41ed7fd6b4774feca46508dd0879ac94,fil.6881278d2cbd4c59a46b08dd0879ac94 --input ref_tar:fil.f9c8f899de7f4c418b4008db2043fa32 --input autosome_bed:fil.9efff4fe5d9b43a9346508db2043fa32 --parameters cyp2d6_enabled:true --parameters prefix:WHH430 
sample WHH430 launched


## Batch of samples

In [11]:
# Load the saved ICA sample listing. The file was produced once with:
# os.system(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples list > projectsamples-list.txt")
cols = ['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS']
df_sample = (
    pd.read_csv(
        'projectsamples-list.txt',
        sep='\t',
        header=0,         # the file's own header line is replaced by `names`
        index_col=False,
        engine='python',  # required for skipfooter
        skipfooter=1,     # ignore the last line of the file
        names=cols,
        converters={col: str.strip for col in cols},
    )
    .drop(0)  # Drop NPM1NA (the row labelled 0)
)
print(f'n={len(df_sample):,}')
print(df_sample.columns)

n=1,543
Index(['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS'], dtype='object')


In [None]:
# df_sample.iloc[0]: # launched using single sample
# df_sample.iloc[1:5]: # done
# df_sample.iloc[5:200]: # done
# df_sample.iloc[200:400]: # 3 rerun
# df_sample.iloc[400:600]: 

# WHB7277 CYP2D6 error

In [12]:
# Launch one pipeline analysis per sample in the current batch slice.
# For each sample: list its FASTQ files, write and upload a fastq_list CSV, link it
# to the sample, start the pipeline, then append one line to cram_to_fastq.log.

# Column layout used to parse the `projectsamples listdata` rows (same for every sample)
cols = ['NAME', 'TYPE', 'STATUS', 'ID', 'PROJECT_NAME', 'PROJECT_ID', 'PATH']
# Create the local manifest folder if it is missing
os.makedirs("03.Dragen/fastq_lists", exist_ok=True)

# For each sample
for i, row in df_sample.iloc[400:600].iterrows():
    # Get sample name & sample ID
    sample_name = row['NAME']
    sample_id = row['ID']
    
    # List fastq files linked to the sample (only rows containing '/fastq_output/').
    # check_output raises CalledProcessError if grep finds no such row, which stops the batch.
    fastqs = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples listdata {sample_id} --file-name fastq.gz --match-mode FUZZY | grep '/fastq_output/' ", shell=True, text=True).strip()
    # parse list
    df_fq = pd.read_csv(io.StringIO(fastqs), sep='\t', index_col=False, engine='python', names=cols, converters={col: str.strip for col in cols})
    # Sort R1, R2
    df_fq = df_fq.sort_values(by='NAME')
    # Get fastq IDs, comma-joined in the sorted row order (nothing is reversed)
    fastq_ids = ",".join(df_fq["ID"])
    
    # Create FASTQ list file (header row + one read-group row; file names in sorted order)
    manifest = f"""
RGID,RGSM,RGLB,Lane,Read1File,Read2File
{sample_name}_FC1,{sample_name},{sample_name}_LB1,1,{','.join(df_fq['NAME'])}
"""
    # Save manifest into a CSV file
    manifest_path = f"03.Dragen/fastq_lists/{sample_name}_fastq_list.csv"
    with open(manifest_path, "w") as f:
        # strip() removes the blank first/last lines that come from the triple-quoted literal
        f.write(manifest.strip())
    
    # Upload FASTQ list file into ICA
    upload = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata upload {manifest_path} /fastq_list/", shell=True, text=True).strip()
    # Get file id
    fastq_list_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}_fastq_list.csv | grep fastq_list.csv | cut -f 4", shell=True, text=True).strip()
    # Link fastq list to the sample
    link = os.system(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples link {sample_id} --data-id {fastq_list_id}")

    # Set ICA command (the CLI's stdout is appended to ica.log)
    cmd = f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id} \
    --project-id {project_id} \
    --storage-size small \
    --user-reference {sample_name} \
    --user-tag {sample_name} \
    --output-parent-folder {output_id} \
    --input fastq_list:{fastq_list_id} \
    --input fastqs:{fastq_ids} \
    --input ref_tar:{ref_id} \
    --input autosome_bed:{autosomes_id} \
    --parameters cyp2d6_enabled:true \
    --parameters prefix:{sample_name} \
    >> ica.log"
    # Launch analysis; the status is stored under a name that does not shadow the builtin `exit`
    exit_status = os.system(cmd)
    # Log string
    log = f"[{i}] {sample_name} :{exit_status}: {cmd}"
    # Append the line with Python instead of `echo` through the shell: the shell would
    # expand $ICA_API_KEY (writing the secret key into the log), treat "[i]" as a glob
    # pattern, and parse the ">> ica.log" inside cmd as a redirection.
    with open("cram_to_fastq.log", "a") as log_file:
        log_file.write(log + "\n")
    print(log)

[201] WHH629 :0: icav2 -k $ICA_API_KEY projectpipelines start nextflow b9b3cdde-1e0e-4079-830d-385770b86b72     --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4     --storage-size small     --user-reference WHH629     --user-tag WHH629     --output-parent-folder fol.d8ba95e57f51475fc06508dcfe056ca7     --input fastq_list:fil.ab1261a6927e4f08af3008dd10fde41e     --input fastqs:fil.c1aae17a796a453b5b1608dd0a8ab0a2,fil.2913d40ecbc641455b3508dd0a8ab0a2     --input ref_tar:fil.f9c8f899de7f4c418b4008db2043fa32     --input autosome_bed:fil.9efff4fe5d9b43a9346508db2043fa32     --parameters cyp2d6_enabled:true     --parameters prefix:WHH629     >> ica.log
[202] WHH630 :0: icav2 -k $ICA_API_KEY projectpipelines start nextflow b9b3cdde-1e0e-4079-830d-385770b86b72     --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4     --storage-size small     --user-reference WHH630     --user-tag WHH630     --output-parent-folder fol.d8ba95e57f51475fc06508dcfe056ca7     --input fastq_list:fil.26f6fe464d644221

## CYP2D6 disabled

In [37]:
# Re-launch a single sample with CYP2D6 calling disabled (cyp2d6_enabled:false) and an
# extra 'CYP2D6-OFF' user tag. The steps mirror the batch loop: list FASTQs, write and
# upload the fastq_list CSV, link it, start the pipeline, append to cram_to_fastq.log.

# Set sample name & sample ID
sample_name = 'WHH619'
sample_id = '2e298d3c-525e-4b97-a7d9-de9571dff188'
    
# List fastq files linked to the sample (only rows containing '/fastq_output/')
fastqs = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples listdata {sample_id} --file-name fastq.gz --match-mode FUZZY | grep '/fastq_output/' ", shell=True, text=True).strip()
# parse list
cols = ['NAME', 'TYPE', 'STATUS', 'ID', 'PROJECT_NAME', 'PROJECT_ID', 'PATH']
df_fq = pd.read_csv(io.StringIO(fastqs), sep='\t', index_col=False, engine='python', names=cols, converters={col: str.strip for col in cols})
# Sort R1, R2
df_fq = df_fq.sort_values(by='NAME')
# Get fastq IDs, comma-joined in the sorted row order (nothing is reversed)
fastq_ids = ",".join(df_fq["ID"])
    
# Create FASTQ list file (header row + one read-group row; file names in sorted order)
manifest = f"""
RGID,RGSM,RGLB,Lane,Read1File,Read2File
{sample_name}_FC1,{sample_name},{sample_name}_LB1,1,{','.join(df_fq['NAME'])}
"""
# Save manifest into a CSV file (the target folder is created if it is missing)
manifest_path = f"03.Dragen/fastq_lists/{sample_name}_fastq_list.csv"
os.makedirs(os.path.dirname(manifest_path), exist_ok=True)
with open(manifest_path, "w") as f:
    # strip() removes the blank first/last lines that come from the triple-quoted literal
    f.write(manifest.strip())
    
# Upload FASTQ list file into ICA
upload = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata upload {manifest_path} /fastq_list/", shell=True, text=True).strip()
# Get file id
fastq_list_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectdata list --file-name {sample_name}_fastq_list.csv | grep fastq_list.csv | cut -f 4", shell=True, text=True).strip()
# Link fastq list to the sample
link = os.system(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples link {sample_id} --data-id {fastq_list_id}")

# Set ICA command (the CLI's stdout is appended to ica.log)
cmd = f"icav2 -k $ICA_API_KEY projectpipelines start nextflow {pipeline_id} \
--project-id {project_id} \
--storage-size small \
--user-reference {sample_name} \
--user-tag {sample_name} \
--user-tag 'CYP2D6-OFF' \
--output-parent-folder {output_id} \
--input fastq_list:{fastq_list_id} \
--input fastqs:{fastq_ids} \
--input ref_tar:{ref_id} \
--input autosome_bed:{autosomes_id} \
--parameters cyp2d6_enabled:false \
--parameters prefix:{sample_name} \
>> ica.log"
# Launch analysis; the status is stored under a name that does not shadow the builtin `exit`
exit_status = os.system(cmd)
# Log string
log = f"{sample_name} :{exit_status}: {cmd}"
# Append the line with Python instead of `echo` through the shell: the shell would
# expand $ICA_API_KEY (writing the secret key into the log), strip the quotes around
# the tag, and parse the ">> ica.log" inside cmd as a redirection.
with open("cram_to_fastq.log", "a") as log_file:
    log_file.write(log + "\n")
print(log)

WHH619 :0: icav2 -k $ICA_API_KEY projectpipelines start nextflow b9b3cdde-1e0e-4079-830d-385770b86b72 --project-id bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4 --storage-size small --user-reference WHH619 --user-tag WHH619 --output-parent-folder fol.d8ba95e57f51475fc06508dcfe056ca7 --input fastq_list:fil.a939dde176bf4c131a8108dd0d1862d1 --input fastqs:fil.da5ebe96a1a74a3e569c08dd0a8ab0a2,fil.eb81f0be87f442dab41008dcfe048f15 --input ref_tar:fil.f9c8f899de7f4c418b4008db2043fa32 --input autosome_bed:fil.9efff4fe5d9b43a9346508db2043fa32 --parameters cyp2d6_enabled:false --parameters prefix:WHH619 >> ica.log
