# CRAM to FASTQ processing

In [1]:
%load_ext dotenv

In [31]:
%dotenv

In [47]:
import os
import pandas as pd
import re
import subprocess

## Create sample objects

In [20]:
# Load the CRAM manifest: a headerless CSV whose two columns are named
# 'cram_bucket' and 'cram' here.
manifest_path = f"../{os.environ['CRAM_MANIFEST']}"
df_cram = pd.read_csv(manifest_path, header=None, names=['cram_bucket', 'cram'])

# The sample name is the run of non-'/' characters directly before '.bqsr'.
sample_pattern = re.compile(r'([^/]+)\.bqsr')
df_cram['sample'] = df_cram['cram'].map(lambda cram_path: sample_pattern.search(cram_path).group(1))

df_cram.columns

Index(['cram_bucket', 'cram', 'sample'], dtype='object')

In [None]:
# For each sample in the CRAM manifest, create an ICA sample object.
# The CLI is invoked with an argument list (no shell), so each sample name is
# passed as one literal argument whatever characters it contains.
# Non-zero exit codes are collected and reported instead of being discarded.
failed_samples = []
for sample in df_cram['sample']:
    # Create sample object
    result = subprocess.run([
        "icav2", "-k", os.environ["ICA_API_KEY"], "--project-id", os.environ["ICA_PROJECT_ID"],
        "projectsamples", "create", sample,
    ])
    if result.returncode != 0:
        failed_samples.append(sample)

if failed_samples:
    print(f"Failed to create {len(failed_samples):,} sample object(s): {failed_samples}")

In [55]:
# Dump the project's ICA sample objects to projectsamples-list.txt
os.system("icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectsamples list > projectsamples-list.txt")

# Parse the tab-separated listing: the file's own header row is replaced by
# `cols`, the last line is dropped, and every value is whitespace-stripped.
cols = ['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS']
read_options = dict(
    sep='\t',
    header=0,
    index_col=False,
    engine='python',
    skipfooter=1,
    names=cols,
    converters=dict.fromkeys(cols, str.strip),
)
df_sample = pd.read_csv('projectsamples-list.txt', **read_options)

print('n={:,}'.format(len(df_sample)))
print(df_sample.columns)

n=1,543
Index(['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS'], dtype='object')


In [None]:
# For each sample object that is not yet completed (STATUS == 'PARTIAL'):
# mark it complete, then tag it with its own name.
# The CLI is invoked with argument lists (no shell), so IDs and names are
# passed as literal arguments; any non-zero exit code is reported.
icav2_base = ["icav2", "-k", os.environ["ICA_API_KEY"], "--project-id", os.environ["ICA_PROJECT_ID"]]
partial_samples = df_sample.loc[df_sample['STATUS'].str.strip() == 'PARTIAL']
for _, row in partial_samples.iterrows():
    # Complete the sample
    completed = subprocess.run(icav2_base + ["projectsamples", "complete", row['ID']])
    # Tag the sample (attempted even if completion failed, as before)
    tagged = subprocess.run(icav2_base + ["projectsamples", "update", row['ID'], "--add-user-tag", row['NAME']])
    if completed.returncode != 0 or tagged.returncode != 0:
        print(f"{row['NAME']} failed - complete [{completed.returncode}] - tag [{tagged.returncode}]")


## Copy CRAM files to ICA

In [None]:
# For each row in the CRAM manifest, copy the CRAM and its .crai index into
# the "cram/" folder under ICA_PREFIX, and record both statuses in
# cram_copy.log.
for i, row in df_cram.iterrows():
    source = f"s3://{row['cram_bucket']}/{row['cram']}"
    destination = f"{os.environ['ICA_PREFIX']}cram/"
    # NOTE: os.system returns the raw wait status on Unix (e.g. 256 for exit
    # code 1). It is kept so new entries stay comparable with those already
    # in the append-only log.
    cram_exit = os.system(f"aws s3 cp {source} {destination}")
    crai_exit = os.system(f"aws s3 cp {source}.crai {destination}")
    log = f"[{i}] {row['sample']} done - cram [{cram_exit}] - crai [{crai_exit}]"
    # Append from Python instead of `echo ... >>` through the shell, which
    # would treat the bracketed fields as glob patterns and re-split the text.
    with open('cram_copy.log', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{log}\n")
    print(log)

## Link CRAM & CRAI to the sample

In [None]:
# For each sample, look up the IDs of its CRAM and CRAI files and link both
# to the sample object. Every outcome is appended to cram_link.log.
for i, row in df_sample.iterrows():
    # Get CRAM file ID (4th tab-separated field of the matching listing line)
    cram_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectdata list --file-name {row['NAME']}.bqsr.cram | grep {row['NAME']} | cut -f 4", shell=True, text=True).strip()
    # Get CRAI file ID
    crai_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectdata list --file-name {row['NAME']}.bqsr.cram.crai | grep {row['NAME']} | cut -f 4", shell=True, text=True).strip()
    # Each lookup must yield exactly one ID: an empty result would leave a
    # dangling --data-id flag, and a multi-line result would split the shell
    # command below at the embedded newline.
    if len(cram_id.split()) != 1 or len(crai_id.split()) != 1:
        log = f"[{i}] {row['NAME']} not linked - expected exactly one ID each, got cram [{cram_id!r}] - crai [{crai_id!r}]"
    else:
        # Link the files to the sample object
        # (os.system returns the raw wait status on Unix, e.g. 256 for exit code 1)
        link_status = os.system(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectsamples link {row['ID']} --data-id {cram_id} --data-id {crai_id}")
        # log string
        log = f"[{i}] {row['NAME']} linked - [{link_status}]"
    # Append from Python instead of `echo ... >>` through the shell, which
    # would treat the bracketed fields as glob patterns and re-split the text.
    with open('cram_link.log', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{log}\n")
    print(log)

In [None]:
# From the first pass above, a few samples errored.
# In the log file a failure appears like: [45] WHH473 linked - [256]
# We can create a list of the samples that failed & retry the linkage

# List of samples with error
# samples = ['WHH473'] # Test
samples = ['WHH474', 'WHH475', 'WHH483', 'WHH502', 'WHH884', 'WHH1385', 'WHH1539', 'WHH1540', 'WHH2160', 'WHH2161', 'WHH2162']
# For each sample
for sample in samples:
    # Get sample ID (1st tab-separated field of the matching listing line)
    sample_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectsamples list --user-tag {sample} | grep {sample} | cut -f 1", shell=True, text=True).strip()
    # Get CRAM file ID (4th tab-separated field of the matching listing line)
    cram_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectdata list --file-name {sample}.bqsr.cram | grep {sample} | cut -f 4", shell=True, text=True).strip()
    # Get CRAI file ID
    crai_id = subprocess.check_output(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectdata list --file-name {sample}.bqsr.cram.crai | grep {sample} | cut -f 4", shell=True, text=True).strip()
    # Each lookup must yield exactly one ID: an empty result would leave a
    # dangling argument, and a multi-line result would split the shell
    # command below at the embedded newline.
    lookups = {'sample': sample_id, 'cram': cram_id, 'crai': crai_id}
    unusable = {kind: found for kind, found in lookups.items() if len(found.split()) != 1}
    if unusable:
        log = f"[fix] {sample} not linked - expected exactly one ID, got {unusable}"
    else:
        # Link the files to the sample object
        # (os.system returns the raw wait status on Unix, e.g. 256 for exit code 1)
        link_status = os.system(f"icav2 -k $ICA_API_KEY --project-id $ICA_PROJECT_ID projectsamples link {sample_id} --data-id {cram_id} --data-id {crai_id}")
        # log string
        log = f"[fix] {sample} linked - [{link_status}]"
    # Append from Python instead of `echo ... >>` through the shell, which
    # would treat the bracketed fields as glob patterns and re-split the text.
    with open('cram_link.log', 'a', encoding='utf-8') as log_file:
        log_file.write(f"{log}\n")
    print(log)