# Clean-up

In [1]:
# Load the dotenv IPython extension so that the %dotenv magic is available
%load_ext dotenv

In [2]:
# Load variables from a .env file into the environment
# (presumably this is where ICA_API_KEY, checked below, comes from - confirm)
%dotenv

In [3]:
import os
import pandas as pd
import subprocess
import io

In [4]:
# Report whether ICA_API_KEY is present in the environment
# (only its presence is printed, never its value)
key_status = 'exists' if 'ICA_API_KEY' in os.environ else 'missing'
print(f"ICA_API_KEY {key_status}")

ICA_API_KEY exists


In [5]:
# Get project ID
# The project listing is filtered in Python instead of through a
# `icav2 | grep | cut` shell pipeline: a pipeline only reports the exit status
# of its last command, so a failing icav2 call or a name that matches nothing
# would silently produce an empty project ID.
project_name = 'sg10k-dragen-reanalysis'
projects_listing = subprocess.run(
    ["icav2", "-k", os.environ["ICA_API_KEY"], "projects", "list"],
    check=True,  # raise CalledProcessError if icav2 itself fails
    capture_output=True,
    text=True,
).stdout

# The project ID is the first tab-separated field of each matching line
matching_ids = [
    line.split('\t')[0].strip()
    for line in projects_listing.splitlines()
    if project_name in line
]

# A substring match can hit zero or several projects; every later command
# relies on a single unambiguous ID, so fail loudly otherwise
if len(matching_ids) != 1:
    raise ValueError(
        f"expected exactly one project matching {project_name!r}, "
        f"found {len(matching_ids)}: {matching_ids}"
    )

project_id = matching_ids[0]
print(f"project: {project_name} [{project_id}]")

project: sg10k-dragen-reanalysis [bf47eb3e-868e-4f88-97ac-c2a76c3c6ac4]


## Copy the DRAGEN outputs into archives

In [6]:
# Load the cached listing of ICA sample objects.
# To regenerate the cache file, run:
# os.system(f"icav2 -k $ICA_API_KEY --project-id {project_id} projectsamples list > projectsamples-list.txt")
cols = ['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS']
# Every column is read through str.strip to trim surrounding whitespace
strip_fields = dict.fromkeys(cols, str.strip)
df_sample = pd.read_csv(
    'projectsamples-list.txt',
    sep='\t',
    names=cols,
    header=0,        # the file's own header line is replaced by `cols`
    skipfooter=1,    # the last line of the listing is not a sample row
    index_col=False,
    engine='python',
    converters=strip_fields,
)
# Drop the first data row (index label 0); the original note names it NPM1NA#
df_sample = df_sample.drop(index=0)

print(f'n={df_sample.shape[0]:,}')
print(df_sample.columns)

n=1,543
Index(['ID', 'NAME', 'STATUS', 'DESCRIPTION', 'USER TAGS', 'TECHNICAL TAGS'], dtype='object')


In [11]:
# For each sample, look up its DRAGEN gVCF in the project data and store the
# matching listing lines in '<file>-list.txt' (read back in a later cell).
file = 'hard-filtered.gvcf.gz'
samples_without_gvcf = []

# The list file is rebuilt from scratch on every run. Appending to it would
# make each re-run of this cell duplicate all entries.
with open(f"{file}-list.txt", 'w') as list_out:
    for sample_name in df_sample['NAME']:
        # Argument list instead of a shell string: sample names are never
        # interpreted by a shell
        listing = subprocess.run(
            ["icav2", "-k", os.environ["ICA_API_KEY"], "--project-id", project_id,
             "projectdata", "list", "--file-name", f"{sample_name}.{file}"],
            capture_output=True,
            text=True,
        )
        if listing.returncode != 0:
            print(f"icav2 failed for {sample_name} [{listing.returncode}]: {listing.stderr.strip()}")

        # Keep only the lines that mention the sample, as `grep` did
        # (presumably this drops any header or summary lines of the listing)
        hits = [line for line in listing.stdout.splitlines() if sample_name in line]
        if not hits:
            samples_without_gvcf.append(sample_name)
        list_out.writelines(f"{line}\n" for line in hits)

# Surface samples that produced no entry instead of dropping them silently
if samples_without_gvcf:
    print(f"no gVCF listed for {len(samples_without_gvcf):,} sample(s): {samples_without_gvcf}")

print('done')

done


In [17]:
# Load the gVCF listing: tab-separated, no header line, with surrounding
# whitespace trimmed from every field
cols = ['NAME', 'TYPE', 'STATUS', 'ID', 'PROJECT_NAME', 'PROJECT_ID', 'PATH']
strip_fields = dict.fromkeys(cols, str.strip)
df_vcf = pd.read_csv(
    'hard-filtered.gvcf.gz-list.txt',
    sep='\t',
    names=cols,
    header=None,
    index_col=False,
    engine='python',
    converters=strip_fields,
)

print(f'n={df_vcf.shape[0]:,}')
print(df_vcf.columns)

n=1,542
Index(['NAME', 'TYPE', 'STATUS', 'ID', 'PROJECT_NAME', 'PROJECT_ID', 'PATH'], dtype='object')


In [18]:
# Show every row whose NAME occurs more than once (empty when all are unique)
has_repeated_name = df_vcf['NAME'].duplicated(keep=False)
df_vcf[has_repeated_name]

Unnamed: 0,NAME,TYPE,STATUS,ID,PROJECT_NAME,PROJECT_ID,PATH


In [19]:
# Define the S3 prefix
source_prefix = "s3://precise-ica-storage/byob/sg10k-dragen-reanalysis"
target_prefix = "s3://precise-wgs-databundle/byob/sg10k-wgs-dragen"

# Run directories are named '<sample ID>-<UUID>'. Both parts are taken from a
# single match anchored on the canonical 8-4-4-4-12 UUID shape, so they always
# split the directory name at the same place. Two independent patterns can
# disagree when a sample ID itself contains a hyphen.
run_dir_pattern = (
    r'/dragen_378_output/(?P<SAMPLE_ID>[^/]+?)-'
    r'(?P<UUID>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})/output/'
)
run_dir_parts = df_vcf['PATH'].str.extract(run_dir_pattern)

# Directory holding the DRAGEN germline outputs, relative to the project root
germline_dir = df_vcf['PATH'].str.extract(
    r'(/dragen_378_output/.+/output/dragen-germline/output/)', expand=False
)

# A path that does not follow the expected layout would yield NaN and later be
# handed to the copy step as a bogus location; stop here instead
unparsed = run_dir_parts.isna().any(axis=1) | germline_dir.isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} PATH value(s) do not match the expected layout: "
        f"{df_vcf.loc[unparsed, 'PATH'].tolist()}"
    )

# Extract sample ID
df_vcf['SAMPLE_ID'] = run_dir_parts['SAMPLE_ID']
df_vcf['UUID'] = run_dir_parts['UUID']

# Reconstruct source path
df_vcf['SOURCE_PATH'] = source_prefix + germline_dir
# Reconstruct target path
df_vcf['TARGET_PATH'] = target_prefix + '/' + df_vcf['SAMPLE_ID'] + '/' + df_vcf['UUID'] + '/output/'


In [22]:
# Copy each sample's DRAGEN output directory to the archive bucket with the
# DEEP_ARCHIVE storage class, logging one line per sample to
# databundle_copy.log and to the cell output.
failed_samples = []

# The log is appended from Python rather than through a shell `echo`, where
# the unquoted '[...]' parts of the message would be treated as glob patterns
with open('databundle_copy.log', 'a') as copy_log:
    for i, row in df_vcf.iterrows():
        if pd.isna(row['SOURCE_PATH']) or pd.isna(row['TARGET_PATH']):
            # Never hand a missing path to `aws s3 sync`
            log = f"[{i}] {row['SAMPLE_ID']} skipped [missing path]"
            failed_samples.append(row['SAMPLE_ID'])
        else:
            # Copy Data bundle
            # subprocess.run reports the command's real exit code, whereas
            # os.system returns an encoded wait status on Unix
            sync = subprocess.run(
                ["aws", "s3", "sync", row['SOURCE_PATH'], row['TARGET_PATH'],
                 "--storage-class", "DEEP_ARCHIVE"]
            )
            log = f"[{i}] {row['SAMPLE_ID']} done [{sync.returncode}]"
            if sync.returncode != 0:
                failed_samples.append(row['SAMPLE_ID'])

        copy_log.write(f"{log}\n")
        copy_log.flush()  # keep the log current during the long-running copy
        print(log)

# Summarise failures so they are not lost in the per-sample output
if failed_samples:
    print(f"{len(failed_samples):,} sample(s) not copied cleanly: {failed_samples}")

aws s3 sync s3://precise-ica-storage/byob/sg10k-dragen-reanalysis/dragen_378_output/WHH430-ad8eb831-23cb-4ce9-87b3-a333680c4b60/output/dragen-germline/output/ s3://precise-wgs-databundle/byob/sg10k-wgs-dragen/WHH430/ad8eb831-23cb-4ce9-87b3-a333680c4b60/output/ --dryrun
