In [None]:
# Load libraries
from IPython.display import display, HTML
import pandas as pd
import polars as pl

import os
import subprocess

# Value of the WORKSPACE_CDR environment variable, read via the IPython %env magic.
# NOTE(review): `version` is not referenced again in the visible cells.
version = %env WORKSPACE_CDR
# Workspace bucket prefix taken from WORKSPACE_BUCKET (None when the variable is unset);
# passed to run_vcf_export() below as the root for output paths.
my_bucket = os.getenv('WORKSPACE_BUCKET')

# Functions: dsub job status and cancellation helpers

In [None]:
# !gsutil ls {my_bucket}/data/stg009/eur/

In [None]:
def check_dsub_status(user=None, full=False):
    """Run ``dstat`` for one user's dsub jobs, whatever their status.

    Parameters
    ----------
    user : str, optional
        dsub user name to query. When omitted, the part of the
        ``OWNER_EMAIL`` environment variable before the ``@`` is used.
    full : bool, default False
        If truthy, append ``--full`` to the dstat command.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the shell invocation; output is not captured.
    """
    # Fall back to the local part of the owner's e-mail address.
    owner = user if user is not None else os.getenv("OWNER_EMAIL").split('@')[0]
    gcp_project = os.getenv("GOOGLE_PROJECT")
    full_flag = ' --full' if full else ''

    cmd = f"dstat --provider google-cls-v2 --user {owner} --status '*' --project {gcp_project}{full_flag}"
    print(f"Running: {cmd}")
    return subprocess.run(cmd, shell=True, capture_output=False)

In [None]:
def job_details(user=None, job=None):
    """Show full (``--full``) dstat details for a user's jobs in any status.

    Parameters
    ----------
    user : str, optional
        dsub user name to query. Defaults to the part of the ``OWNER_EMAIL``
        environment variable before the ``@``.
    job : str, optional
        dsub job id to restrict the listing to. When omitted, all of the
        user's jobs are listed.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the shell invocation; output is not captured.
    """
    project = os.getenv("GOOGLE_PROJECT")

    if user is None:
        user = os.getenv("OWNER_EMAIL").split('@')[0]

    # --status must always receive its '*' value; the job id is an additional
    # filter. (Previously a given job replaced the '*', leaving a bare
    # `--status --jobs <id>` on the command line.)
    job_filter = '' if job is None else f'--jobs {job} '

    cmd = (
        f"dstat --provider google-cls-v2 --project {project} --user {user} "
        f"--status '*' {job_filter}--full"
    )
    print(f"Running: {cmd}")
    return subprocess.run(cmd, shell=True, capture_output=False)

In [None]:
def cancel_running_jobs(user=None):
    """Cancel every dsub job belonging to ``user`` via ``ddel --jobs '*'``.

    Parameters
    ----------
    user : str, optional
        dsub user whose jobs are cancelled. Defaults to the part of the
        ``OWNER_EMAIL`` environment variable before the ``@``, matching the
        other helpers in this notebook (the user was previously hard-coded).

    Returns
    -------
    subprocess.CompletedProcess
        Result of the shell invocation; output is not captured.

    Notes
    -----
    No status filter is passed here; which job states ddel actually acts on
    is decided by ddel itself (presumably only jobs still running; confirm
    against the dsub documentation).
    """
    project = os.getenv("GOOGLE_PROJECT")

    if user is None:
        user = os.getenv("OWNER_EMAIL").split('@')[0]

    # '*' selects all of this user's jobs.
    cancel_cmd = f"ddel --provider google-cls-v2 --project {project} --users '{user}' --jobs '*'"
    print(f"Canceling running jobs: {cancel_cmd}")

    return subprocess.run(cancel_cmd, shell=True, capture_output=False)

In [None]:
def cancel_job(job_id, user=None):
    """Cancel one dsub job by id.

    Parameters
    ----------
    job_id : str
        dsub job id to pass to ``ddel --jobs``.
    user : str, optional
        dsub user the job was submitted under. Defaults to the part of the
        ``OWNER_EMAIL`` environment variable before the ``@``, which is the
        same value ``dsub_script`` passes as ``--user`` at submission time.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the shell invocation; output is not captured.
    """
    project = os.getenv("GOOGLE_PROJECT")

    if user is None:
        user = os.getenv("OWNER_EMAIL").split('@')[0]

    # Pass --users explicitly: without it ddel filters on its own default
    # user (the local OS user per the dsub docs), which need not match the
    # --user value the job was submitted with.
    cmd = f"ddel --provider google-cls-v2 --project {project} --users '{user}' --jobs {job_id}"
    print(f"Running: {cmd}")
    return subprocess.run(cmd, shell=True, capture_output=False)

# VCF export script and functions

In [None]:
%%writefile run_vcf_export.sh
#!/bin/bash
#
# Export one chromosome of a PLINK2 pgen fileset to a bgzipped VCF and index it.
#
# The shebang must be the first line of the written file, so nothing may
# appear between the %%writefile magic and it.
#
# Environment (supplied by dsub_script through --input/--output/--env):
#   INPUT_PGEN_PGEN  local path of the .pgen file; the .psam/.pvar files are
#                    expected next to it under the same basename
#                    (presumably how dsub localizes them; confirm)
#   OUTPUT_RESULTS   output path pattern ending in '*'
#   ANCESTRY, CHROM  labels; used only in log messages in this script
#
# Output: <prefix>.vcf.gz plus .tbi and .csi indexes.
#
# NOTE(review): ANCESTRY is only echoed. No sample filter (e.g. --keep) is
# applied, so every ancestry exports the same samples from the input pgen.
# Confirm this is intended.

# Stop at the first failing command or unset variable so a failed plink2
# run is not followed by indexing of a missing or partial VCF.
set -euo pipefail

INPUT_PGEN_BASE="${INPUT_PGEN_PGEN%.*}"  # Remove .pgen extension
echo "Derived INPUT_PGEN_BASE: $INPUT_PGEN_BASE"

# Number of CPUs this process is allowed to run on
nthread=$(python -c "import os; print(len(os.sched_getaffinity(0)))")
echo "Running with $nthread threads"

# Strip the trailing '*' from the dsub output pattern to get the file prefix
OUTPUT_PREFIX="${OUTPUT_RESULTS%\*}"

echo "Processing ancestry: $ANCESTRY, chromosome: $CHROM"
echo "Output prefix: $OUTPUT_PREFIX"

echo "Converting pgen to vcf"
plink2 \
    --pfile "$INPUT_PGEN_BASE" \
    --export vcf bgz \
    --threads "$nthread" \
    --out "$OUTPUT_PREFIX"

# Index the VCF (creates .tbi and .csi)
echo "Creating tabix index"
tabix -p vcf "${OUTPUT_PREFIX}.vcf.gz"
bcftools index -c "${OUTPUT_PREFIX}.vcf.gz"

echo "vcf output complete for $ANCESTRY"
echo "Files created with prefix: $OUTPUT_PREFIX"
ls -la "${OUTPUT_PREFIX}"*

In [None]:
def get_file_list(query_dir):
    """Return the lines printed by ``gsutil ls <query_dir>``.

    Parameters
    ----------
    query_dir : str
        Path or pattern handed to ``gsutil ls`` (interpolated into a shell
        command as-is).

    Returns
    -------
    list of str
        One entry per non-blank output line. The list is empty when gsutil
        prints nothing to stdout (for example when the listing fails);
        stderr is captured and discarded, and no exception is raised for a
        non-zero exit status.
    """
    listing = subprocess.run(
        f'gsutil ls {query_dir}',
        shell=True,
        capture_output=True
    )
    # splitlines() plus the blank filter avoids the spurious '' entry that
    # split('\n') produced for the trailing newline (and for empty output).
    return [
        line
        for line in listing.stdout.decode('utf-8').splitlines()
        if line.strip()
    ]

In [None]:
def dsub_script(
    machine_type,
    out_base,
    anc,
    chrom=1,
    boot_disk=100,
    disk_size=100,
    memory=12000,
    script='run_vcf_export.sh'
):
    """Submit one dsub job that runs ``script`` for a single ancestry/chromosome.

    Parameters
    ----------
    machine_type : str
        Value for dsub ``--machine-type``.
    out_base : str
        Output path prefix; ``*`` is appended to form the ``OUTPUT_RESULTS``
        pattern given to dsub ``--output``.
    anc : str
        Ancestry label, used in the job name and exported as ``ANCESTRY``.
    chrom : int or str, default 1
        Chromosome label, used in the job name, exported as ``CHROM`` and
        used to pick the ``acaf_threshold.chr{chrom}`` input files.
    boot_disk : int, default 100
        Value for dsub ``--boot-disk-size`` (GB per the dsub docs).
    disk_size : int, default 100
        Value for dsub ``--disk-size`` (GB per the dsub docs).
    memory : int, default 12000
        Currently unused; kept so existing callers keep working.
    script : str, default 'run_vcf_export.sh'
        Path of the script passed to dsub ``--script``.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the ``dsub`` invocation, so callers can inspect
        ``returncode``. Earlier versions returned ``None``.

    Raises
    ------
    KeyError
        If ``GOOGLE_PROJECT`` or ``WORKSPACE_BUCKET`` is not set.
    """
    # Look the environment up once instead of repeating it per argument.
    project = os.environ['GOOGLE_PROJECT']
    bucket = os.environ['WORKSPACE_BUCKET']
    dsub_user_name = os.getenv("OWNER_EMAIL").split('@')[0]

    job_name = f'{anc}_{chrom}_vcf_export'

    # Common prefix of the per-chromosome .pgen/.psam/.pvar input files
    input_pgen_base = f'gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/pgen/acaf_threshold.chr{chrom}'

    # Account the job runs as: whatever gcloud currently has configured
    service_account = subprocess.check_output(
        ['gcloud', 'config', 'get-value', 'account']
    ).decode().strip()

    # Build dsub command
    cmd = [
        'dsub',
        '--provider', 'google-cls-v2',
        '--machine-type', machine_type,
        '--disk-type', 'pd-ssd',
        '--boot-disk-size', str(boot_disk),
        '--disk-size', str(disk_size),
        '--user-project', project,
        '--project', project,
        '--image', 'us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.14',
        # Literal network names; presumably the workspace VPC (confirm)
        '--network', 'network',
        '--subnetwork', 'subnetwork',
        '--service-account', service_account,
        '--user', dsub_user_name,
        # The doubled braces leave {job-name} etc. as placeholders for dsub
        '--logging', f"{bucket}/dsub/logs/{{job-name}}/{{user-id}}/{{job-id}}-{{task-id}}-{{task-attempt}}.log",
        '--name', job_name,
        '--env', f'GOOGLE_PROJECT={project}',
        '--env', f'ANCESTRY={anc}',
        '--env', f'CHROM={chrom}',
        # Input files
        '--input', f'INPUT_PGEN_PGEN={input_pgen_base}.pgen',
        '--input', f'INPUT_PGEN_PSAM={input_pgen_base}.psam',
        '--input', f'INPUT_PGEN_PVAR={input_pgen_base}.pvar',
        # Output files
        '--output', f'OUTPUT_RESULTS={out_base}*',
        '--script', script
    ]

    return subprocess.run(cmd)

In [None]:
def run_vcf_export(
    my_bucket,
    ancestries,
    script='run_vcf_export.sh',
):
    """Submit a VCF-export dsub job for every ancestry/chromosome still missing.

    Iterates chromosomes 1-22, X and Y (outer loop) and the given ancestries
    (inner loop). A combination is skipped when the ancestry's output
    directory already lists a file whose name contains
    ``{anc}_genotypes_chr{chrom}.vcf.gz``; otherwise ``dsub_script`` is called.

    Parameters
    ----------
    my_bucket : str
        Bucket prefix; outputs go under ``{my_bucket}/data/stg009/{anc}``.
    ancestries : iterable of str
        Ancestry labels to process.
    script : str, default 'run_vcf_export.sh'
        Script path forwarded to ``dsub_script``.

    Notes
    -----
    Only files already present in the bucket are detected. Jobs that were
    submitted but have not yet written output are not, so calling this again
    while jobs are in flight submits them a second time.
    """
    # Materialize so a one-shot iterable survives the nested loops below.
    ancestries = list(ancestries)
    chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']

    # List each ancestry's output directory once up front; the listing does
    # not depend on the chromosome, so one gsutil call per ancestry suffices.
    out_dirs = {anc: f'{my_bucket}/data/stg009/{anc}' for anc in ancestries}
    existing_files = {anc: get_file_list(out_dir) for anc, out_dir in out_dirs.items()}

    # Process each chromosome for each ancestry
    for chrom in chromosomes:
        for anc in ancestries:
            out_dir = out_dirs[anc]

            # Substring match, so the .tbi/.csi index names also count as a hit
            vcf_name = f'{anc}_genotypes_chr{chrom}.vcf.gz'
            if any(vcf_name in f for f in existing_files[anc]):
                print(f"VCF files already exist for {anc} chr{chrom}")
                continue  # Skip this combination

            print(f"Starting serial vcf export for {anc} ancestry...")

            dsub_script(
                machine_type='c3-standard-8',
                out_base=f'{out_dir}/{anc}_genotypes_chr{chrom}',
                anc=anc,
                chrom=chrom,
                boot_disk=100,
                disk_size=100,
                script=script
            )

# Run

In [None]:
# Ancestry labels for the full run; run_vcf_export() uses each one as an
# output sub-directory and file-name prefix.
ancestries_considered = ['eur', 'afr', 'amr', 'eas', 'sas', 'mid']

In [None]:
# Smoke test: submit the export for a single ancestry ('mid') before the full run.
# NOTE(review): run_vcf_export() only skips combinations whose output already
# exists in the bucket, so running the full-run cell while these jobs are
# still in flight would submit 'mid' again.
run_vcf_export(my_bucket, ['mid'])

In [None]:
# Full run: every ancestry in ancestries_considered, all chromosomes.
run_vcf_export(my_bucket, ancestries_considered)

# Check dsub

In [None]:
# Summary listing (no --full) of the current user's jobs in any status.
check_dsub_status(full=False)

In [None]:
# Hard-coded id of one job to inspect; replace with the id of the job of interest.
job_id = 'mid-22-vcf--bwaxse--250618-175310-96'

In [None]:
# Full dstat details for the job selected in job_id.
job_details(job=job_id)

In [None]:
!gsutil cat {bucket or my_bucket}/dsub/logs/mid-22-vcf-export/bwaxse/mid-22-vcf--bwaxse--250618-150520-32-task-None.log

In [None]:
!gsutil du -h {bucket or my_bucket}/data/stg009/mid/