In [None]:
## Package Import
import sys
import os 
import numpy as np
import pandas as pd
from datetime import datetime
import glob

## The shell script and R script were generated in the testing script under the tcr-quant-replication workspace and copied here for the new workspace.

In [None]:
## Notebook display setting: set the pandas 'display.max_colwidth' option to 0
pd.set_option("display.max_colwidth", 0)

In [None]:
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env USER_NAME={USER_NAME}

In [None]:
## MODIFY FOR FULL DATA RUN
# Use hyphens, not whitespace since it will become part of the bucket path.
##TODO: ADD JOB NAME
JOB_NAME=''

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}

In [None]:
## Analysis Results Folder
# Layout: <WORKSPACE_BUCKET>/dsub/results/<JOB_NAME>/<USER_NAME>/<YYYYMMDD>,
# where the last component is today's date.
line_count_results_folder = os.path.join(
    *(
        os.getenv('WORKSPACE_BUCKET'),
        'dsub', 'results',
        JOB_NAME, USER_NAME,
        f'{datetime.now():%Y%m%d}',
    )
)

line_count_results_folder

In [None]:
## Destination of the output files: the "results" folder inside the analysis results folder
output_files = os.path.join(
    line_count_results_folder,
    "results",
)
print(output_files)

In [None]:
OUTPUT_FILES = output_files


# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

## R script for TcellExTRECT

In [None]:
%%writefile ~/tcell_extrect.R
#!/usr/bin/env Rscript
## Load one coverage file, run TcellExTRECT on it and write the result as a
## tab-separated table.
##
## Positional arguments (read after --args):
##   1. coverage_file - path of the coverage file passed to loadCov()
##   2. outfile       - path of the table to write
##
## Example: R < tcell_extrect.R --vanilla --args <coverage_file> <outfile>
##
## The shebang must be the first line of the written file to have any effect,
## and it carries no extra options because Linux passes everything after the
## interpreter as a single argument.

## Loading library
library(TcellExTRECT)

## Read the arguments once and stop early with a usage message when one is
## missing, instead of passing NA on to loadCov()/write.table().
script_args <- commandArgs(trailingOnly=TRUE)
if (length(script_args) < 2) {
    stop("Usage: tcell_extrect.R <coverage_file> <outfile>")
}
coverage_file <- script_args[1]
outfile <- script_args[2]

loaded_cov_file <- loadCov(coverage_file)

## NOTE(review): TCRA_exons_hg38 and tcra_seg_hg38 are not defined in this
## script; presumably they are data objects shipped with TcellExTRECT - confirm.
TCRA.out <- runTcellExTRECT(loaded_cov_file, TCRA_exons_hg38, tcra_seg_hg38, 'hg38')

write.table(TCRA.out, file=outfile, sep='\t')

### Shell Script for TcellExTRECT

In [None]:
%%writefile ~/tcell_extrect.sh

# Run the TcellExTRECT R script on every file matched by INPUT_FILES and move
# each result into OUTPUT_PATH.
#
# Variables read from the environment (the dsub submission passes them with
# --input / --output-recursive):
#   INPUT_FILES  - path or glob pattern of the input file(s)
#   tca_rscript  - path of the R script to run
#   OUTPUT_PATH  - directory that receives the <sample_id>_fraction.txt files

set -o pipefail
set -o errexit

# Stop with a clear message when a required variable is unset or empty.
: "${INPUT_FILES:?INPUT_FILES must be set}"
: "${tca_rscript:?tca_rscript must be set}"
: "${OUTPUT_PATH:?OUTPUT_PATH must be set}"

# ---------Required Inputs---------

# Given a .txt file - get X samples.
# For parallel submissions:
# - Use a different .txt file per submission.
# - Each .txt file can contain a different number of lines
INPUT_FILES_PATH="$(dirname "${INPUT_FILES}")"
INPUT_FILES_PATTERN="$(basename "${INPUT_FILES}")"
echo "INPUT_FILES_PATH: ${INPUT_FILES_PATH}"
echo "INPUT_FILES_PATTERN: ${INPUT_FILES_PATTERN}"

# Let the shell expand the pattern straight into the array instead of parsing
# `ls` output: matched names containing whitespace stay intact, and a pattern
# without matches is detected below (the exit status of `ls` inside
# `readonly x=( $(ls ...) )` would be ignored by errexit).
# The pattern itself is intentionally unquoted so that it is glob-expanded.
shopt -s nullglob
tcr_results=( "${INPUT_FILES_PATH}"/${INPUT_FILES_PATTERN} )
shopt -u nullglob
readonly tcr_results

if (( ${#tcr_results[@]} == 0 )); then
    echo "ERROR: no input files match ${INPUT_FILES}" >&2
    exit 1
fi
# [*] prints every element; a bare ${tcr_results} would show only the first one.
echo "tcr_results: ${tcr_results[*]}"

# ---------Required Output---------
# Make sure the destination is a directory so that `mv` below never renames a
# result file to OUTPUT_PATH itself.
mkdir -p "${OUTPUT_PATH}"

for (( i=0; i<${#tcr_results[@]}; i++ ));
do
    export tcr_results_txt="${tcr_results[i]}"
    export tcr_results_txt_name="${tcr_results_txt##*/}"  # file_name.txt
    export txt_input="${tcr_results_txt}"
    # Sample id: the file name up to (not including) its first '.'
    sample_id="${tcr_results_txt_name%%.*}"

    # Outputs
    export tcr_fraction_txt="${sample_id}_fraction.txt"

    # ----------------------------------WORKFLOW----------------------------------
    ## Run R-Script for T-Cell Extrect
    R < "${tca_rscript}" --vanilla --args "${txt_input}" "${tcr_fraction_txt}"
    echo "tcr_fraction_txt: ${tcr_fraction_txt}"

    # Disk space
    echo "Disk space taken up so far:"
    du -d 1 -h
    # i is zero-based, so i + 1 inputs have been processed at this point.
    echo "$(( i + 1 )) run(s) finished."

    # Move results to output directory
    mv -- "${tcr_fraction_txt}" "${OUTPUT_PATH}/"
done

### All of Us dsub Command

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Submit one dsub job per entry of the input list, for list indices
# LOWER (inclusive) to UPPER (exclusive). Standard output of this cell is
# captured into the Python variable LINE_COUNT_JOB_ID; error messages go to
# standard error.

# Get a shorter username to leave more characters for the job name.
DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

# For AoU RWB projects network name is "network".
AOU_NETWORK=network
AOU_SUBNETWORK=subnetwork

# ---------Settings to fill in---------
# These are kept in variables because a comment placed after a trailing
# backslash ("\ ## TODO ...") ends the dsub command on that line instead of
# continuing it.
##TODO: local text file listing the input file paths, one per line
INPUT_LIST_FILE=""
##TODO: update bash script pathway
BASH_SCRIPT=""
##TODO: add pathway to R script
RSCRIPT_PATH=""
##TODO: add logging pathway
LOGGING_PATH=""

LOWER=2830
UPPER=2831
# NOTE(review): DATE is not referenced anywhere in this cell - presumably meant
# for the logging/output paths; confirm or remove.
DATE=20230424
MACHINE_TYPE="n2-standard-4"

# Refuse to submit anything while a TODO above is still empty.
for setting in INPUT_LIST_FILE BASH_SCRIPT RSCRIPT_PATH LOGGING_PATH; do
    if [[ -z "${!setting}" ]]; then
        echo "ERROR: ${setting} is empty; fill in the TODO above." >&2
        exit 1
    fi
done

if [[ ! -r "${INPUT_LIST_FILE}" ]]; then
    echo "ERROR: cannot read input list ${INPUT_LIST_FILE}" >&2
    exit 1
fi

# Get all tcr_files
# -r keeps backslashes literal; the `|| [[ -n ... ]]` part keeps a last line
# that has no trailing newline; blank lines are skipped.
bashArray=()
while read -r line || [[ -n "${line}" ]]; do
    if [[ -n "${line}" ]]; then
        bashArray+=("${line}")
    fi
done < "${INPUT_LIST_FILE}"

# Length of entire array
len_bashArray=${#bashArray[@]}

# Every index in LOWER..UPPER-1 must exist, otherwise dsub would be called
# with an empty INPUT_FILES value.
if (( LOWER < 0 || UPPER > len_bashArray )); then
    echo "ERROR: batch range [${LOWER}, ${UPPER}) is outside the ${len_bashArray} entries read from ${INPUT_LIST_FILE}." >&2
    exit 1
fi

# The active gcloud account does not change between submissions; look it up once.
SERVICE_ACCOUNT="$(gcloud config get-value account)"

for ((batch=LOWER; batch<UPPER; batch+=1))
do
# "$@" forwards the cell's positional parameters, if there are any.
dsub \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --network "${AOU_NETWORK}" \
    --subnetwork "${AOU_SUBNETWORK}" \
    --service-account "${SERVICE_ACCOUNT}" \
    --user "${DSUB_USER_NAME}" \
    --regions us-central1 \
    --logging "${LOGGING_PATH}" \
    "$@" \
    --preemptible \
    --boot-disk-size 100 \
    --machine-type "${MACHINE_TYPE}" \
    --disk-size 100 \
    --name "${JOB_NAME}" \
    --script "${BASH_SCRIPT}" \
    --image 'gcr.io/bick-aps2/briansha/tcellextrect:latest' \
    --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
    --input tca_rscript="${RSCRIPT_PATH}" \
    --input INPUT_FILES="${bashArray[batch]}" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${batch}"
done