In [12]:
# Imports
import os
import shutil
import subprocess
from Bio import SeqIO

In [13]:
# Constants: input locations, Java classpath and the sample identifiers
DATA_DIR = "data"
JAR_DIR = "jar"

# Library jars shared by the Java tools, joined with ":" into one classpath string
JAR_LIBS = ":".join(
    [
        "lib/htsjdk-unspecified-SNAPSHOT.jar",
        "lib/picocli-4.1.2.jar",
        "lib/pal-1.5.1.jar",
        "lib/cache2k-all-1.0.2.Final.jar",
        "lib/commons-math3-3.6.1.jar",
    ]
)

# Reference FASTA, located under DATA_DIR
FASTA_REF = os.path.join(DATA_DIR, "fasta/HCMV_Merlin_UL33.fasta")

# Sample identifiers processed by the notebook, in processing order
SAMPLES = [
    "04.B1.W14.01",
    "04.M1.W09.02",
    "05.B1.W14.04",
    "05.M1.W08.03",
    "27.B1.W13.06",
    "27.M1.W10.07",
    "30.B1.W11.08",
    "30.M1.W04.09",
    "38.B1.W10.11",
    "38.M1.W03.10",
    "39.B1.W11.12",
    "39.M1.W03.13",
    "39.M1.W05.14",
    "53.B1.W14.17",
    "53.M1.W07.16",
    "56.B1.W09.22",
    "56.M1.W03.21",
    "63.B1.W09.29",
    "63.M1.W02.30",
    "66.B1.W09.25",
    "66.M1.W02.24",
]

In [14]:
# Function to move a file from src to dest (best effort)
def move_file(src, dest):
    """Move ``src`` to ``dest`` with ``shutil.move``.

    A ``FileNotFoundError`` is reported on stdout instead of being raised, so
    a missing file does not interrupt the caller. Any other exception
    propagates unchanged.

    Args:
        src: Path of the file to move.
        dest: Destination path.
    """
    try:
        shutil.move(src, dest)
    except FileNotFoundError as missing:
        print(f"Error moving or copying file from {src} to {dest}. Error: {missing}")

In [15]:
# Function to generate strandcount for the given sample
def generate_strandcount(sample):
    """Run MakeReadCount on a sample's BAM file and file away its outputs.

    Invokes ``makereadcount.MakeReadCount`` on ``<DATA_DIR>/<sample>/<sample>.bam``.
    Afterwards ``<sample>.log`` and ``<sample>.strandcount.csv`` are moved from
    the current working directory into the sample directory, and ``sample.txt``
    is written there containing the strand-count file name.

    Args:
        sample: Sample identifier; also the name of its directory under
            ``DATA_DIR``.

    Raises:
        subprocess.CalledProcessError: If the Java process exits non-zero.
        FileNotFoundError: If the ``java`` executable cannot be found.
    """
    sample_dir = os.path.join(DATA_DIR, sample)

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the tokens are the same ones the shell produced.
    command = [
        "java",
        "-cp", f"{JAR_LIBS}:{JAR_DIR}/MakeReadCount.jar",
        "makereadcount.MakeReadCount",
        f"{sample_dir}/{sample}.bam",
    ]
    # Output is still captured (and discarded) so the tool stays quiet;
    # check=True turns a non-zero exit status into an exception.
    subprocess.run(command, capture_output=True, text=True, check=True)

    # The tool's outputs are expected in the working directory; relocate them.
    move_file(f"{sample}.log", f"{sample_dir}/{sample}.log")
    move_file(f"{sample}.strandcount.csv", f"{sample_dir}/{sample}.strandcount.csv")

    # sample.txt holds the strand-count file name.
    with open(os.path.join(sample_dir, "sample.txt"), "w") as file:
        file.write(f"{sample}.strandcount.csv")

In [16]:
# Function to run HaROLD for the given sample and haplotypes
def run_harold(sample, haplotypes):
    """Run the HaROLD-2.0 jar for one sample.

    The count file is ``<DATA_DIR>/<sample>/sample.txt`` and the ``-p`` value
    is ``<DATA_DIR>/<sample>/<sample>``. All other options (heap size, alpha
    fraction, gamma cache, ``-H``, ``-L``, 12 threads, seed 1) are fixed.

    Args:
        sample: Sample identifier; also the name of its directory under
            ``DATA_DIR``.
        haplotypes: Value passed to ``--haplotypes``; converted with ``str``.

    Raises:
        subprocess.CalledProcessError: If the Java process exits non-zero.
        FileNotFoundError: If the ``java`` executable cannot be found.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; every element must already be a string.
    command = [
        "java", "-Xmx16384m",
        "-jar", f"{JAR_DIR}/Cluster_RG/dist/HaROLD-2.0.jar",
        "--count-file", f"{DATA_DIR}/{sample}/sample.txt",
        "--haplotypes", str(haplotypes),
        "--alpha-frac", "0.5",
        "--gamma-cache", "10000",
        "-H",
        "-L",
        "--threads", "12",
        "-p", f"{DATA_DIR}/{sample}/{sample}",
        "--seed", "1",
    ]
    # Output is captured (and discarded); check=True raises on failure.
    subprocess.run(command, capture_output=True, text=True, check=True)

In [17]:
# Function to refine HaROLD output for the given sample and output directory
def refine_harold_output(sample, output_directory):
    """Run RefineHaplotypes on the step-1 HaROLD results of one sample.

    Reads ``<output_directory>/step_1/<sample>.lld`` and
    ``<output_directory>/step_1/<sample>Haplo.fasta`` together with the
    sample's BAM file and the reference in ``FASTA_REF``. The ``-t`` value is
    ``<DATA_DIR>/<sample>/<sample>``.

    Args:
        sample: Sample identifier; also the name of its directory under
            ``DATA_DIR``.
        output_directory: Directory that contains the ``step_1`` folder.

    Raises:
        subprocess.CalledProcessError: If the Java process exits non-zero.
        FileNotFoundError: If the ``java`` executable cannot be found.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the tokens are the same ones the shell produced.
    command = [
        "java", "-Xmx16384m",
        "-cp", f"{JAR_LIBS}:{JAR_DIR}/RefineHaplotypes.jar",
        "refineHaplotypes.RefineHaplotypes",
        "-t", f"{DATA_DIR}/{sample}/{sample}",
        "--bam", f"{DATA_DIR}/{sample}/{sample}.bam",
        "--baseFreq", f"{output_directory}/step_1/{sample}.lld",
        "--refSequence", FASTA_REF,
        "--hapAlignment", f"{output_directory}/step_1/{sample}Haplo.fasta",
        "--iterate",
    ]
    # Output is captured (and discarded); check=True raises on failure.
    subprocess.run(command, capture_output=True, text=True, check=True)

In [25]:
# Function to execute the entire pipeline for a given sample
def run_pipeline(sample, haplotype_counts=range(1, 11)):
    """Run strand counting, then HaROLD and refinement per haplotype count.

    For each ``n`` in ``haplotype_counts`` the results go to
    ``<DATA_DIR>/<sample>/<n>/step_1`` (HaROLD output) and
    ``<DATA_DIR>/<sample>/<n>/step_2`` (refined output).

    Args:
        sample: Sample identifier; also the name of its directory under
            ``DATA_DIR``.
        haplotype_counts: Iterable of haplotype numbers to try. Defaults to
            1 through 10, the range the loop previously hard-coded.
            NOTE(review): the old comment said "2 to 5" - confirm the
            intended range and pass e.g. ``range(2, 6)`` if that is wanted.

    Raises:
        subprocess.CalledProcessError: If any external Java step fails.
    """
    # Generate strandcount for the sample (once, shared by all runs)
    generate_strandcount(sample)

    sample_dir = os.path.join(DATA_DIR, sample)

    for n in haplotype_counts:
        output_directory = os.path.join(sample_dir, f"{n}")
        step_1_dir = os.path.join(output_directory, "step_1")
        step_2_dir = os.path.join(output_directory, "step_2")

        # makedirs also creates output_directory; exist_ok makes re-runs safe
        for sub_dir in (step_1_dir, step_2_dir):
            os.makedirs(sub_dir, exist_ok=True)

        # Run HaROLD with the current haplotype count. Previously a literal 5
        # was passed, so every numbered directory held the same run.
        run_harold(sample, n)
        for ext in (".lld", ".log", "Haplo.fasta"):
            move_file(
                os.path.join(sample_dir, f"{sample}{ext}"),
                os.path.join(step_1_dir, f"{sample}{ext}"),
            )

        # Refine the step-1 output, then move the refined files to step_2
        refine_harold_output(sample, output_directory)
        for ext in (".log", ".fasta"):
            move_file(
                os.path.join(sample_dir, f"{sample}{ext}"),
                os.path.join(step_2_dir, f"{sample}{ext}"),
            )

In [None]:
# Run the pipeline for every sample in SAMPLES, in list order, printing a
# progress line before each one. Nothing here catches exceptions, so an error
# raised by run_pipeline stops the loop at that sample.
for sample in SAMPLES:
    print(f"Processing sample: {sample}")
    run_pipeline(sample)

Processing sample: 04.B1.W14.01
Processing sample: 04.M1.W09.02
Processing sample: 05.B1.W14.04
Processing sample: 05.M1.W08.03
Processing sample: 27.B1.W13.06
Processing sample: 27.M1.W10.07
Processing sample: 30.B1.W11.08
Processing sample: 30.M1.W04.09
Processing sample: 38.B1.W10.11
Processing sample: 38.M1.W03.10
Processing sample: 39.B1.W11.12
Processing sample: 39.M1.W03.13
Processing sample: 39.M1.W05.14
Processing sample: 53.B1.W14.17
Processing sample: 53.M1.W07.16
Processing sample: 56.B1.W09.22
Processing sample: 56.M1.W03.21
Processing sample: 63.B1.W09.29
Processing sample: 63.M1.W02.30
Processing sample: 66.B1.W09.25
