In [1]:
# Imports
import os
import shutil
import subprocess
from Bio import SeqIO

In [2]:
def generate_strandcount(sample):
    """Run MakeReadCount on a sample's BAM file and stage the results.

    The tool is invoked on ``data/<sample>/<sample>.bam``. Afterwards the
    ``<sample>.log`` and ``<sample>.strandcount.csv`` files, which are
    expected in the current working directory, are moved into
    ``data/<sample>/``, and ``data/<sample>/sample.txt`` is written with the
    name of the strand-count file.

    Parameters
    ----------
    sample : str
        Sample name; used both as the directory name under ``data/`` and as
        the file-name stem.

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status. The captured
        stdout/stderr are printed before the exception propagates.
    """
    sample_dir = f"data/{sample}"
    classpath = ":".join([
        "lib/htsjdk-unspecified-SNAPSHOT.jar",
        "lib/picocli-4.1.2.jar",
        "lib/pal-1.5.1.jar",
        "lib/cache2k-all-1.0.2.Final.jar",
        "lib/commons-math3-3.6.1.jar",
        "jar/MakeReadCount.jar",
    ])
    # An argument list (no shell) keeps the sample name a single argument even
    # if it contains spaces or characters the shell would interpret.
    command = [
        "java", "-cp", classpath,
        "makereadcount.MakeReadCount",
        f"{sample_dir}/{sample}.bam",
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as error:
        # check=True raises before anything is displayed; show what the tool
        # reported so the failure can be diagnosed from the notebook output.
        if error.stdout:
            print(error.stdout)
        if error.stderr:
            print(error.stderr)
        raise
    # Display the output
    print(result.stdout)
    # Move the files the tool left in the working directory next to the BAM
    for suffix in (".log", ".strandcount.csv"):
        shutil.move(sample + suffix, f"{sample_dir}/{sample}{suffix}")
    # Generate sample.txt, which names the strand-count file for this sample
    with open(f"{sample_dir}/sample.txt", "w") as file:
        file.write(sample + ".strandcount.csv")

In [3]:
def run_harold(sample):
    """Run HaROLD on a sample's count file and normalise the FASTA file name.

    HaROLD is started with ``data/<sample>/sample.txt`` as its count file and
    ``data/<sample>/<sample>`` as the output prefix (``-p``). The resulting
    ``data/<sample>/<sample>Haplo.fasta`` is then renamed to
    ``data/<sample>/<sample>.Haplo.fasta``.

    Parameters
    ----------
    sample : str
        Sample name; used both as the directory name under ``data/`` and as
        the file-name stem.

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status. The captured
        stdout/stderr are printed before the exception propagates.
    OSError
        If the expected ``<sample>Haplo.fasta`` file cannot be renamed
        (for example because it was not produced).
    """
    sample_dir = f"data/{sample}"
    prefix = f"{sample_dir}/{sample}"
    # An argument list (no shell) keeps the sample name a single argument even
    # if it contains spaces or characters the shell would interpret.
    command = [
        "java", "-Xmx16384m",
        "-jar", "jar/Cluster_RG/dist/HaROLD-2.0.jar",
        "--count-file", f"{sample_dir}/sample.txt",
        "--haplotypes", "5",
        "--alpha-frac", "0.5",
        "--gamma-cache", "10000",
        "-H", "-L",
        "--threads", "12",
        "-p", prefix,
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as error:
        # check=True raises before anything is displayed; show what the tool
        # reported so the failure can be diagnosed from the notebook output.
        if error.stdout:
            print(error.stdout)
        if error.stderr:
            print(error.stderr)
        raise
    # Display the output
    print(result.stdout)
    # Rename the fasta file. os.replace overwrites an existing destination on
    # every platform, so re-running the notebook does not fail on a leftover
    # file from a previous run.
    os.replace(prefix + "Haplo.fasta", prefix + ".Haplo.fasta")

In [4]:
def refine_harold_ouput(sample):
    """Run RefineHaplotypes on the HaROLD results of a sample.

    The tool is given the sample's BAM file, the ``.lld`` base-frequency file
    and the ``.Haplo.fasta`` haplotype alignment from ``data/<sample>/``,
    together with the fixed reference ``data/fasta/HCMV_Merlin_UL33.fasta``,
    and is run with ``--iterate``. Its stdout is printed.

    Parameters
    ----------
    sample : str
        Sample name; used both as the directory name under ``data/`` and as
        the file-name stem.

    Raises
    ------
    subprocess.CalledProcessError
        If the Java process exits with a non-zero status. The captured
        stdout/stderr are printed before the exception propagates.
    """
    prefix = f"data/{sample}/{sample}"
    classpath = ":".join([
        "lib/htsjdk-unspecified-SNAPSHOT.jar",
        "lib/picocli-4.1.2.jar",
        "lib/pal-1.5.1.jar",
        "lib/commons-math3-3.6.1.jar",
        "lib/cache2k-all-1.0.2.Final.jar",
        "lib/flanagan.jar",
        "jar/RefineHaplotypes.jar",
    ])
    # An argument list (no shell) keeps the sample name a single argument even
    # if it contains spaces or characters the shell would interpret.
    command = [
        "java", "-Xmx16384m", "-cp", classpath,
        "refineHaplotypes.RefineHaplotypes",
        "-t", prefix,
        "--bam", prefix + ".bam",
        "--baseFreq", prefix + ".lld",
        "--refSequence", "data/fasta/HCMV_Merlin_UL33.fasta",
        "--hapAlignment", prefix + ".Haplo.fasta",
        "--iterate",
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as error:
        # check=True raises before anything is displayed; show what the tool
        # reported so the failure can be diagnosed from the notebook output.
        if error.stdout:
            print(error.stdout)
        if error.stderr:
            print(error.stderr)
        raise
    # Display the output
    print(result.stdout)

In [5]:
# Samples to process; each row holds the names sharing a two-digit prefix
samples = [
    "04.B1.W14.01", "04.M1.W09.02",
    "05.B1.W14.04", "05.M1.W08.03",
    "27.B1.W13.06", "27.M1.W10.07",
    "30.B1.W11.08", "30.M1.W04.09",
    "38.B1.W10.11", "38.M1.W03.10",
    "39.B1.W11.12", "39.M1.W03.13", "39.M1.W05.14",
    "53.B1.W14.17", "53.M1.W07.16",
    "56.B1.W09.22", "56.M1.W03.21",
    "63.B1.W09.29", "63.M1.W02.30",
    "66.B1.W09.25", "66.M1.W02.24",
]

# Per-sample stages in execution order: strand counts first, then HaROLD
# (which reads the count file), then refinement of HaROLD's haplotypes.
pipeline_steps = (generate_strandcount, run_harold, refine_harold_ouput)

# Process the samples one at a time, announcing each before its stages run
for sample in samples:
    print("Sample:", sample)
    for step in pipeline_steps:
        step(sample)