In [3]:
# Imports
import os
import subprocess
from Bio import SeqIO

In [6]:
# Build one consensus FASTA per sample with bcftools, then combine them.
#
# For every sample the BAM file is expected at data/<sample>/<sample>.bam.
# Intermediate BCF/VCF files are written next to the BAM; the per-sample
# consensus FASTA is moved into `output_directory`, and all consensus records
# are finally written together to <output_directory>/all_consensus.fasta.

# List of sample identifiers to process
samples = ["04.B1.W14.01", "04.M1.W09.02",
           "05.B1.W14.04", "05.M1.W08.03",
           "27.B1.W13.06", "27.M1.W10.07",
           "30.B1.W11.08", "30.M1.W04.09",
           "38.B1.W10.11", "38.M1.W03.10",
           "39.B1.W11.12", "39.M1.W03.13", "39.M1.W05.14",
           "53.B1.W14.17", "53.M1.W07.16",
           "56.B1.W09.22", "56.M1.W03.21",
           "63.B1.W09.29", "63.M1.W02.30",
           "66.B1.W09.25", "66.M1.W02.24"]

reference_fasta = "data/fasta/reference/HCMV_Merlin_UL33.fasta"
output_directory = "data/fasta/consensus/"

# Create the destination directory up front: without it the os.rename() in the
# loop below fails only after the variant-calling steps have already been run.
os.makedirs(output_directory, exist_ok=True)

# Index the reference FASTA file before it is used by the bcftools steps below
subprocess.run(["samtools", "faidx", reference_fasta], check=True)

# Initialize a list to store consensus sequences
all_consensus_records = []

# Process each sample
for sample in samples:
    print(f"{sample} in process") 
    initial_path = f"data/{sample}/"
    bam_file = f"{initial_path}{sample}.bam"
    bcf_file = f"{initial_path}{sample}.bcf"
    vcf_file = f"{initial_path}{sample}.vcf"
    # initial_path already ends with "/", so no extra separator is needed here
    consensus_file = f"{initial_path}{sample}_consensus.fasta"
    
    # A missing BAM file is reported and the sample skipped, so that one
    # absent input does not abort the whole batch.
    if not os.path.exists(bam_file):
        print(f"Error: BAM file for {sample} does not exist.")
        continue

    try:
        # Generate pileup from BAM file using the reference FASTA
        subprocess.run(["bcftools", "mpileup", "-f", reference_fasta, bam_file, "--max-depth", "500", "-Ou", "-o", bcf_file], check=True)
        # Call variants from the pileup data
        subprocess.run(["bcftools", "call", "-mv", "--ploidy", "1", "-Ov", "-o", vcf_file, bcf_file], check=True)
        # Compress and index the VCF for use in consensus
        subprocess.run(["bgzip", "-f", vcf_file], check=True)
        subprocess.run(["bcftools", "index", f"{vcf_file}.gz"], check=True)
        # Generate a consensus sequence from the VCF
        subprocess.run(["bcftools", "consensus", "-f", reference_fasta, "-o", consensus_file, f"{vcf_file}.gz"], check=True)
    except subprocess.CalledProcessError as e:
        # Any failing external command skips this sample; the batch continues.
        print(f"Error processing {sample}: {e}")
        continue

    # Load consensus sequences into memory, renaming each record after its
    # sample so the records stay distinguishable in the combined FASTA file.
    consensus_records = list(SeqIO.parse(consensus_file, "fasta"))
    for record in consensus_records:
        record.id = f"{sample}_consensus"
        record.description = f"{sample}_consensus"
        all_consensus_records.append(record)

    # Move each individual consensus file to the specified directory
    new_consensus_file_path = os.path.join(output_directory, f"{sample}_consensus.fasta")
    os.rename(consensus_file, new_consensus_file_path)
    print(f"Consensus sequence file for {sample} moved to {output_directory}")

# Write all consensus sequences to a single FASTA file in the output directory.
# Kept as the final expression so the notebook displays the number of records
# written.
SeqIO.write(all_consensus_records, os.path.join(output_directory, "all_consensus.fasta"), "fasta")


04.B1.W14.01 in process


[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500
Note: the --sample option not given, applying all records regardless of the genotype
Applied 175 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 04.B1.W14.01 moved to data/fasta/consensus/
04.M1.W09.02 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 126 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 04.M1.W09.02 moved to data/fasta/consensus/
05.B1.W14.04 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 11 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 05.B1.W14.04 moved to data/fasta/consensus/
05.M1.W08.03 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 11 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 05.M1.W08.03 moved to data/fasta/consensus/
27.B1.W13.06 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 152 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 27.B1.W13.06 moved to data/fasta/consensus/
27.M1.W10.07 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 134 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 27.M1.W10.07 moved to data/fasta/consensus/
30.B1.W11.08 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 47 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 30.B1.W11.08 moved to data/fasta/consensus/
30.M1.W04.09 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 47 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 30.M1.W04.09 moved to data/fasta/consensus/
38.B1.W10.11 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 159 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 38.B1.W10.11 moved to data/fasta/consensus/
38.M1.W03.10 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 159 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 38.M1.W03.10 moved to data/fasta/consensus/
39.B1.W11.12 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 145 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 39.B1.W11.12 moved to data/fasta/consensus/
39.M1.W03.13 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 81 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 39.M1.W03.13 moved to data/fasta/consensus/
39.M1.W05.14 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 30 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 39.M1.W05.14 moved to data/fasta/consensus/
53.B1.W14.17 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 151 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 53.B1.W14.17 moved to data/fasta/consensus/
53.M1.W07.16 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 156 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 53.M1.W07.16 moved to data/fasta/consensus/
56.B1.W09.22 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 168 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 56.B1.W09.22 moved to data/fasta/consensus/
56.M1.W03.21 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 143 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 56.M1.W03.21 moved to data/fasta/consensus/
63.B1.W09.29 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 2 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 63.B1.W09.29 moved to data/fasta/consensus/
63.M1.W02.30 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 2 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 63.M1.W02.30 moved to data/fasta/consensus/
66.B1.W09.25 in process


Note: the --sample option not given, applying all records regardless of the genotype
Applied 154 variants
[mpileup] 1 samples in 1 input files
[mpileup] maximum number of reads per input file set to -d 500


Consensus sequence file for 66.B1.W09.25 moved to data/fasta/consensus/
66.M1.W02.24 in process
Consensus sequence file for 66.M1.W02.24 moved to data/fasta/consensus/


Note: the --sample option not given, applying all records regardless of the genotype
Applied 157 variants


21