In [63]:
import csv
import os
from bisect import bisect_left, bisect_right

import numpy as np
import pandas as pd
import yaml
from Bio import SeqIO

# Load config: config['samples'] maps each sample name to the path of its FASTA file.
with open("config/TB_reps.yaml", 'r') as file:
    config = yaml.safe_load(file)
gpscs = config['samples']

# Maximum distance (in positions) between a forward and a reverse binding site
# for the pair to be reported as an amplicon.
xlen = 2200
amplicon_stats = []


def binding_sites_by_ref(depth_df):
    """Group binding-site positions by reference sequence.

    Args:
        depth_df: DataFrame with the columns "Ref", "Pos" and "Depth".

    Returns:
        Dict mapping each "Ref" value to the ascending list of "Pos" values
        whose "Depth" is exactly 1. References keep their order of first
        appearance in the depth table.
    """
    hits = depth_df[depth_df["Depth"] == 1]
    return {
        ref: group["Pos"].sort_values().tolist()
        for ref, group in hits.groupby("Ref", sort=False)
    }


def find_amplicons(fwd_sites, rev_sites, max_len):
    """Pair forward and reverse binding sites on ONE reference sequence.

    Args:
        fwd_sites: ascending list of forward binding positions.
        rev_sites: ascending list of reverse binding positions.
        max_len: maximum allowed distance between the two sites of a pair.

    Returns:
        List of (fwd_pos, rev_pos, direction) tuples: first every 'forward'
        amplicon (each forward site with its nearest downstream reverse site),
        then every 'reverse' amplicon (each reverse site with its nearest
        upstream forward site).
    """
    found = []

    # Both lists are sorted, so the nearest partner is located by bisection
    # instead of rescanning the whole partner list for every site.
    for fwd_pos in fwd_sites:
        idx = bisect_right(rev_sites, fwd_pos)  # first reverse site > fwd_pos
        if idx < len(rev_sites) and rev_sites[idx] <= fwd_pos + max_len:
            found.append((fwd_pos, rev_sites[idx], 'forward'))

    for rev_pos in rev_sites:
        idx = bisect_left(fwd_sites, rev_pos) - 1  # last forward site < rev_pos
        if idx >= 0 and fwd_sites[idx] >= rev_pos - max_len:
            found.append((fwd_sites[idx], rev_pos, 'reverse'))

    return found


def covered_length(intervals):
    """Count the distinct positions covered by inclusive (start, end) intervals."""
    total = 0
    covered_until = None
    for start, end in sorted(intervals):
        # Skip the part of this interval that earlier intervals already counted.
        if covered_until is not None and start <= covered_until:
            start = covered_until + 1
        if end >= start:
            total += end - start + 1
            covered_until = end
    return total


with open('output.txt', 'w') as f:
    for gpsc, fasta_file in gpscs.items():
        # Total length over every record in the FASTA file.
        genome_length = sum(len(record.seq) for record in SeqIO.parse(fasta_file, "fasta"))

        print(f"Processing {gpsc}...", file=f)

        # Load depth files
        fwd_depth_file = os.path.join("samtools_depth_indiv_primers", f"{gpsc}_fwd.depth")
        rev_depth_file = os.path.join("samtools_depth_indiv_primers", f"{gpsc}_rev.depth")
        fwd_df = pd.read_csv(fwd_depth_file, sep="\t", names=["Ref", "Pos", "Depth"])
        rev_df = pd.read_csv(rev_depth_file, sep="\t", names=["Ref", "Pos", "Depth"])

        # Get binding sites, kept separate per reference sequence: positions are
        # only comparable within the same "Ref", so sites on different
        # references must never be paired or merged into the same coverage.
        fwd_sites = binding_sites_by_ref(fwd_df)
        rev_sites = binding_sites_by_ref(rev_df)

        covered_bases = 0
        for ref, ref_fwd_sites in fwd_sites.items():
            # A reference without any reverse site cannot yield an amplicon.
            ref_rev_sites = rev_sites.get(ref, [])
            amplicons = find_amplicons(ref_fwd_sites, ref_rev_sites, xlen)

            for fwd_pos, rev_pos, direction in amplicons:
                amplicon_stats.append((gpsc, fwd_pos, rev_pos, direction))
                print(f"{direction.capitalize()} amplicon: {fwd_pos}-{rev_pos}", file=f)

            covered_bases += covered_length((start, end) for start, end, _ in amplicons)

        # Guard against an empty FASTA file (total length 0).
        coverage_percentage = (covered_bases / genome_length) * 100 if genome_length else 0.0
        print(f"Coverage: {coverage_percentage:.2f}%", file=f)

# Save amplicon stats
with open('amplicon_positions.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["Sequence", "Start", "End", "Direction"])
    csvwriter.writerows(amplicon_stats)

In [None]:
# ---------------------------------------------- Separate forward and reverse amplicon predictions ----------------------------------------------

In [64]:
import pandas as pd

# Export the predicted genome coverage of each sequence (derived from the
# predicted amplicon coverage) to a CSV file. Wrapping the mapping in a list
# yields a single-row table with one column per key.
# NOTE(review): `total_genome_coverages` is not assigned in any of the cells
# visible here; presumably an earlier cell defines it -- confirm before Run All.
coverage_rows = [total_genome_coverages]
total_genome_coverages_df = pd.DataFrame(coverage_rows)

total_genome_coverages_df.to_csv('genome_coverage_pc.csv', index=False)

In [65]:
# Print one "<name>: <coverage>%" line per entry of the coverage mapping.
for name in total_genome_coverages:
    print(f"{name}: {total_genome_coverages[name]}%")

H37Rv: 93.64311536219164%
K.pneumoniae: 0.12555861540967705%
M.canetti: 87.48249829777191%
M.intracellulare: 2.988559533333506%
M.kansasii: 3.6939795969607654%
M.leprae: 2.255239225982645%
M.bovis: 92.60376780728954%
PA01: 0.5210391922360053%
S.odontolytica: 0.38443199063779226%
H.influenzae: 0.0%
M.abscessus: 0.9090869621161469%
M.fortuitum: 1.1757594981761055%
OXC141: 0.16270085381126995%
S.aureus: 0.0%


In [66]:
from Bio import SeqIO

# Read the YAML config; its 'samples' entry maps each name to a FASTA path.
with open("config/TB_reps.yaml", 'r') as config_handle:
    config = yaml.safe_load(config_handle)

gpscs = config['samples']

# Total length of each FASTA file, summed over all of its records.
genome_lengths = {}
for gpsc, fasta_file in gpscs.items():
    genome_lengths[gpsc] = sum(
        len(record.seq) for record in SeqIO.parse(fasta_file, "fasta")
    )

# Report the lengths, one line per name.
for gpsc in genome_lengths:
    print(f"{gpsc}: {genome_lengths[gpsc]} bp")

H37Rv: 4411532 bp
K.pneumoniae: 5438894 bp
M.canetti: 4432426 bp
M.intracellulare: 5402402 bp
M.kansasii: 6432277 bp
M.leprae: 3187112 bp
M.bovis: 4349904 bp
PA01: 6264404 bp
S.odontolytica: 2455831 bp
H.influenzae: 1830138 bp
M.abscessus: 5067172 bp
M.fortuitum: 6406072 bp
OXC141: 2036867 bp
S.aureus: 2872762 bp


In [67]:
import csv
import yaml
from Bio import SeqIO

# Read the YAML config; its 'samples' entry maps each name to a FASTA path.
with open("config/TB_reps.yaml", 'r') as config_handle:
    config = yaml.safe_load(config_handle)

gpscs = config['samples']

# Write one header-less row per entry: name, start (always 0), and the total
# length summed over every record of that entry's FASTA file.
with open('assembly_lengths.csv', mode='w', newline='') as csv_file:
    csv.writer(csv_file).writerows(
        [gpsc, 0, sum(len(record.seq) for record in SeqIO.parse(fasta_file, "fasta"))]
        for gpsc, fasta_file in gpscs.items()
    )