In [3]:
import random


def generate_intervals(chrom_length, min_interval=20, max_interval=100, gap_max=50):
    """Generate random, sorted, non-overlapping intervals along a chromosome.

    Intervals are laid out left to right starting at position 0. Each one has
    a random length, is clipped to ``chrom_length``, and is followed by a
    random gap of at least 1 before the next interval begins. The final
    interval is stretched so that it always ends exactly at ``chrom_length``.

    Args:
        chrom_length: Total length to cover. A value <= 0 yields no intervals.
        min_interval: Smallest interval length drawn (inclusive).
        max_interval: Largest interval length drawn (inclusive).
        gap_max: Largest gap drawn between consecutive intervals (inclusive);
            the smallest gap is always 1.

    Returns:
        A list of ``(start, end)`` tuples in increasing order; an empty list
        when ``chrom_length`` is not positive.

    Raises:
        ValueError: Propagated from ``random.randint`` when
            ``min_interval > max_interval`` or ``gap_max < 1``.
    """
    intervals = []
    start = 0

    while start < chrom_length:
        # Draw a length within the requested bounds, then clip the interval so
        # it never runs past the end of the chromosome.
        interval_length = random.randint(min_interval, max_interval)
        end = min(start + interval_length, chrom_length)

        intervals.append((start, end))

        # The next interval begins after a random gap of at least one position.
        start = end + random.randint(1, gap_max)

    # The last gap may have jumped past the chromosome end, leaving the final
    # interval short; stretch it to the end. The emptiness check matters when
    # chrom_length <= 0: the loop never runs and intervals[-1] would raise
    # IndexError.
    if intervals and intervals[-1][1] < chrom_length:
        intervals[-1] = (intervals[-1][0], chrom_length)

    return intervals


def assign_values(intervals, max_value):
    """Attach a random integer value to every interval.

    Args:
        intervals: Iterable of ``(start, end)`` pairs.
        max_value: Upper bound (inclusive) of the random value; the lower
            bound is 0.

    Returns:
        A list of ``(start, end, value)`` tuples in the same order as the
        input, with one ``random.randint`` draw per interval.
    """
    valued = []
    for start, end in intervals:
        valued.append((start, end, random.randint(0, max_value)))
    return valued


def write_bedgraph(chrom, intervals_with_values, output_file):
    """Write intervals and their values to a tab-separated BEDGraph file.

    Args:
        chrom: Chromosome name written in the first column of every line.
        intervals_with_values: Iterable of ``(start, end, value)`` triples.
        output_file: Path of the file to create or overwrite.

    Each triple becomes one line: ``chrom<TAB>start<TAB>end<TAB>value``.
    """
    with open(output_file, "w") as handle:
        # Lines are produced lazily while the file is open, one per record.
        handle.writelines(
            f"{chrom}\t{start}\t{end}\t{value}\n"
            for start, end, value in intervals_with_values
        )


# Parameters for the randomly generated BEDGraph track.
# max_value is the inclusive upper bound of the random per-interval value
# (values are drawn from 0..max_value by assign_values).
# The "diploid/" directory must already exist: open(..., "w") does not create it.
chrom = "chr1"
chrom_length = 1000
max_value = 4
output_file = "diploid/pop1.bedgraph"

# Generate intervals, assign values, and write to BEDGraph file.
# NOTE(review): no random seed is set in this cell, so every run writes a
# different file — confirm whether reproducible output is needed.
intervals = generate_intervals(chrom_length)
intervals_with_values = assign_values(intervals, max_value)
write_bedgraph(chrom, intervals_with_values, output_file)

print(f"BEDGraph file '{output_file}' generated.")


BEDGraph file 'diploid/pop1.bedgraph' generated.


In [None]:
import pandas as pd


def compute_truth_from_combined_file(combined_file_path, ploidy, output_truth_path):
    """Compute expected within- and between-population comparison counts.

    Reads a tab-separated file with a header row containing the columns
    ``Chr``, ``Start``, ``End``, ``pop0`` and ``pop1``. For every row the
    number of haplotypes per population is ``ploidy * popN``, and the
    results are scaled by the interval length ``End - Start``:

    * ``within_popN`` = C(haps, 2) * length (0 when haps <= 1)
    * ``dxy``         = haps_pop0 * haps_pop1 * length

    The computation is done column-wise rather than with ``iterrows()``.
    ``iterrows()`` coerces each row to a single dtype, so a numeric ``Chr``
    column together with any float column would turn the integer counts into
    floats in the output file.

    Args:
        combined_file_path: Path of the tab-separated input file.
        ploidy: Number of haplotypes per individual.
        output_truth_path: Path of the tab-separated file to write, with the
            columns ``chrom, start, end, within_pop0, within_pop1, dxy``.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    df = pd.read_csv(combined_file_path, sep="\t")

    interval_len = df["End"] - df["Start"]
    haps_pop0 = ploidy * df["pop0"]
    haps_pop1 = ploidy * df["pop1"]

    def _pairs_within(haps):
        # Unordered haplotype pairs, C(haps, 2). The mask keeps the result at
        # 0 wherever fewer than two haplotypes are available.
        return (haps * (haps - 1) // 2).where(haps > 1, 0)

    result = pd.DataFrame(
        {
            "chrom": df["Chr"],
            "start": df["Start"],
            "end": df["End"],
            "within_pop0": _pairs_within(haps_pop0) * interval_len,
            "within_pop1": _pairs_within(haps_pop1) * interval_len,
            # Every pop0 haplotype is compared with every pop1 haplotype.
            "dxy": haps_pop0 * haps_pop1 * interval_len,
        }
    )

    # Save the result to a tab-separated file
    result.to_csv(output_truth_path, sep="\t", index=False, header=True)


# Example usage: compute truth counts for the two-population input.
# NOTE(review): the input file below is not written by any cell visible in
# this notebook — presumably produced elsewhere; confirm before running.
combined_file = "diploid/pops_callable_sites.bedgraph"
output_truth = "diploid/pops_truth.tsv"
ploidy = 2  # Set ploidy based on your data

compute_truth_from_combined_file(combined_file, ploidy, output_truth)


In [9]:
import pandas as pd


def compute_within_truth(input_file_path, ploidy, output_truth_path):
    """Compute expected within-sample comparison counts per interval.

    The input is a headerless, tab-separated file whose four columns are
    read as ``chrom``, ``start``, ``end`` and ``callable_indvs``. For each
    row the haplotype count is ``ploidy * callable_indvs``; the number of
    unordered haplotype pairs (0 when there are fewer than two haplotypes)
    is multiplied by the interval length ``end - start``.

    Args:
        input_file_path: Path of the headerless tab-separated input file.
        ploidy: Number of haplotypes per individual.
        output_truth_path: Path of the tab-separated file to write, with the
            columns ``chrom, start, end, within_comp``.
    """
    column_names = ["chrom", "start", "end", "callable_indvs"]
    table = pd.read_csv(input_file_path, sep="\t", header=None, names=column_names)

    # Column-wise arithmetic: haplotypes -> unordered pairs -> scaled by length.
    hap_counts = ploidy * table["callable_indvs"]
    pair_counts = (hap_counts * (hap_counts - 1) // 2).where(hap_counts > 1, 0)
    span = table["end"] - table["start"]

    truth = table[["chrom", "start", "end"]].assign(within_comp=pair_counts * span)

    # Persist as TSV with a header row and no index column.
    truth.to_csv(output_truth_path, sep="\t", index=False, header=True)


# Example usage: compute within-sample truth counts for the single-group input.
# NOTE(review): the input file below is not written by any cell visible in
# this notebook — presumably produced elsewhere; confirm before running.
input_file = "diploid/no_pops.bedgraph"
output_truth = "diploid/no_pops_truth.tsv"
ploidy = 2  # Set ploidy based on your data

compute_within_truth(input_file, ploidy, output_truth)


In [None]:
# Count reference ("0") and first-alternate ("1") alleles per site across all
# samples of diploid/small.vcf and write the counts to a TSV file.
lines = []
with open("diploid/small.vcf") as f:
    for line in f:
        if line.startswith("#"):
            # Meta-information and column-header lines carry no genotypes.
            continue
        line = line.strip()
        if not line:
            # A blank line has no columns and would fail on split[1] below.
            continue
        lines.append(line.split("\t"))

with open("diploid/small.vcf.counts.tsv", "w") as f:
    print("chrom\tpos\tref_count\talt_count", file=f)
    for split in lines:
        chrom = split[0]
        pos = split[1]
        # Sample columns start at index 9. Only the GT subfield (first,
        # colon-separated, per the VCF spec) is parsed, so digits in other
        # subfields such as depth or quality are never counted as alleles.
        # Both "/" (unphased) and "|" (phased) separate allele codes.
        gts = [
            allele
            for sample in split[9:]
            for allele in sample.split(":", 1)[0].replace("|", "/").split("/")
        ]
        # Whole allele codes are compared, so e.g. "10" is not counted as a
        # "1" plus a "0"; missing alleles (".") are ignored.
        ref = gts.count("0")
        alt = gts.count("1")

        print(chrom, pos, ref, alt, sep="\t", file=f)

In [3]:
# Count reference ("0") and first-alternate ("1") alleles per site separately
# for two groups of samples in diploid/small.vcf, writing one TSV per group.
lines = []
with open("diploid/small.vcf") as f:
    for line in f:
        if line.startswith("#"):
            # Meta-information and column-header lines carry no genotypes.
            continue
        line = line.strip()
        if not line:
            # A blank line has no columns and would fail on split[1] below.
            continue
        lines.append(line.split("\t"))

with open("diploid/small.vcf.pop0.counts.tsv", "w") as f0, open("diploid/small.vcf.pop1.counts.tsv", "w") as f1:
    print(
        "chrom\tpos\tref_count\talt_count",
        file=f0,
    )
    print(
        "chrom\tpos\tref_count\talt_count",
        file=f1,
    )
    for split in lines:
        chrom = split[0]
        pos = split[1]
        # Sample columns start at index 9: the first five (9..13) form pop0,
        # every remaining sample column forms pop1.
        # Only the GT subfield (first, colon-separated, per the VCF spec) is
        # parsed, so digits in other subfields are never counted as alleles;
        # "/" (unphased) and "|" (phased) both separate allele codes.
        pop_0_gts = [
            allele
            for sample in split[9:14]
            for allele in sample.split(":", 1)[0].replace("|", "/").split("/")
        ]
        pop_0_ref = pop_0_gts.count("0")
        pop_0_alt = pop_0_gts.count("1")
        pop_1_gts = [
            allele
            for sample in split[14:]
            for allele in sample.split(":", 1)[0].replace("|", "/").split("/")
        ]
        pop_1_ref = pop_1_gts.count("0")
        pop_1_alt = pop_1_gts.count("1")

        print(chrom, pos, pop_0_ref, pop_0_alt, sep="\t", file=f0)
        print(chrom, pos, pop_1_ref, pop_1_alt, sep="\t", file=f1)