In [3]:
import pandas as pd
import numpy as np
import os
import random
from pyfastaq.sequences import file_reader as fasta_reader
import requests

In [4]:
# === PARAMETERS ===
# Length (bp) that every generated sequence must have.
SEQUENCE_LENGTH = 393_216
# Length (bp) the construct template is expected to have.
CONSTRUCT_LENGTH = 1371
# Slice offset into the construct at which the enhancer is placed.
ENHANCER_INSERT_START = 997

# Flank sizes around an unmodified construct; the right side takes the odd base.
FLANK_LEFT = (SEQUENCE_LENGTH - CONSTRUCT_LENGTH) // 2
FLANK_RIGHT = SEQUENCE_LENGTH - CONSTRUCT_LENGTH - FLANK_LEFT

# Largest offset (bp, either direction) applied to the left/right flank split.
MAX_SHIFT = 64
# Number of shifted copies written per enhancer.
POS_ENCODING_REPLICATES = 5

In [5]:
# === PATHS ===
# Inputs: enhancer table (needs a 'seq' column) and the construct template
# (a FASTA file expected to hold 1371 bp).
enhancer_csv = "lib_tab_TF_syntax_rd_label_and_seq_for_combined_libs_4p5and4p7_combined_2025_04_04.csv"
construct_fasta = "downstream_construct_seq.fasta"

# Output: make sure the target directory exists, then derive the FASTA path.
output_dir = "enformer_input_fasta_downstream"
os.makedirs(output_dir, exist_ok=True)
output_fasta = os.path.join(output_dir, "downstream_enformer_inputs.fasta")

In [6]:
# === Load Enhancer Sequences ===
enhancer_df = pd.read_csv(enhancer_csv)
# Fail early with a readable message instead of a KeyError deep in the write loop.
assert "seq" in enhancer_df.columns, f"'seq' column missing from {enhancer_csv}"

# === Load Construct Template ===
# All non-header lines of the FASTA are concatenated into one string.
with open(construct_fasta) as f:
    lines = f.read().splitlines()
    construct_seq = "".join([l.strip() for l in lines if not l.startswith(">")])
assert len(construct_seq) == CONSTRUCT_LENGTH, f"Expected construct length {CONSTRUCT_LENGTH}, got {len(construct_seq)}"

# === Fetch AXIN2 Genomic Flank from UCSC ===
AXIN2_CHR, AXIN2_LOC = "chr17", 65509852  # approximate center of insertion
# Request exactly as many genomic bases as are needed to pad the construct
# to SEQUENCE_LENGTH, split around AXIN2_LOC (the right half gets the odd base).
total_flank_needed = SEQUENCE_LENGTH - CONSTRUCT_LENGTH
start = AXIN2_LOC - (total_flank_needed // 2)
end = AXIN2_LOC + (total_flank_needed - total_flank_needed // 2)
url = f"https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom={AXIN2_CHR};start={start};end={end}"
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors directly rather than as a
# confusing KeyError on the missing "dna" field.
response = requests.get(url, timeout=60)
response.raise_for_status()
axin2_seq = response.json()["dna"].upper()
assert len(axin2_seq) == total_flank_needed, f"Axin2 flank is {len(axin2_seq)}, expected {total_flank_needed}"

In [10]:
# === Write Positional Encoding FASTA ===
# For every enhancer, place it into the construct and write
# POS_ENCODING_REPLICATES copies, each with the construct embedded in the
# AXIN2 flank at a slightly different left/right split.

# Shifts are sampled WITHOUT replacement so that no two replicates of the same
# enhancer share a shift: the header is built from the enhancer id and the
# shift only, so a repeated shift would produce a duplicate FASTA record.
shift_choices = range(-MAX_SHIFT, MAX_SHIFT + 1)
assert POS_ENCODING_REPLICATES <= len(shift_choices), (
    f"Cannot draw {POS_ENCODING_REPLICATES} distinct shifts from {len(shift_choices)} options"
)

# Dedicated, seeded generator: re-running the cell regenerates the same file
# without touching the global `random` state.
SHIFT_RANDOM_SEED = 0
shift_rng = random.Random(SHIFT_RANDOM_SEED)

with open(output_fasta, "w") as out_fasta:
    for i, row in enhancer_df.iterrows():
        enhancer_seq = row["seq"]
        enhancer_id = row.get("inferred_name", f"enhancer_{i}")
        ENHANCER_INSERT_END = ENHANCER_INSERT_START + len(enhancer_seq)

        # The enhancer overwrites the construct bases in
        # [ENHANCER_INSERT_START, ENHANCER_INSERT_END); the rest is kept.
        modified_construct = (
            construct_seq[:ENHANCER_INSERT_START] +
            enhancer_seq +
            construct_seq[ENHANCER_INSERT_END:]
        )

        construct_len = len(modified_construct)
        flank_total = SEQUENCE_LENGTH - construct_len
        assert flank_total > 0, f"Construct too long for Enformer input: {construct_len}"

        for shift in shift_rng.sample(shift_choices, POS_ENCODING_REPLICATES):
            left_flank_size = (flank_total // 2) + shift
            right_flank_size = flank_total - left_flank_size

            left_flank = axin2_seq[:left_flank_size]
            # Explicit start index: `axin2_seq[-0:]` would return the whole
            # sequence instead of an empty right flank.
            right_flank = axin2_seq[len(axin2_seq) - right_flank_size:]
            full_seq = left_flank + modified_construct + right_flank

            assert len(full_seq) == SEQUENCE_LENGTH, f"Final sequence length is {len(full_seq)} (expected {SEQUENCE_LENGTH})"

            header = f"{enhancer_id}_pos_enc_shift{shift}"
            out_fasta.write(f">{header}\n")
            # Wrap the sequence at 80 characters per line.
            for k in range(0, len(full_seq), 80):
                out_fasta.write(full_seq[k:k+80] + "\n")

In [11]:
def check_fasta_lengths(fasta_file, expected_length=393_216):
    """Print a report of FASTA records whose length is not ``expected_length``.

    Each record with a wrong length gets one "❌" line. The "✅" success line
    is printed only when the file holds at least one record and every record
    has the expected length. A file without any record is reported as a
    failure rather than as success.

    Args:
        fasta_file: Path of the FASTA file to check.
        expected_length: Required number of sequence characters per record
            (whitespace at line ends is not counted).

    Returns:
        None. Results are reported through ``print``.
    """
    total_records = 0
    bad_records = 0

    def _check_record(name, length):
        # Count one finished record and report it if its length is wrong.
        nonlocal total_records, bad_records
        total_records += 1
        if length != expected_length:
            bad_records += 1
            print(f"❌ {name} has length {length} (expected {expected_length})")

    with open(fasta_file) as f:
        header = None
        # Only the length is needed, so the sequence itself is not kept in memory.
        seq_len = 0
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                # `is not None` so a record with an empty header is still checked.
                if header is not None:
                    _check_record(header, seq_len)
                header = line[1:]
                seq_len = 0
            else:
                seq_len += len(line)
        # The last record has no following header to trigger its check.
        if header is not None:
            _check_record(header, seq_len)

    if total_records == 0:
        print(f"❌ No sequences found in {fasta_file}")
    elif bad_records == 0:
        print(f"✅ All sequences are {expected_length} bp long.")

# Validate the generated FASTA. The path and length come from the configuration
# cells so this check cannot drift from what the writer cell actually produced.
check_fasta_lengths(output_fasta, expected_length=SEQUENCE_LENGTH)

✅ All sequences are 393216 bp long.


In [20]:
# Print the first N lines of the (large) FASTA file without loading all of it
fasta_path = "enformer_input_fasta_downstream/downstream_enformer_inputs.fasta"

N = 20  # Number of lines to preview
with open(fasta_path) as f:
    # zip() stops at whichever ends first: N lines shown, or end of file.
    for line_idx, line in zip(range(N), f):
        print(line.strip())

>rd_1_pos_enc_shift-46
CCTAGGCCTCCCAAAGTGCCGGGATTACAGGCATGGGCCACTGCACCCAAGGCAAGACTAGATTTTTTAAAATACTGTTC
CAGAACCAGCTGACCAGTGGCCCCAATACCACATATGCAGGGCTGTGATCCAGTGGGACATCCTAGCACCGGGTAGAAAA
TATTGAAATACTGAGCTAATGGTTGATAAACAGCCATAGCTTCTAGTTTCCTGAGTGCCATCTGCTGTCCCAACTACTTG
CCCCCTCTCCCAGGACCTCCTGGACTTCCCCACATTTCAGACACCTAACAGTGAATGACTGACACAGCAGAGGGTCTCTG
GGTCCCTGAACCTGCTGCTAGGGCCCGCATCTGCTCTGATTGGCCTGTGTCTTATCACTGGCTAAGTGTTTTCACTATTC
CCCTTCACATGCGATTGTATGACAGAAGCATACCCAGCCGTGGTTGACATCGGGGACGTGGTTAACCACTTGCTCCCGCC
TGCAACACCTCCCTGCCATTTTCCTGGTTTCCTACTACCCTTGCCCTCTGTCTGCTGCTTCTCTTTTGCAGTTCCCTTTT
TTTGTTGTTGTTCCATATTGGTGCCCTGTCCTAGTCCCCTTCTCCTGTCTAGCCCTAAGTGATTACTTTAAGCACCATCT
ATATGTCCTGAACTCAGAAGGCAATAGACAAGCTTATCAGACTTGTGTAAAAAGTAAAAGTTAATAATTCTGAGGAAGGC
AGATACTCTTTCTCAAAAAGAAAAGATGCCTATGGTTTATTGTGATGGTCCCCAGACACTCATTTGTGGATTAACCATAG
CAGAATTACCCAGGAATATCTAAACGATGAGGATTCATTCAACTCCTTCCCTAGAGGACCGGTTTCAGTAGGTGTGGAAT
AGGTTCTGAGAATCTCTGCTTGTTCAAAAGCAGCCTAGGCAAAGGATATGAATAGACAATTCTCAAAAGAAGATACACAA
ATGGC