In [20]:
from src.utils import get_project_root

with open(get_project_root() / "inputs/ryr1_rabit.txt", "r") as f:
    # Read the file's only line. readline() keeps a trailing newline when one is
    # present, so strip surrounding whitespace to keep it from being counted as a
    # residue (and from shifting the length reported below).
    line = f.readline().strip()


len(line)

5037

In [24]:

from typing import NamedTuple, Optional


class SubseqResult(NamedTuple):
    """Outcome of cutting a window out of a sequence and deleting disordered residues."""

    # Residues of the window that were kept, joined in their original order.
    final_sequence: str
    # 1-based index of the pocket residue in final_sequence, or None when the
    # pocket lies outside the window or inside a removed region.
    new_pocket_location: Optional[int]


def extract_and_remove_disordered(sequence: str, aa_start: int, aa_end: int, pocket_location: int, disordered_regions: list[tuple[int, int]]) -> SubseqResult:
    """Extract ``sequence[aa_start..aa_end]`` and drop the disordered residues inside it.

    All positions are 1-based and inclusive, expressed in the coordinates of the
    full ``sequence``.

    Args:
        sequence: Full sequence string.
        aa_start: First residue of the window; values below 1 are clamped to 1.
        aa_end: Last residue of the window; values beyond the end of
            ``sequence`` are clamped to ``len(sequence)``.
        pocket_location: Position of the pocket residue in ``sequence``.
        disordered_regions: ``(start, end)`` ranges to delete. Parts of a range
            that fall outside the window are ignored; ranges may overlap.

    Returns:
        SubseqResult holding the trimmed sequence and the pocket's 1-based
        position in it. The position is ``None`` when the pocket residue is not
        part of the trimmed sequence. An empty window (``aa_start > aa_end``
        after clamping) yields ``SubseqResult("", None)``.
    """
    # Clamp the window once and derive every offset from the clamped bounds, so
    # the slice, the region mask and the pocket index all share one origin.
    first = max(aa_start, 1)
    last = min(aa_end, len(sequence))
    if first > last:
        # Also covers a negative aa_end, which would otherwise wrap around as a
        # negative slice index.
        return SubseqResult("", None)

    window = sequence[first - 1:last]
    window_len = len(window)

    # keep[i] is True while window[i] should stay in the output.
    keep = [True] * window_len
    for dr_start, dr_end in disordered_regions:
        # Intersect the region with the window, as 0-based inclusive offsets.
        lo = max(dr_start, first) - first
        hi = min(dr_end, last) - first
        if lo <= hi:
            keep[lo:hi + 1] = [False] * (hi - lo + 1)

    final_sequence = ''.join(residue for residue, kept in zip(window, keep) if kept)

    # The pocket's new 1-based index equals the number of kept residues up to
    # and including it.
    pocket_offset = pocket_location - first
    if 0 <= pocket_offset < window_len and keep[pocket_offset]:
        new_pocket_location = sum(keep[:pocket_offset + 1])
    else:
        new_pocket_location = None

    return SubseqResult(final_sequence, new_pocket_location)


# (start, end) residue ranges to delete, given in full-sequence coordinates;
# extract_and_remove_disordered treats them as 1-based and inclusive.
disordered_regions = [
    (4255, 4539),
    (4588, 4625),
]

# Defined at module level because the sanity check below reads it as well;
# previously it only existed as a keyword argument, so the print raised
# NameError on a fresh kernel.
pocket_location = 4716

subseq_result = extract_and_remove_disordered(
    sequence=line,
    aa_start=3750,
    aa_end=5037,
    pocket_location=pocket_location,
    disordered_regions=disordered_regions,
)

# Fail with a clear message instead of a TypeError from `None - 1` below.
assert subseq_result.new_pocket_location is not None, (
    "pocket residue is not part of the trimmed sequence"
)

# The residue at the pocket must be the same before and after trimming.
print(f"AA pocket: {line[pocket_location - 1]}. New pocket ({subseq_result.new_pocket_location}): {subseq_result.final_sequence[subseq_result.new_pocket_location - 1]}")

AA pocket: W. New pocket (644): W


In [26]:
# Length of the trimmed sequence. The window 3750-5037 spans 1288 residues and
# the two disordered regions remove 285 + 38 of them, so 965 is expected.
len(subseq_result.final_sequence)

965

In [27]:
# Write the trimmed sequence to disk, replacing any file left by an earlier run
import os

output_file = get_project_root() / "inputs/ryr1_rabit_section.txt"

# Report a pre-existing output file and delete it before writing the new one
if output_file.exists():
    print(f"{output_file.name} already exists")
    os.remove(output_file)

with open(output_file, mode="w") as out_handle:
    out_handle.write(subseq_result.final_sequence)

ryr1_rabit_section.txt already exists
