# 今回の取り組み

- 挿入塩基のクラスタリングに取り組みます
    - 挿入塩基があるアレルは、preprocessの段階である程度ざっくりと検出する
    - consensus配列まで出して、これに再度mapping->midsvする？
    - AyabeTask1だと挿入は3アレルある（flox, left-loxp, right-loxp）
    - このうち、left-loxpとright-loxpのFASTAファイルを生成する

## いつものセットアップ

In [1]:
# Boilerplate that puts the repository root on sys.path and makes it the working directory.
import sys, os
from pathlib import Path
if Path(os.getcwd()).stem != "DAJIN2":
    # "__file__" is a plain string here, so abspath() resolves it against the current
    # working directory; .parent.parent therefore points two levels above the cwd.
    parent_path = str(Path(os.path.dirname(os.path.abspath("__file__"))).parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)

print(os.getcwd())
# Also expose the "src" directory under the (possibly changed) working directory.
sys.path.append(os.getcwd() + "/" + "src")

/mnt/d/Research/DAJIN2


In [2]:
%%bash
# Upgrade pip itself, then upgrade the packages listed in requirements.txt
pip install -q -U pip
pip install -q -U -r requirements.txt


[notice] A new release of pip is available: 22.3 -> 23.0.1
[notice] To update, run: pip install --upgrade pip


# 実験

In [3]:
from __future__ import annotations

import sys, os
from pathlib import Path

import hashlib
from collections import defaultdict
from pathlib import Path

from DAJIN2.core import preprocess, classification, clustering, consensus, report


##### # * 2-cut deletion
# Run parameters for this experiment: sample/control reads, allele design FASTA,
# run name, genome id, debug flag and the requested number of threads.
SAMPLE, CONTROL, ALLELE, NAME, GENOME, DEBUG, THREADS = (
    "tests/data/knockout/test_barcode25.fq.gz",
    "tests/data/knockout/test_barcode30.fq.gz",
    "tests/data/knockout/design_stx2.fa",
    "test-knockout",
    "mm10",
    True,
    30,
)


print(f"processing {NAME}...")

##########################################################
# Check inputs
##########################################################
# NOTE(review): presumably validates the three input files - confirm in preprocess.validate_inputs.
preprocess.validate_inputs.check_files(SAMPLE, CONTROL, ALLELE)
# Per-run working directory: DAJINResults/.tempdir/<NAME>
TEMPDIR = Path("DAJINResults", ".tempdir", NAME)
IS_CACHE_CONTROL = preprocess.validate_inputs.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.validate_inputs.exists_cached_genome(GENOME, TEMPDIR, IS_CACHE_CONTROL)
# The URLs stay None unless a genome is given and no cached genome was found.
UCSC_URL, GOLDENPATH_URL = None, None
if GENOME and not IS_CACHE_GENOME:
    UCSC_URL, GOLDENPATH_URL = preprocess.validate_inputs.check_and_fetch_genome(GENOME)

##########################################################
# Format inputs
##########################################################
SAMPLE_NAME = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME = preprocess.format_inputs.extract_basename(CONTROL)
FASTA_ALLELES = preprocess.format_inputs.dictionize_allele(ALLELE)
# Leave one usable CPU free, but never go below one worker: with a single usable
# CPU `len(os.sched_getaffinity(0)) - 1` is 0, which would request zero threads.
THREADS = max(1, min(THREADS, len(os.sched_getaffinity(0)) - 1))

preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME, CONTROL_NAME)

# Genome coordinates and chromosome size are only looked up (and cached) when a genome id is given.
if GENOME:
    GENOME_COODINATES = preprocess.format_inputs.fetch_coodinate(GENOME, UCSC_URL, FASTA_ALLELES["control"])
    CHROME_SIZE = preprocess.format_inputs.fetch_chrom_size(GENOME_COODINATES["chr"], GENOME, GOLDENPATH_URL)
    preprocess.format_inputs.cache_coodinates_and_chromsize(TEMPDIR, GENOME, GENOME_COODINATES, CHROME_SIZE)


processing test-knockout...


In [4]:
####################################################################################
# Classify alleles
####################################################################################
print("Classify...")

# Classification result for the sample reads; it is the input of the clustering step below.
classif_sample = classification.classify_alleles(TEMPDIR, SAMPLE_NAME)

# Disabled: SV detection is not run at classification time in this notebook.
# for classif in classif_sample:
#     classif["SV"] = classification.detect_sv(classif["CSSPLIT"], threshold=50)

####################################################################################
# Clustering
####################################################################################
print("Clustering...")

MUTATION_LOCI = clustering.extract_mutation_loci(TEMPDIR, FASTA_ALLELES, SAMPLE_NAME, CONTROL_NAME)
# Each clustering helper takes the read records and returns the updated records.
clust_sample = clustering.add_labels(classif_sample, TEMPDIR, CONTROL_NAME, MUTATION_LOCI, THREADS)
clust_sample = clustering.add_readnum(clust_sample)
clust_sample = clustering.add_percent(clust_sample)
clust_sample = clustering.update_labels(clust_sample)

####################################################################################
# Consensus call
####################################################################################
print("Consensus call...")

cons_percentage, cons_sequence = consensus.call_consensus(clust_sample)
# NOTE: the installed `consensus.call_allele_name` is called with two arguments here;
# the version prototyped later in this notebook additionally takes `cons_percentage`.
allele_names = consensus.call_allele_name(cons_sequence, FASTA_ALLELES)
# Re-key both consensus mappings by the generated allele names.
cons_percentage = consensus.update_key_by_allele_name(cons_percentage, allele_names)
cons_sequence = consensus.update_key_by_allele_name(cons_sequence, allele_names)
RESULT_SAMPLE = consensus.add_key_by_allele_name(clust_sample, allele_names)
# Order the final read records by cluster label (in place).
RESULT_SAMPLE.sort(key=lambda x: x["LABEL"])


Classify...
Clustering...
Consensus call...


In [21]:
# Recompute the consensus so that the mappings are keyed by (ALLELE, LABEL, PERCENT) tuples again.
cons_percentage, cons_sequence = consensus.call_consensus(clust_sample)
print(cons_percentage)

defaultdict(<class 'list'>, {('deletion', 1, 33.4): [{'=G': 66.66666666666707, 'N': 33.33333333333331}, {'=C': 68.26347305389264, 'N': 31.73652694610774}, {'=T': 70.0598802395214, 'N': 29.940119760479007}, {'=C': 79.24151696606843, 'N': 20.758483033932144}, {'=C': 89.02195608782505, 'N': 10.978043912175659}, {'=A': 90.01996007984103, 'N': 9.980039920159687}, {'=G': 91.8163672654698, 'N': 8.183632734530937}, {'=G': 95.40918163672733, 'N': 4.590818363273453}, {'=G': 95.80838323353372, 'N': 4.191616766467066}, {'=T': 96.4071856287433, 'N': 3.5928143712574845}, {'=G': 97.20558882235609, 'N': 2.7944111776447103}, {'=T': 97.40518962075929, 'N': 2.5948103792415167}, {'=C': 97.40518962075929, 'N': 2.5948103792415167}, {'=T': 97.60479041916248, 'N': 2.395209580838323}, {'=C': 98.00399201596888, 'N': 1.9960079840319358}, {'=A': 98.20359281437207, 'N': 1.7964071856287422}, {'=T': 98.20359281437207, 'N': 1.7964071856287422}, {'=A': 98.40319361277527, 'N': 1.5968063872255487}, {'=G': 98.60279441117

In [15]:
# Look at the first key of cons_percentage and the per-position percentages stored under it.
key = list(cons_percentage.keys())[0]
print(key)
print(cons_percentage[key])

allele1_deletion_indels_33.4%
[{'=G': 66.66666666666707, 'N': 33.33333333333331}, {'=C': 68.26347305389264, 'N': 31.73652694610774}, {'=T': 70.0598802395214, 'N': 29.940119760479007}, {'=C': 79.24151696606843, 'N': 20.758483033932144}, {'=C': 89.02195608782505, 'N': 10.978043912175659}, {'=A': 90.01996007984103, 'N': 9.980039920159687}, {'=G': 91.8163672654698, 'N': 8.183632734530937}, {'=G': 95.40918163672733, 'N': 4.590818363273453}, {'=G': 95.80838323353372, 'N': 4.191616766467066}, {'=T': 96.4071856287433, 'N': 3.5928143712574845}, {'=G': 97.20558882235609, 'N': 2.7944111776447103}, {'=T': 97.40518962075929, 'N': 2.5948103792415167}, {'=C': 97.40518962075929, 'N': 2.5948103792415167}, {'=T': 97.60479041916248, 'N': 2.395209580838323}, {'=C': 98.00399201596888, 'N': 1.9960079840319358}, {'=A': 98.20359281437207, 'N': 1.7964071856287422}, {'=T': 98.20359281437207, 'N': 1.7964071856287422}, {'=A': 98.40319361277527, 'N': 1.5968063872255487}, {'=G': 98.60279441117846, 'N': 1.3972055888

In [19]:
from collections import Counter
# most_common() orders the entries by value, so [0][0] is the token with the largest percentage.
Counter({'=G': 66.66666666666707, 'N': 33.33333333333331}).most_common()[0][0]

'=G'

In [22]:
from __future__ import annotations

import re
from itertools import groupby
from collections import defaultdict


def _call_percentage(cssplits: list[str]) -> list[dict[str, float]]:
    """
    Call position weight matrix in defferent loci.
    Non defferent loci are annotated to "Match" or "Unknown(N)"
    """
    cssplits_transposed = [list(cs) for cs in zip(*cssplits)]
    coverage = len(cssplits)
    cons_percentage = []
    for cs_transposed in cssplits_transposed:
        count_cs = defaultdict(int)
        for cs in cs_transposed:
            count_cs[cs] += 1 / coverage * 100
        count_cs_sorted = dict(sorted(count_cs.items(), key=lambda x: x[1], reverse=True))
        cons_percentage.append(count_cs_sorted)
    return cons_percentage


def _call_sequence(cons_percentage: list[dict[str, float]]) -> list[str]:
    consensus_sequence = []
    for cons_per in cons_percentage:
        cons = max(cons_per, key=cons_per.get)
        if cons.startswith("="):
            cons = cons.replace("=", "")
        elif cons.startswith("-"):
            continue
        elif cons.startswith("*"):
            cons = cons[-1]
        elif cons.startswith("+"):
            cons_ins = cons.split("|")
            if cons_ins[-1].startswith("="):
                cons = cons.replace("=", "")
            elif cons_ins[-1].startswith("-"):
                cons = "".join(cons_ins[:-1])
            elif cons_ins[-1].startswith("*"):
                cons = "".join([*cons_ins[:-1], cons_ins[-1][-1]])
            cons = cons.replace("+", "")
            cons = cons.replace("|", "")
        consensus_sequence.append(cons)
    return "".join(consensus_sequence)


def call_consensus(clust_sample: list[dict]) -> tuple[dict[tuple, list[dict[str, float]]], dict[tuple, str]]:
    """
    Call the consensus of every cluster label.

    Args:
        clust_sample: Read records. Each record must provide the keys
            "LABEL", "ALLELE", "PERCENT" and "CSSPLIT" (comma-separated
            tokens). The list is sorted by "LABEL" in place.

    Returns:
        ``(cons_percentage, cons_sequence)``: two mappings (``defaultdict``)
        keyed by ``(ALLELE, LABEL, PERCENT)`` taken from the first read of
        each label group. The first holds the position weight matrix of the
        group, the second its consensus sequence.
    """
    cons_percentage = defaultdict(list)
    cons_sequence = defaultdict(list)
    # groupby only merges adjacent items, so the reads must be ordered by label first.
    clust_sample.sort(key=lambda x: x["LABEL"])
    for _, group in groupby(clust_sample, key=lambda x: x["LABEL"]):
        clust = list(group)
        first = clust[0]
        keys = (first["ALLELE"], first["LABEL"], first["PERCENT"])
        cons_per = _call_percentage([cs["CSSPLIT"].split(",") for cs in clust])
        cons_percentage[keys] = cons_per
        cons_sequence[keys] = _call_sequence(cons_per)
    return cons_percentage, cons_sequence


def _detect_sv(cons_percentage: list[dict[str, float]], threshold: int = 50) -> list[bool]:
    """
    First draft of the structural-variant (SV) check; kept as a record of the failing attempt.

    NOTE(review): this draft iterates `cons_percentage` directly. When
    `call_allele_name` passes the mapping keyed by (ALLELE, LABEL, PERCENT)
    tuples, each `cons_per` is such a tuple key, so `cons_per.get` raises
    "AttributeError: 'tuple' object has no attribute 'get'". The later
    redefinition of `_detect_sv` in this notebook iterates `.values()` instead.
    """
    exists_sv = []
    for cons_per in cons_percentage:
        seq = max(cons_per, key=cons_per.get)
        # A run of `threshold` unknown bases
        if "N" * threshold in seq:
            exists_sv.append(True)
        # `threshold` consecutive inserted bases ("+X|")
        elif re.search(rf"(\+[ACGTN]\|){{{threshold}}}", seq):
            exists_sv.append(True)
        # `threshold` consecutive deleted bases ("-X")
        elif re.search(rf"(\-[ACGTN]){{{threshold}}}", seq):
            exists_sv.append(True)
        # `threshold` consecutive substitutions ("*XY")
        elif re.search(rf"(\*[ACGTN][ACGTN]){{{threshold}}}", seq):
            exists_sv.append(True)
        # Any lower-case base
        elif re.search(r"[acgtn]", seq):
            exists_sv.append(True)
        else:
            exists_sv.append(False)
    return exists_sv

# def _detect_sv(cons_sequence: list[dict], threshold: int = 50) -> list[bool]:
#     exists_sv = []
#     for seq in cons_sequence.values():
#         if "N" * threshold in seq:
#             exists_sv.append(True)
#         elif re.search(rf"(\+[ACGTN]\|){{{threshold}}}", seq):
#             exists_sv.append(True)
#         elif re.search(rf"(\-[ACGTN]){{{threshold}}}", seq):
#             exists_sv.append(True)
#         elif re.search(rf"(\*[ACGTN][ACGTN]){{{threshold}}}", seq):
#             exists_sv.append(True)
#         elif re.search(r"[acgtn]", seq):
#             exists_sv.append(True)
#         else:
#             exists_sv.append(False)
#     return exists_sv


def call_allele_name(cons_sequence: list[dict], cons_percentage: list[dict], FASTA_ALLELES: dict) -> dict[int, str]:
    """
    Compose a display name for every consensus allele.

    Both `cons_sequence` and `cons_percentage` are used as mappings keyed by
    ``(ALLELE, LABEL, PERCENT)`` and are expected to share the same order.
    A name looks like ``allele<LABEL>_<ALLELE>_<intact|sv|indels>_<PERCENT>%``,
    where LABEL is zero-padded to the number of digits of the allele count.

    - ``intact``: the consensus equals ``FASTA_ALLELES[ALLELE]``
    - ``sv``: `_detect_sv` flagged the allele
    - ``indels``: everything else

    Returns:
        Mapping from LABEL to the generated allele name.
    """
    sv_flags = _detect_sv(cons_percentage)
    width = len(str(len(cons_percentage)))
    names = {}
    for has_sv, ((allele, label, percent), sequence) in zip(sv_flags, cons_sequence.items()):
        if sequence == FASTA_ALLELES[allele]:
            suffix = "_intact"
        elif has_sv:
            suffix = "_sv"
        else:
            suffix = "_indels"
        names[label] = f"allele{label:0{width}}_{allele}" + suffix + f"_{percent}%"
    return names


def update_key_by_allele_name(cons: dict, allele_names: dict[int, str]) -> dict:
    """
    Rename the keys of `cons` in place to the given allele names.

    Keys and names are paired by position (the n-th key of `cons` gets the
    n-th value of `allele_names`); surplus keys or names are left alone.

    Returns:
        The same `cons` object, now keyed by allele name.
    """
    # Snapshot the current keys first: the dict is modified while renaming.
    renames = zip(tuple(cons), allele_names.values())
    for old_key, new_key in renames:
        value = cons.pop(old_key)
        cons[new_key] = value
    return cons


def add_key_by_allele_name(clust_sample: list[dict], allele_names: dict[int, str]) -> list[dict]:
    """
    Attach the allele name to every read record.

    Each record gets a "NAME" entry looked up from `allele_names` by its
    "LABEL". The records are modified in place and the same list is returned.
    """
    for read in clust_sample:
        read.update(NAME=allele_names[read["LABEL"]])
    return clust_sample


In [24]:
# First try: fails inside `_detect_sv`, which iterates the tuple keys of `cons_percentage` (error below).
call_allele_name(cons_sequence, cons_percentage, FASTA_ALLELES)

AttributeError: 'tuple' object has no attribute 'get'

In [32]:
# Grab the first value of cons_percentage (the per-position list of one allele) ...
for cons_per in cons_percentage.values():
    break

# ... and the first per-position dict inside it.
for cssplit in cons_per:
    break

print(cssplit)
# The consensus token of a position is the key with the highest percentage.
print(max(cssplit, key=cssplit.get))

{'=G': 66.66666666666707, 'N': 33.33333333333331}
=G


In [42]:
def _detect_sv(cons_percentage: list[dict[str, float]], threshold: int = 50) -> list[bool]:
    exists_sv = []
    for cons_per in cons_percentage.values():
        cons_cssplits = []
        for cssplit in cons_per:
            seq = max(cssplit, key=cssplit.get)
            cons_cssplits.append(seq)
        cons_cssplits = "".join(cons_cssplits)
        if "N" * threshold in cons_cssplits:
            exists_sv.append(True)
        elif re.search(rf"(\+[ACGTN]\|){{{threshold}}}", cons_cssplits):
            exists_sv.append(True)
        elif re.search(rf"(\-[ACGTN]){{{threshold}}}", cons_cssplits):
            exists_sv.append(True)
        elif re.search(rf"(\*[ACGTN][ACGTN]){{{threshold}}}", cons_cssplits):
            exists_sv.append(True)
        elif re.search(r"[acgtn]", cons_cssplits):
            exists_sv.append(True)
        else:
            exists_sv.append(False)
    return exists_sv



In [43]:
# Re-run the consensus call and check the redefined `_detect_sv` on its tuple-keyed result.
cons_percentage, cons_sequence = consensus.call_consensus(clust_sample)
_detect_sv(cons_percentage)

[True, True, True, True, True]

In [45]:

def call_allele_name(cons_sequence: list[dict], cons_percentage: list[dict], FASTA_ALLELES: dict) -> dict[int, str]:
    """
    Compose a display name for every consensus allele.

    Both `cons_sequence` and `cons_percentage` are used as mappings keyed by
    ``(ALLELE, LABEL, PERCENT)`` and are expected to share the same order.
    A name looks like ``allele<LABEL>_<ALLELE>_<intact|sv|indels>_<PERCENT>%``,
    where LABEL is zero-padded to the number of digits of the allele count.

    - ``intact``: the consensus equals ``FASTA_ALLELES[ALLELE]``
    - ``sv``: `_detect_sv` flagged the allele
    - ``indels``: everything else

    Returns:
        Mapping from LABEL to the generated allele name.
    """
    sv_flags = _detect_sv(cons_percentage)
    width = len(str(len(cons_percentage)))
    names = {}
    for has_sv, ((allele, label, percent), sequence) in zip(sv_flags, cons_sequence.items()):
        if sequence == FASTA_ALLELES[allele]:
            suffix = "_intact"
        elif has_sv:
            suffix = "_sv"
        else:
            suffix = "_indels"
        names[label] = f"allele{label:0{width}}_{allele}" + suffix + f"_{percent}%"
    return names


In [46]:
# Second try, now with the `_detect_sv` that iterates `cons_percentage.values()`.
call_allele_name(cons_sequence, cons_percentage, FASTA_ALLELES)

{1: 'allele1_deletion_sv_33.4%',
 2: 'allele2_deletion_sv_33.2%',
 3: 'allele3_deletion_sv_32.8%',
 4: 'allele4_deletion_sv_0.533%',
 5: 'allele5_inversion_sv_0.067%'}

- `_detect_sv`のバグを修正しました
- 動作確認をします

In [47]:
%%bash
# Remove the results and the .tempdir working directory of the "single-stx2deletion" run
rm -rf DAJINResults/single-stx2deletion
rm -rf DAJINResults/.tempdir/single-stx2deletion

In [49]:
%%bash
# Install the working tree in editable mode, then run the DAJIN2 command line on the knockout test data
pip install -qe .
DAJIN2 \
    --name single-stx2deletion \
    --sample "tests/data/knockout/test_barcode25.fq.gz" \
    --control "tests/data/knockout/test_barcode30.fq.gz" \
    --allele "tests/data/knockout/design_stx2.fa" \
    --genome mm10 \
    --threads 10


[notice] A new release of pip is available: 22.1.2 -> 23.0.1
[notice] To update, run: pip install --upgrade pip
tests/data/knockout/test_barcode30.fq.gz is now processing...
tests/data/knockout/test_barcode25.fq.gz is now processing...


Classify...
Clustering...
Consensus call...


Finished! Open DAJINResults/single-stx2deletion to see the report.


In [50]:
%%bash
# List the HTML files written for the sample to check the allele names
ls -l DAJINResults/single-stx2deletion/HTML/test_barcode25/

total 10240
-rwxrwxrwx 1 kuno kuno 5194 Apr 17 15:48 test_barcode25_allele1_deletion_sv_33.4%.html
-rwxrwxrwx 1 kuno kuno 5169 Apr 17 15:48 test_barcode25_allele2_deletion_sv_33.2%.html
-rwxrwxrwx 1 kuno kuno 5169 Apr 17 15:48 test_barcode25_allele3_deletion_sv_32.8%.html
-rwxrwxrwx 1 kuno kuno 5195 Apr 17 15:48 test_barcode25_allele4_deletion_sv_0.533%.html
-rwxrwxrwx 1 kuno kuno 5927 Apr 17 15:48 test_barcode25_allele5_inversion_sv_0.067%.html


- 無事に`_sv`とすることができました

# 👉👉👉 いまここ 👈👈👈

# 👌👌👌 まとめ 👌👌👌


- `consensus`の`_detect_sv`を更新し、これまで`_indels`と命名されていたアレルが`_sv`と判定されるようになりました

- 次はいよいよ挿入塩基について考えます

# 次に取り組むこと

### Lists

+ ⬜ Insertionのなかにある変異を同定する手法を考案する
+ ⬜ AyabeTask1のright_loxpがいまいちな理由を考察する
+ ✅ 断端リードの扱いをどうするべきか
+ ✅ `SV`の判定をconsensus callのあとにする
+ ⬜ `preprocess.correct_sequence_error.replace_atmark`のコードがわかりにくい
    + テストを用意してリファクタリングする