# 今回の取り組み

- preprocessで時間がかかりすぎているので、速度が遅い部分を調べる

## いつものセットアップ

In [1]:
# Make the repository root importable and use it as the working directory.
import os
import sys
from pathlib import Path

if Path.cwd().stem != "DAJIN2":
    # abspath("__file__") resolves the *literal* name "__file__" against the
    # current directory, so the result is two levels above the current directory.
    parent_path = str(Path(os.path.abspath("__file__")).parent.parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)
print(os.getcwd())

/mnt/f/DAJIN_R10/DAJIN2


In [2]:
%%bash
# Upgrade pip itself, then upgrade everything listed in requirements.txt.
pip install --quiet --upgrade pip
pip install --quiet --upgrade --requirement requirements.txt

# 実験

- `preprocess.correct_sequence_error`と`preprocess.correct_knockin`でシークエンスエラーではない変異箇所を抽出する
- もしくは、コードを独立させたほうが良いかもしれない
    - correctionが終わったあとに、sampleとcontrolと比べて変異のある塩基位置を抽出する
    - correctionと独立させることで、correctionの方法が変わっても塩基位置を抽出するコードを変える必要がなくなる

In [10]:
from __future__ import annotations

import sys, os
from pathlib import Path

import hashlib
from collections import defaultdict
from pathlib import Path
from importlib import reload

from src.DAJIN2.core import preprocess, classification, clustering, consensus, report
from src.DAJIN2.core.clustering import clustering

# Re-import the DAJIN2 modules (in this fixed order) so that edits made on disk
# since the first import are picked up by the running kernel.
for _module in (preprocess, classification, clustering, consensus, report):
    reload(_module)


# Run configuration: R9 vs R10, point mutation example.
SAMPLE = "../fastq_20230304/barcode31.fq.gz"
CONTROL = "../fastq_20230304/barcode32.fq.gz"
ALLELE = "examples/pm-tyr/design_tyr.fa"
NAME = "test-20230304-pm-tyr"
GENOME = "mm10"
DEBUG = True
THREADS = 6

print(f"processing {NAME}...")

##########################################################
# Check inputs
##########################################################
preprocess.check_inputs.check_files(SAMPLE, CONTROL, ALLELE)

# Per-run working directory; it is also handed to the cache checks below.
TEMPDIR = Path("DAJINResults") / ".tempdir" / NAME
IS_CACHE_CONTROL = preprocess.check_inputs.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.check_inputs.exists_cached_genome(GENOME, TEMPDIR, IS_CACHE_CONTROL)

# Both URLs stay None unless a genome was requested and it is not cached yet.
UCSC_URL = GOLDENPATH_URL = None
if not IS_CACHE_GENOME and GENOME:
    UCSC_URL, GOLDENPATH_URL = preprocess.check_inputs.check_and_fetch_genome(GENOME)

##########################################################
# Format inputs
##########################################################
SAMPLE_NAME = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME = preprocess.format_inputs.extract_basename(CONTROL)
FASTA_ALLELES = preprocess.format_inputs.dictionize_allele(ALLELE)
# Leave one core free, but never drop below a single thread.
# os.cpu_count() returns None when the CPU count cannot be determined, and
# "cpu_count - 1" is 0 on a single-core machine; both cases fall back to 1.
THREADS = max(1, min(THREADS, (os.cpu_count() or 1) - 1))

preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME, CONTROL_NAME)

if GENOME:
    # "coodinate(s)" is the spelling used by the project API (sic); do not "fix" it here.
    # NOTE(review): UCSC_URL / GOLDENPATH_URL are still None when the genome was
    # found in the cache -- confirm that the fetch_* helpers handle None.
    GENOME_COODINATES = preprocess.format_inputs.fetch_coodinate(GENOME, UCSC_URL, FASTA_ALLELES["control"])
    CHROME_SIZE = preprocess.format_inputs.fetch_chrom_size(GENOME_COODINATES["chr"], GENOME, GOLDENPATH_URL)
    preprocess.format_inputs.cache_coodinates_and_chromsize(TEMPDIR, GENOME, GENOME_COODINATES, CHROME_SIZE)

################################################################################
# Export fasta files as single-FASTA format
################################################################################
# TODO: use yield, not export
fasta_dir = Path(TEMPDIR, "fasta")
for identifier, sequence in FASTA_ALLELES.items():
    # One file per allele: a ">identifier" header line followed by the sequence line.
    output_fasta = fasta_dir / f"{identifier}.fasta"
    contents = ">" + identifier + "\n" + sequence + "\n"
    output_fasta.write_text(contents)
###############################################################################
# Mapping with mappy
###############################################################################
# Two alignment passes per allele FASTA: first without a preset argument
# (the function's default), then with preset="splice". Within each pass the
# control reads are aligned before the sample reads.
preset_kwargs = ({}, {"preset": "splice"})
fastq_inputs = ((CONTROL, CONTROL_NAME), (SAMPLE, SAMPLE_NAME))
for path_fasta in Path(TEMPDIR, "fasta").glob("*.fasta"):
    name_fasta = path_fasta.stem
    for extra_kwargs in preset_kwargs:
        for path_fastq, name_fastq in fastq_inputs:
            preprocess.mappy_align.output_sam(
                TEMPDIR, path_fasta, name_fasta, path_fastq, name_fastq, **extra_kwargs, threads=THREADS
            )
########################################################################
# MIDSV conversion
########################################################################
# Only the "*_splice_*" SAM files are converted: control first, then sample.
sam_dir = Path(TEMPDIR, "sam")
for read_name in (CONTROL_NAME, SAMPLE_NAME):
    for path_sam in sam_dir.glob(f"{read_name}_splice_*"):
        preprocess.calc_midsv.output_midsv(TEMPDIR, path_sam)
###############################################################################
# Correct CSSPLITS
###############################################################################
# Disabled step, kept for reference:
# preprocess.correct_revititive_deletions.execute(TEMPDIR, FASTA_ALLELES, CONTROL_NAME, SAMPLE_NAME)

# Both correction steps receive the same arguments; the sequence-error
# correction runs before the knock-in correction.
correction_args = (TEMPDIR, FASTA_ALLELES, CONTROL_NAME, SAMPLE_NAME)
preprocess.correct_sequence_error.execute(*correction_args)
preprocess.correct_knockin.execute(*correction_args)

processing test-20230304-pm-tyr...


In [11]:
import midsv

In [12]:
# Allele whose MIDSV files are inspected in the cells that follow.
allele = "control"

In [13]:
# Load the MIDSV records for the chosen allele and split each CSSPLIT string
# into its comma-separated fields.
# NOTE(review): if midsv.read_jsonl returns a one-shot iterator rather than a
# list, midsv_control / midsv_sample are exhausted by the comprehensions below
# and later loops over them would silently see no records -- confirm.
midsv_dir = Path(TEMPDIR, "midsv")

# Control
midsv_control = midsv.read_jsonl(midsv_dir / f"{CONTROL_NAME}_splice_{allele}.jsonl")
cssplits_control = [record["CSSPLIT"].split(",") for record in midsv_control]

# Sample
midsv_sample = midsv.read_jsonl(midsv_dir / f"{SAMPLE_NAME}_splice_{allele}.jsonl")
cssplits_sample = [record["CSSPLIT"].split(",") for record in midsv_sample]

In [14]:
from collections import defaultdict

# Tally how many control reads there are for each CSSPLIT length
# (number of comma-separated fields).
readnum = defaultdict(int)
for n_fields in map(len, cssplits_control):
    readnum[n_fields] = readnum[n_fields] + 1

print(readnum)

defaultdict(<class 'int'>, {2845: 48350})


In [15]:
# Print the name and length of every control read whose CSSPLIT length differs
# from the expected one. 2845 is the only length reported by the tally printed
# in the preceding cell.
EXPECTED_CSSPLIT_LENGTH = 2845
for record in midsv_control:
    n_fields = len(record["CSSPLIT"].split(","))
    if n_fields == EXPECTED_CSSPLIT_LENGTH:
        continue
    print(record["QNAME"], n_fields)


In [21]:
# Show the location of the control MIDSV file for the chosen allele.
control_midsv_path = Path(TEMPDIR, "midsv", f"{CONTROL_NAME}_splice_{allele}.jsonl")
print(control_midsv_path)

DAJINResults/.tempdir/test-20230304-pm-tyr/midsv/barcode32_splice_control.jsonl


In [25]:
%%bash
# For each read ID of interest, write the SAM header lines (those starting
# with "@") plus every line containing that ID to tmp_<read_id>.sam.
# Stop at the first failing command instead of carrying on silently.
set -eu

sam="DAJINResults/.tempdir/test-20230304-pm-tyr/sam/barcode32_map-ont_control.sam"

for read_id in 9d00aba6d0fe 278e14de22da; do
    # grep reads the file directly, so a missing or unreadable input makes the
    # cell fail rather than quietly producing an empty output file.
    grep -e "^@" -e "${read_id}" "${sam}" > "tmp_${read_id}.sam"
done

In [None]:
%%bash
# For each read ID of interest, take the SAM header lines (those starting with
# "@") plus every line containing that ID, sort them into tmp_<read_id>.bam,
# and index the resulting BAM.
# Abort on the first failure, including failures inside a pipeline, so that
# `samtools index` never runs on a missing or truncated BAM.
set -euo pipefail

sam="DAJINResults/.tempdir/test-20230304-pm-tyr/sam/barcode32_map-ont_control.sam"

for read_id in 9d00aba6d0fe 278e14de22da; do
    bam="tmp_${read_id}.bam"
    # "-" makes samtools sort read the filtered SAM from standard input.
    grep -e "^@" -e "${read_id}" "${sam}" | samtools sort -o "${bam}" -
    samtools index "${bam}"
done

### midsvの更新(version 0.9.0 -> 0.9.5)

In [1]:
%%bash
# Show the midsv entry from `pip list` to confirm which version is installed.
pip list | grep midsv

midsv                                     0.9.5      /mnt/f/midsv/src


In [2]:
# Make the repository root importable and use it as the working directory.
import os
import sys
from pathlib import Path

if Path.cwd().stem != "DAJIN2":
    # abspath("__file__") resolves the *literal* name "__file__" against the
    # current directory, so the result is two levels above the current directory.
    parent_path = str(Path(os.path.abspath("__file__")).parent.parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)
print(os.getcwd())

/mnt/f/DAJIN_R10/DAJIN2


In [16]:
import midsv

In [17]:
# Allele whose MIDSV files are inspected in the cells that follow.
allele = "control"

In [18]:
# Load the MIDSV records for the chosen allele and split each CSSPLIT string
# into its comma-separated fields.
# NOTE(review): the execution counter restarted before this cell, so the kernel
# was presumably restarted; TEMPDIR, CONTROL_NAME and SAMPLE_NAME must be
# re-created (by re-running the pipeline cell) before this cell can run.
# NOTE(review): if midsv.read_jsonl returns a one-shot iterator rather than a
# list, midsv_control / midsv_sample are exhausted by the comprehensions below
# and later loops over them would silently see no records -- confirm.
midsv_dir = Path(TEMPDIR, "midsv")

# Control
midsv_control = midsv.read_jsonl(midsv_dir / f"{CONTROL_NAME}_splice_{allele}.jsonl")
cssplits_control = [record["CSSPLIT"].split(",") for record in midsv_control]

# Sample
midsv_sample = midsv.read_jsonl(midsv_dir / f"{SAMPLE_NAME}_splice_{allele}.jsonl")
cssplits_sample = [record["CSSPLIT"].split(",") for record in midsv_sample]

In [19]:
from collections import defaultdict

# Tally how many control reads there are for each CSSPLIT length
# (number of comma-separated fields).
readnum = defaultdict(int)
for n_fields in map(len, cssplits_control):
    readnum[n_fields] = readnum[n_fields] + 1

print(readnum)

defaultdict(<class 'int'>, {2845: 48350})


In [20]:
# Print the name and length of every control read whose CSSPLIT length differs
# from the expected one. 2845 is the only length reported by the tally printed
# in the preceding cell.
EXPECTED_CSSPLIT_LENGTH = 2845
for record in midsv_control:
    n_fields = len(record["CSSPLIT"].split(","))
    if n_fields == EXPECTED_CSSPLIT_LENGTH:
        continue
    print(record["QNAME"], n_fields)


# まとめ


- MIDSVの更新をして、最終的にリファレンス配列と長さが異なる配列を除去することにしました

# 次に取り組むこと

### Lists

+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`replace_n`）
+ [ ] 短いリードの扱いをどうするべきか
+ [ ] Insertionのなかにある変異を同定する手法を考案する
+ [ ] Ayabe-task1のright_loxpがいまいちな理由を考察する
+ [ ] `preprocess.correct_sequence_error.replace_atmark`のコードがわかりにくい
    + テストを用意してリファクタリングする

### Focus
+ [ ] cis変異の両端が欠失している場合に、Nで置き換えるとtransとなってしまうのをどうするか（`replace_n`）
+ [ ] 短いリードの扱いをどうするべきか

両者については、`correct_sequence_error`などで補正するときに、**変異候補の塩基配列のみを対象とする**ことで対応できる可能性がある

- 変異候補の塩基配列のみを対象とする
    - 両端が欠失しているようなリードについて、変異候補部位を含まないリードは`uncategorized`といったカテゴリにできる
    - よって**変異候補部位を含むか含まないか**を考えることで、短いリードや両端が欠失しているリードの分類が可能になる？