# 今回の取り組み

- `correct_sequence_error`の内部処理を可能な限りGeneratorに変更します

## いつものセットアップ

In [1]:
# ルートディレクトリをPathに含めるおまじない
import sys, os
from pathlib import Path
if Path(os.getcwd()).stem != "DAJIN2":
    parent_path = str(Path(os.path.dirname(os.path.abspath("__file__"))).parent.parent)
    sys.path.append(parent_path)
    os.chdir(parent_path)

print(os.getcwd())
sys.path.append(os.getcwd() + "/" + "src")

/mnt/d/Research/DAJIN2


In [2]:
%%bash
pip uninstall -qy DAJIN2
# pipの更新
# pip install -q -U pip
# pip install -q -U -r requirements.txt

# 実験

In [3]:
from __future__ import annotations

import sys, os
from pathlib import Path

import hashlib
from collections import defaultdict
from pathlib import Path
from importlib import reload

from src.DAJIN2.core import preprocess, classification, clustering, consensus, report

reload(preprocess)
reload(classification)
reload(clustering)
reload(consensus)
reload(report)

#### # * Subset of Point mutation
#### # 50 or 10 or 01%
percent = "50"
SAMPLE, CONTROL, ALLELE, NAME, GENOME, DEBUG, THREADS = (
    f"misc/data/tyr_albino_{percent}%.fq.gz",
    "misc/data/tyr_control.fq.gz",
    "misc/data/tyr_control.fasta",
    "single-tyr50",
    "mm10",
    True,
    30,
)


######################################################################
# Preprocessing
######################################################################

print(f"processing {NAME}...")

SAMPLE = preprocess.format_inputs.convert_to_posix_path(SAMPLE)
CONTROL = preprocess.format_inputs.convert_to_posix_path(CONTROL)
ALLELE = preprocess.format_inputs.convert_to_posix_path(ALLELE)

# ====================================================================
# Varidate inputs
# ====================================================================

preprocess.validate_inputs.check_files(SAMPLE, CONTROL, ALLELE)
TEMPDIR = Path("DAJINResults", ".tempdir", NAME)
IS_CACHE_CONTROL = preprocess.validate_inputs.exists_cached_control(CONTROL, TEMPDIR)
IS_CACHE_GENOME = preprocess.validate_inputs.exists_cached_genome(GENOME, TEMPDIR, IS_CACHE_CONTROL)
UCSC_URL, GOLDENPATH_URL = None, None
if GENOME and not IS_CACHE_GENOME:
    UCSC_URL, GOLDENPATH_URL = preprocess.validate_inputs.check_and_fetch_genome(GENOME)

# ====================================================================
# Format inputs
# ====================================================================
SAMPLE_NAME = preprocess.format_inputs.extract_basename(SAMPLE)
CONTROL_NAME = preprocess.format_inputs.extract_basename(CONTROL)
FASTA_ALLELES = preprocess.format_inputs.dictionize_allele(ALLELE)
THREADS = preprocess.format_inputs.update_threads(THREADS)

preprocess.format_inputs.make_directories(TEMPDIR, SAMPLE_NAME, CONTROL_NAME)

if GENOME:
    GENOME_COODINATES = preprocess.format_inputs.fetch_coodinate(GENOME, UCSC_URL, FASTA_ALLELES["control"])
    CHROME_SIZE = preprocess.format_inputs.fetch_chrom_size(GENOME_COODINATES["chr"], GENOME, GOLDENPATH_URL)
    preprocess.format_inputs.cache_coodinates_and_chromsize(TEMPDIR, GENOME, GENOME_COODINATES, CHROME_SIZE)


processing single-tyr50...


In [4]:
import pickle
with open(Path(TEMPDIR, "midsv", f"{CONTROL_NAME}.plk"), 'rb') as p:
        midsv_control_alleles = pickle.load(p)

In [7]:
midsv_sample_alleles = preprocess.call_midsv(TEMPDIR, FASTA_ALLELES, SAMPLE_NAME)

In [9]:
print(len(midsv_sample_alleles["control"]))
print(len(midsv_control_alleles["control"]))

5000
10000


In [10]:
MUTATION_LOCI_ALLELES = preprocess.extract_mutation_loci(midsv_sample_alleles, midsv_control_alleles)

In [7]:
from __future__ import annotations

import random
import re
from collections import defaultdict, Counter
from typing import Generator


def _replace_errors_to_atmark(midsv_sample: dict, mutation_loci: dict[str, set[int]]) -> Generator[str]:
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        cssplits_replaced = []
        for i, cs in enumerate(cssplits):
            if cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                cssplits_replaced.append(cs)
                continue
            if i in mutation_loci[cs[0]]:
                cssplits_replaced.append(cs)
            else:
                cssplits_replaced.append("@")
            cssplits_replaced = ",".join(cssplits_replaced)
        yield cssplits_replaced


def _create_replace_dict(cssplits_replaced: Generator[list[str]]) -> dict[str, dict[str, int]]:
    cssplits_atmark = defaultdict(set)
    cssplits_sampling = defaultdict(lambda: defaultdict(lambda: Counter(["N"])))
    for cssplits in cssplits_replaced:
        for seq_idx, cs in enumerate(cssplits):
            if seq_idx == 0 or seq_idx == len(cssplits) - 1:
                continue
            kmer = ",".join([cssplits[seq_idx - 1], cssplits[seq_idx + 1]])
            if cs == "@":
                cssplits_atmark[seq_idx].add(kmer)
            else:
                cssplits_sampling[seq_idx][kmer] += Counter([cs])
    return cssplits_atmark, cssplits_sampling



def _replace_atmark(cssplits: Generator[str], sequence: str) -> list[str]:
    random.seed(1)
    cssplits_replaced = []
    for i in range(1, len(sequence) - 1):
        cssplits_atmark = defaultdict(str)
        cssplits_sampling_key = defaultdict(list)
        cssplits_sampling_all = []
        flag_all_atmark = True
        for idx, cssplit in enumerate(cssplits):
            key = ",".join([cssplit[i - 1], cssplit[i + 1]])
            if cssplit[i] == "@":
                cssplits_atmark[idx] = key
            else:
                cssplits_sampling_key[key].append(cssplit[i])
                cssplits_sampling_all.append(cssplit[i])
                flag_all_atmark = False
        for idx, key in cssplits_atmark.items():
            if flag_all_atmark:
                cssplits_replaced[idx][i] = "N"
            elif cssplits_sampling_key[key]:
                cssplits_replaced[idx][i] = random.choice(cssplits_sampling_key[key])
            else:
                cssplits_replaced[idx][i] = random.choice(cssplits_sampling_all)
    for cs in cssplits_replaced:
        if cs[0] == "@":
            cs[0] = "N"
        if cs[-1] == "@":
            cs[-1] = "N"
    return cssplits_replaced

###############################################################################
# main
###############################################################################


def correct_sequence_error(midsv_sample_alleles, midsv_control_alleles, FASTA_ALLELES, MUTATION_LOCI_ALLELES) -> None:
    midsv_alleles_corrected = defaultdict(dict)
    for allele, sequence in FASTA_ALLELES.items():
        midsv_sample = midsv_sample_alleles[allele]
        midsv_control = midsv_control_alleles[allele]
        cssplits_sample = (cs["CSSPLIT"].split(",") for cs in midsv_sample)
        cssplits_control = (cs["CSSPLIT"].split(",") for cs in midsv_control)
        # Extract mutation loci
        mutation_loci = MUTATION_LOCI_ALLELES[allele]
        # Correct sequence errors
        cssplits_sample_atmark = _replace_errors_to_atmark(cssplits_sample, mutation_loci)
        cssplits_control_atmark = _replace_errors_to_atmark(cssplits_control, mutation_loci)
        cssplits_sample_atmark_replaced = _replace_atmark(cssplits_sample_atmark, sequence)
        cssplits_control_atmark_replaced = _replace_atmark(cssplits_control_atmark, sequence)
        # Replace CSSPLIT
        cssplits_sample_corrected = (",".join(cs) for cs in cssplits_sample_atmark_replaced)
        cssplits_control_corrected = (",".join(cs) for cs in cssplits_control_atmark_replaced)
        for i, cssplits in enumerate(cssplits_sample_corrected):
            midsv_sample[i]["CSSPLIT"] = cssplits
        for i, cssplits in enumerate(cssplits_control_corrected):
            midsv_control[i]["CSSPLIT"] = cssplits
        midsv_alleles_corrected["sample"][allele] = midsv_sample
        midsv_alleles_corrected["control"][allele] = midsv_control
    return midsv_alleles_corrected


In [8]:
allele = "control"
sequence = FASTA_ALLELES[allele]

In [9]:
midsv_sample = midsv_sample_alleles[allele]
midsv_control = midsv_control_alleles[allele]
cssplits_sample = (cs["CSSPLIT"].split(",") for cs in midsv_sample)
cssplits_control = (cs["CSSPLIT"].split(",") for cs in midsv_control)
# Extract mutation loci
mutation_loci = MUTATION_LOCI_ALLELES[allele]

In [12]:
# Correct sequence errors
cssplits_sample_atmark = _replace_errors_to_atmark(cssplits_sample, mutation_loci)
cssplits_control_atmark = _replace_errors_to_atmark(cssplits_control, mutation_loci)

In [13]:
print(next(cssplits_sample_atmark))

TypeError: list indices must be integers or slices, not str

In [38]:
def _replace_errors_to_atmark(midsv_sample: dict, mutation_loci: dict[str, set[int]]) -> Generator[str]:
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        cssplits_replaced = []
        for i, cs in enumerate(cssplits):
            # print(cs, cs[0])
            if cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                cssplits_replaced.append(cs)
                continue
            if cs[0] in mutation_loci[i]:
                cssplits_replaced.append(cs)
            else:
                cssplits_replaced.append("@")
        yield ",".join(cssplits_replaced)

In [39]:
# Correct sequence errors
cssplits_sample_atmark = _replace_errors_to_atmark(midsv_sample, mutation_loci)
cssplits_control_atmark = _replace_errors_to_atmark(midsv_control, mutation_loci)

In [40]:
print(next(cssplits_sample_atmark))
print(next(cssplits_sample_atmark))
print(next(cssplits_sample_atmark))

=T,=G,=C,=A,=T,=T,=G,=A,=A,=G,=C,=A,=G,=T,=T,=C,=A,=C,=C,@,=A,=A,=A,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=C,@,=A,=A,=G,=T,=A,=A,=G,=A,=T,=A,=T,=C,=T,=T,=T,=G,=G,=A,=A,=T,=A,=A,=T,=C,=A,=A,=T,=T,=C,=A,=A,=G,=A,=T,=A,=A,=T,=C,=A,=A,=G,=G,=A,=A,=A,=A,=A,=T,=G,=A,=G,=A,@,=G,=C,=A,=A,=C,=T,=A,=T,=T,=T,=T,=A,=G,=A,=C,=T,=G,=A,=T,=T,=A,=C,=T,=T,=T,=T,=A,=T,=A,=A,=A,=A,=T,=A,=A,=A,=T,=A,=A,=G,=C,=T,=C,=A,=G,=C,=T,=T,=A,=G,=C,=C,@,@,=A,=T,=A,=T,=A,=A,=G,=C,=A,=A,=T,=A,=T,=T,=C,=T,=G,=A,=G,=T,=T,=C,=T,=G,=A,=A,=G,=A,=A,=A,=A,=A,=T,=T,=T,=T,=T,=G,=A,=C,=A,=A,=A,=A,=T,=G,=A,=G,=T,=T,=C,=T,=A,=T,=A,=A,=A,=T,=G,=T,=T,=A,=T,=T,=G,=T,=C,=T,=A,=C,=T,=T,=A,=T,=G,=A,=T,=C,=T,=C,=T,=A,=A,=A,=T,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=T,=G,=T,=A,=T,=T,=C,=A,=G,=A,=A,=T,=C,=T,=A,=G,=A,=T,=G,=T,=T,=T,=C,=A,=T,=G,=A,=C,=C,=T,=T,=T,=A,=T,=T,=C,=A,=T,=A,=A,=G,=A,=G,=A,=T,=G,@,=T,=G,=T,=A,=T,=T,=C,=T,=T,=G,=A,=T,=A,=C,=T,=A,=C,=T,=T,=C,=T,=C,=A,@,=T,=T,@,=C,=A,=A,=A,=T,=T,=C,=C,=A,=A,=T,=T,=A,=T,=T,=A,=T,=T,=A,=A,=T,=T,=T,=C,

In [45]:
print(len(mutation_loci))

2845


- `_replace_errors_to_atmark`は不要では？

In [57]:
def _create_replace_dict(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> dict[str, dict[str, int]]:
    cssplits_error_kmer = defaultdict(set)
    cssplits_sampling = defaultdict(lambda: defaultdict(lambda: Counter(["N"])))
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        cssplits = list(cssplits)
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == 0 or idx_seq == len(cssplits) - 1:
                continue
            kmer = ",".join([cssplits[idx_seq - 1], cs, cssplits[idx_seq + 1]])
            if not (cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs)):
                cssplits_error_kmer[idx_seq].add(kmer)
            else:
                cssplits_sampling[idx_seq][kmer] += Counter([cs])
    return cssplits_error_kmer, cssplits_sampling


In [58]:
x, y = _create_replace_dict(midsv_sample[:100], mutation_loci)

In [60]:
print(len(x))
print(len(y))
print(x.keys())
print(y.keys())

2463
2843
dict_keys([19, 35, 88, 141, 142, 284, 308, 311, 340, 370, 371, 400, 405, 478, 526, 736, 960, 1001, 1006, 1007, 1054, 1159, 1161, 1266, 1269, 1270, 1271, 1290, 1291, 1292, 1382, 1383, 1418, 1434, 1435, 1508, 1509, 1511, 1513, 1577, 1659, 1662, 1667, 1749, 1826, 1873, 2067, 2200, 2216, 2238, 2250, 2257, 2379, 2430, 2433, 2527, 2721, 2722, 2734, 2814, 9, 11, 13, 15, 16, 109, 111, 112, 113, 114, 137, 318, 365, 401, 402, 454, 460, 462, 473, 477, 479, 481, 483, 520, 582, 622, 655, 731, 763, 764, 765, 786, 787, 808, 809, 814, 815, 847, 886, 920, 924, 925, 937, 938, 939, 953, 1015, 1043, 1051, 1052, 1093, 1119, 1175, 1228, 1229, 1231, 1234, 1251, 1252, 1288, 1303, 1305, 1312, 1313, 1348, 1436, 1437, 1510, 1517, 1525, 1527, 1528, 1529, 1576, 1583, 1677, 1685, 1686, 1688, 1807, 1814, 1815, 1850, 1877, 1881, 1897, 1901, 2050, 2052, 2053, 2069, 2070, 2198, 2226, 2230, 2251, 2336, 2449, 2477, 2543, 2544, 2545, 2549, 2550, 2551, 2567, 2600, 2708, 2709, 2723, 2724, 2809, 2824, 2825, 17, 40,

In [63]:
print(x[19])
print(y[19])
print(mutation_loci[19])

{'*CT,*AG,=A', '=C,*AG,=A', '-C,-A,=A', '=C,-A,=A', '=C,-A,-A', '=C,+A|=A,=A', '=C,+G|=A,=A'}
defaultdict(<function _create_replace_dict.<locals>.<lambda>.<locals>.<lambda> at 0x7f44428fa4d0>, {'=C,=A,=A': Counter({'=A': 82, 'N': 1}), '+T|+T|+T|*CA,=A,=A': Counter({'N': 1, '=A': 1}), '+A|*CA,=A,=A': Counter({'N': 1, '=A': 1}), '*CA,=A,=A': Counter({'N': 1, '=A': 1}), 'N,N,=A': Counter({'N': 2}), 'N,=A,=A': Counter({'N': 1, '=A': 1}), '+G|=C,=A,=A': Counter({'N': 1, '=A': 1})})
set()


- y (cssplits_sampling)のkeyはいらない?

In [73]:
def _create_replace_dict(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> tuple(defaultdict, defaultdict[set]):
    cssplits_sampling = defaultdict(lambda: Counter(["N"]))
    cssplits_error_kmer = defaultdict(set)
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        cssplits = list(cssplits)
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == 0 or idx_seq == len(cssplits) - 1:
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                cssplits_sampling[idx_seq] += Counter([cs])
            else:
                kmer = ",".join([cssplits[idx_seq - 1], cs, cssplits[idx_seq + 1]])
                cssplits_error_kmer[idx_seq].add(kmer)
    return cssplits_sampling, cssplits_error_kmer


In [74]:
cssplits_sampling, cssplits_error_kmer = _create_replace_dict(midsv_sample[:100], mutation_loci)

2463
2843
dict_keys([19, 35, 88, 141, 142, 284, 308, 311, 340, 370, 371, 400, 405, 478, 526, 736, 960, 1001, 1006, 1007, 1054, 1159, 1161, 1266, 1269, 1270, 1271, 1290, 1291, 1292, 1382, 1383, 1418, 1434, 1435, 1508, 1509, 1511, 1513, 1577, 1659, 1662, 1667, 1749, 1826, 1873, 2067, 2200, 2216, 2238, 2250, 2257, 2379, 2430, 2433, 2527, 2721, 2722, 2734, 2814, 9, 11, 13, 15, 16, 109, 111, 112, 113, 114, 137, 318, 365, 401, 402, 454, 460, 462, 473, 477, 479, 481, 483, 520, 582, 622, 655, 731, 763, 764, 765, 786, 787, 808, 809, 814, 815, 847, 886, 920, 924, 925, 937, 938, 939, 953, 1015, 1043, 1051, 1052, 1093, 1119, 1175, 1228, 1229, 1231, 1234, 1251, 1252, 1288, 1303, 1305, 1312, 1313, 1348, 1436, 1437, 1510, 1517, 1525, 1527, 1528, 1529, 1576, 1583, 1677, 1685, 1686, 1688, 1807, 1814, 1815, 1850, 1877, 1881, 1897, 1901, 2050, 2052, 2053, 2069, 2070, 2198, 2226, 2230, 2251, 2336, 2449, 2477, 2543, 2544, 2545, 2549, 2550, 2551, 2567, 2600, 2708, 2709, 2723, 2724, 2809, 2824, 2825, 17, 40,

In [75]:
idx = 308
print(cssplits_sampling[idx])
print(cssplits_error_kmer[idx])
print(mutation_loci[idx])

Counter({'=T': 91, 'N': 1})
{'=A,+T|=T,=T', '-A,-T,=T', '=A,+T|+G|=T,=T', '=A,-T,=T', '=A,+T|+T|=T,=T'}
set()


In [76]:
print(midsv_sample[0])

{'QNAME': '00077750-d7ab-4c73-ac65-8707d39936c2', 'CSSPLIT': '=T,=G,=C,=A,=T,=T,=G,=A,=A,=G,=C,=A,=G,=T,=T,=C,=A,=C,=C,+G|=A,=A,=A,=A,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=C,+A|=A,=A,=A,=G,=T,=A,=A,=G,=A,=T,=A,=T,=C,=T,=T,=T,=G,=G,=A,=A,=T,=A,=A,=T,=C,=A,=A,=T,=T,=C,=A,=A,=G,=A,=T,=A,=A,=T,=C,=A,=A,=G,=G,=A,=A,=A,=A,=A,=T,=G,=A,=G,=A,-G,=G,=C,=A,=A,=C,=T,=A,=T,=T,=T,=T,=A,=G,=A,=C,=T,=G,=A,=T,=T,=A,=C,=T,=T,=T,=T,=A,=T,=A,=A,=A,=A,=T,=A,=A,=A,=T,=A,=A,=G,=C,=T,=C,=A,=G,=C,=T,=T,=A,=G,=C,=C,*AG,*GA,=A,=T,=A,=T,=A,=A,=G,=C,=A,=A,=T,=A,=T,=T,=C,=T,=G,=A,=G,=T,=T,=C,=T,=G,=A,=A,=G,=A,=A,=A,=A,=A,=T,=T,=T,=T,=T,=G,=A,=C,=A,=A,=A,=A,=T,=G,=A,=G,=T,=T,=C,=T,=A,=T,=A,=A,=A,=T,=G,=T,=T,=A,=T,=T,=G,=T,=C,=T,=A,=C,=T,=T,=A,=T,=G,=A,=T,=C,=T,=C,=T,=A,=A,=A,=T,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=T,=G,=T,=A,=T,=T,=C,=A,=G,=A,=A,=T,=C,=T,=A,=G,=A,=T,=G,=T,=T,=T,=C,=A,=T,=G,=A,=C,=C,=T,=T,=T,=A,=T,=T,=C,=A,=T,=A,=A,=G,=A,=G,=A,=T,=G,+A|+A|=A,=T,=G,=T,=A,=T,=T,=C,=T,=T,=G,=A,=T,=A,=C,=T,=A,=C,=T,=T,=C,=T,=C,=A,

- `_create_replace_dict`において、`cssplits = list(cssplits)`を無くします

In [77]:
def _create_replace_dict(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> tuple(defaultdict, defaultdict[set]):
    cssplits_sampling = defaultdict(lambda: Counter(["N"]))
    cssplits_error_kmer = defaultdict(set)
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                cssplits_sampling[idx_seq] += Counter([cs])
            else:
                kmer = ",".join([one_prior, two_prior, cs])
                cssplits_error_kmer[idx_seq].add(kmer)
            one_prior, two_prior = two_prior, cs
    return cssplits_sampling, cssplits_error_kmer


In [78]:
cssplits_sampling, cssplits_error_kmer = _create_replace_dict(midsv_sample[:100], mutation_loci)

In [79]:
idx = 308
print(cssplits_sampling[idx])
print(cssplits_error_kmer[idx])
print(mutation_loci[idx])

Counter({'=T': 91, 'N': 1})
{'=C,=A,+T|+G|=T', '=C,=A,+T|=T', '=C,=A,+T|+T|=T', '=C,-A,-T', '=C,=A,-T'}
set()


- やはり、`cssplits_sampling`にはkmerの情報をもたせるべき
- defaultdictで管理すべきか、リストにすべきかは後で検討します

In [158]:
def _create_replace_dict(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> tuple(defaultdict, defaultdict[set]):
    cssplits_sampling = defaultdict(lambda: defaultdict(lambda: Counter(["N"])))
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = f"{one_prior},{two_prior}"
                cssplits_sampling[idx_seq][prev_cs] += Counter([cs])
            one_prior, two_prior = two_prior, cs
    return cssplits_sampling

In [159]:
cssplits_sampling = _create_replace_dict(midsv_sample[:100], mutation_loci)
idx = 308
print(cssplits_sampling[idx])
print(len(cssplits_sampling))
print(mutation_loci[idx])


defaultdict(<function _create_replace_dict.<locals>.<lambda>.<locals>.<lambda> at 0x7f4356e876d0>, {'=C,=A': Counter({'=T': 90, 'N': 1}), '+C|=C,=A': Counter({'N': 1, '=T': 1})})
2842
set()


In [155]:
def _get_ignorable_nucreotide_index(midsv_sample: dict[list[str, str]], mutation_loci) -> set[int]:
    """ if all nucreotides are match or unknown, then ignore the index of the nucreotide
    """
    unignorable_nucreotide_index = set()
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for i, cs in enumerate(cssplits):
            if not (cs[0] in mutation_loci[i] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs)):
                unignorable_nucreotide_index.add(i)
    ignorable_nucreotide_index = set(range(i)) - unignorable_nucreotide_index
    return ignorable_nucreotide_index


In [157]:
x = _get_ignorable_nucreotide_index(midsv_sample, mutation_loci)
print(len(x))
print(x)
print(828 in x)
print(0 in mutation_loci)
print(mutation_loci[0])
print(mutation_loci[3])

6
{0, 1, 2, 2841, 2842, 2843}
False
False
set()
set()


In [151]:
count = defaultdict(int)
for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
    cs = cssplits[5]
    count[cs] += 1

print(count)

defaultdict(<class 'int'>, {'=T': 4833, 'N': 163, '*TG': 1, '*TC': 2, '+T|+A|=T': 1})


- 計算が重たくなりそうなので補正が必要ないインデックスがあれば嬉しいと思い、`_get_ignorable_nucreotide_index`を作りましたがignorableなインデックスは殆どないことがわかりましたのでお蔵入りとなりました😭

In [161]:
cssplits_sampling = _create_replace_dict(midsv_sample[:100], mutation_loci)
idx = 308
prev_cs = '=C,=A'
print(cssplits_sampling[idx][prev_cs])


Counter({'=T': 90, 'N': 1})


In [178]:
def _correct_errors(midsv_sample, mutation_loci, cssplits_sampling) -> Generator[tuple[str, str]]:
    random.seed(1)
    for samp in midsv_sample:
        qname = samp["QNAME"]
        cssplits = samp["CSSPLIT"].split(",")
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if not (cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs)):
                prev_cs = f"{one_prior},{two_prior}"
                sampling = cssplits_sampling[idx_seq][prev_cs]
                cssplits[idx_seq] = random.choices(*zip(*sampling.items()))[0]
            one_prior, two_prior = two_prior, cs
        cssplits_joined = ",".join(cssplits)
        yield (qname, cssplits_joined)



In [179]:
x = _correct_errors(midsv_sample, mutation_loci, cssplits_sampling)

In [180]:
print(next(x))

('00077750-d7ab-4c73-ac65-8707d39936c2', '=T,=G,=C,=A,=T,=T,=G,=A,=A,=G,=C,=A,=G,=T,=T,=C,=A,=C,=C,=A,=A,=A,=A,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=G,=A,=T,=A,=T,=C,=T,=T,=T,=G,=G,=A,=A,=T,=A,=A,=T,=C,=A,=A,=T,=T,=C,=A,=A,=G,=A,=T,=A,=A,=T,=C,=A,=A,=G,=G,=A,=A,=A,=A,=A,=T,=G,=A,=G,=A,=G,=G,=C,=A,=A,=C,=T,=A,=T,=T,=T,=T,=A,=G,=A,=C,=T,=G,=A,=T,=T,=A,=C,=T,=T,=T,=T,=A,=T,=A,=A,=A,=A,=T,=A,=A,=A,=T,=A,=A,=G,=C,=T,=C,=A,=G,=C,=T,=T,=A,=G,=C,=C,=A,N,=A,=T,=A,=T,=A,=A,=G,=C,=A,=A,=T,=A,=T,=T,=C,=T,=G,=A,=G,=T,=T,=C,=T,=G,=A,=A,=G,=A,=A,=A,=A,=A,=T,=T,=T,=T,=T,=G,=A,=C,=A,=A,=A,=A,=T,=G,=A,=G,=T,=T,=C,=T,=A,=T,=A,=A,=A,=T,=G,=T,=T,=A,=T,=T,=G,=T,=C,=T,=A,=C,=T,=T,=A,=T,=G,=A,=T,=C,=T,=C,=T,=A,=A,=A,=T,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=T,=G,=T,=A,=T,=T,=C,=A,=G,=A,=A,=T,=C,=T,=A,=G,=A,=T,=G,=T,=T,=T,=C,=A,=T,=G,=A,=C,=C,=T,=T,=T,=A,=T,=T,=C,=A,=T,=A,=A,=G,=A,=G,=A,=T,=G,=A,=T,=G,=T,=A,=T,=T,=C,=T,=T,=G,=A,=T,=A,=C,=T,=A,=C,=T,=T,=C,=T,=C,=A,=T,=T,=T,=G,=C,=A,=A,=A,=T,=T,=C,=C

In [181]:

def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> defaultdict:
    cssplits_sampling = defaultdict(lambda: defaultdict(lambda: Counter(["N"])))
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = f"{one_prior},{two_prior}"
                cssplits_sampling[idx_seq][prev_cs] += Counter([cs])
            one_prior, two_prior = two_prior, cs
    return cssplits_sampling

def _correct_errors(midsv_sample, mutation_loci, cssplits_sampling) -> Generator[tuple[str, str]]:
    random.seed(1)
    for samp in midsv_sample:
        qname = samp["QNAME"]
        cssplits = samp["CSSPLIT"].split(",")
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if not (cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs)):
                prev_cs = f"{one_prior},{two_prior}"
                sampling = cssplits_sampling[idx_seq][prev_cs]
                cssplits[idx_seq] = random.choices(*zip(*sampling.items()))[0]
            one_prior, two_prior = two_prior, cs
        cssplits_joined = ",".join(cssplits)
        yield (qname, cssplits_joined)

In [185]:
midsv_sample = midsv_sample_alleles[allele]
midsv_control = midsv_control_alleles[allele]
# Extract mutation loci
mutation_loci = MUTATION_LOCI_ALLELES[allele]
# # Correct sequence errors
sampling_sample =_sampling_cssplits(midsv_sample, mutation_loci)
sampling_control =_sampling_cssplits(midsv_control, mutation_loci)
midsv_corected_sample = _correct_errors(midsv_sample, mutation_loci, sampling_sample)
midsv_corected_control = _correct_errors(midsv_control, mutation_loci, sampling_control)


In [186]:
print(midsv_sample[0])
print(next(midsv_corected_sample))

{'QNAME': '00077750-d7ab-4c73-ac65-8707d39936c2', 'CSSPLIT': '=T,=G,=C,=A,=T,=T,=G,=A,=A,=G,=C,=A,=G,=T,=T,=C,=A,=C,=C,+G|=A,=A,=A,=A,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=C,+A|=A,=A,=A,=G,=T,=A,=A,=G,=A,=T,=A,=T,=C,=T,=T,=T,=G,=G,=A,=A,=T,=A,=A,=T,=C,=A,=A,=T,=T,=C,=A,=A,=G,=A,=T,=A,=A,=T,=C,=A,=A,=G,=G,=A,=A,=A,=A,=A,=T,=G,=A,=G,=A,-G,=G,=C,=A,=A,=C,=T,=A,=T,=T,=T,=T,=A,=G,=A,=C,=T,=G,=A,=T,=T,=A,=C,=T,=T,=T,=T,=A,=T,=A,=A,=A,=A,=T,=A,=A,=A,=T,=A,=A,=G,=C,=T,=C,=A,=G,=C,=T,=T,=A,=G,=C,=C,*AG,*GA,=A,=T,=A,=T,=A,=A,=G,=C,=A,=A,=T,=A,=T,=T,=C,=T,=G,=A,=G,=T,=T,=C,=T,=G,=A,=A,=G,=A,=A,=A,=A,=A,=T,=T,=T,=T,=T,=G,=A,=C,=A,=A,=A,=A,=T,=G,=A,=G,=T,=T,=C,=T,=A,=T,=A,=A,=A,=T,=G,=T,=T,=A,=T,=T,=G,=T,=C,=T,=A,=C,=T,=T,=A,=T,=G,=A,=T,=C,=T,=C,=T,=A,=A,=A,=T,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=T,=G,=T,=A,=T,=T,=C,=A,=G,=A,=A,=T,=C,=T,=A,=G,=A,=T,=G,=T,=T,=T,=C,=A,=T,=G,=A,=C,=C,=T,=T,=T,=A,=T,=T,=C,=A,=T,=A,=A,=G,=A,=G,=A,=T,=G,+A|+A|=A,=T,=G,=T,=A,=T,=T,=C,=T,=T,=G,=A,=T,=A,=C,=T,=A,=C,=T,=T,=C,=T,=C,=A,

- 動いていそうです
- 一方で速度がいまいちな気がするので、line_profilerを使います

In [187]:
! pip install line_profiler
%load_ext line_profiler

Collecting line_profiler
  Downloading line_profiler-4.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.9/661.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: line_profiler
Successfully installed line_profiler-4.0.3


In [190]:
%lprun -f _sampling_cssplits _sampling_cssplits(midsv_sample[:500], mutation_loci)

Timer unit: 1e-09 s

Total time: 5.46833 s
File: /tmp/ipykernel_23197/3606121126.py
Function: _sampling_cssplits at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> defaultdict:
     2         1       1333.0   1333.0      0.0      cssplits_sampling = defaultdict(lambda: defaultdict(lambda: Counter(["N"])))
     3       500   59913987.0 119828.0      1.1      for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
     4   1422500  195775305.0    137.6      3.6          for idx_seq, cs in enumerate(cssplits):
     5   1422000  293101734.0    206.1      5.4              if idx_seq == len(mutation_loci) - 1:
     6       500     237232.0    474.5      0.0                  break
     7   1421500  158459588.0    111.5      2.9              if idx_seq == 0:
     8       500      55597.0    111.2      0.0           

In [193]:
%lprun -f _correct_errors list(_correct_errors(midsv_sample[:500], mutation_loci, sampling_sample))

Timer unit: 1e-09 s

Total time: 1.4795 s
File: /tmp/ipykernel_23197/3606121126.py
Function: _correct_errors at line 19

Line #      Hits         Time  Per Hit   % Time  Line Contents
    19                                           def _correct_errors(midsv_sample, mutation_loci, cssplits_sampling) -> Generator[tuple[str, str]]:
    20         1      21459.0  21459.0      0.0      random.seed(1)
    21       500     206437.0    412.9      0.0      for samp in midsv_sample:
    22       500    1493203.0   2986.4      0.1          qname = samp["QNAME"]
    23       500   33685745.0  67371.5      2.3          cssplits = samp["CSSPLIT"].split(",")
    24   1422500  172971559.0    121.6     11.7          for idx_seq, cs in enumerate(cssplits):
    25   1422000  248887782.0    175.0     16.8              if idx_seq == len(mutation_loci) - 1:
    26       500     171747.0    343.5      0.0                  break
    27   1421500  163220134.0    114.8     11.0              if idx_seq == 0:
  

- やはり`cssplits_sampling[idx_seq][prev_cs] += Counter([cs])`の計算にものすごい時間がかかっています…
    - リストにしてみます

In [236]:
def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> Generator:
    sampling = defaultdict(lambda: defaultdict(list))
    # sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = one_prior + "," + two_prior
                sampling[idx_seq][prev_cs].append(cs)
            one_prior, two_prior = two_prior, cs
    return sampling


In [237]:
%lprun -f _sampling_cssplits _sampling_cssplits(midsv_sample[:500], mutation_loci)

Timer unit: 1e-09 s

Total time: 2.00056 s
File: /tmp/ipykernel_23197/3007809539.py
Function: _sampling_cssplits at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> Generator:
     2         1       1814.0   1814.0      0.0      sampling = defaultdict(lambda: defaultdict(list))
     3                                               # sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
     4       500   28094135.0  56188.3      1.4      for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
     5   1422500  178867583.0    125.7      8.9          for idx_seq, cs in enumerate(cssplits):
     6   1422000  263306696.0    185.2     13.2              if idx_seq == len(mutation_loci) - 1:
     7       500     268810.0    537.6      0.0                  break
     8   1421500  162763560.0    114.5      

In [234]:
def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> Generator:
    # sampling = defaultdict(lambda: defaultdict(list))
    sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = one_prior + "," + two_prior
                sampling[idx_seq][prev_cs].append(cs)
            one_prior, two_prior = two_prior, cs
    return sampling


defaultdict(<class 'list'>, {'=C,=A': ['=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '=T', '

In [238]:
%lprun -f _sampling_cssplits _sampling_cssplits(midsv_sample[:500], mutation_loci)

Timer unit: 1e-09 s

Total time: 1.99297 s
File: /tmp/ipykernel_23197/3007809539.py
Function: _sampling_cssplits at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> Generator:
     2         1       1303.0   1303.0      0.0      sampling = defaultdict(lambda: defaultdict(list))
     3                                               # sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
     4       500   30604828.0  61209.7      1.5      for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
     5   1422500  182922540.0    128.6      9.2          for idx_seq, cs in enumerate(cssplits):
     6   1422000  265974481.0    187.0     13.3              if idx_seq == len(mutation_loci) - 1:
     7       500     274578.0    549.2      0.0                  break
     8   1421500  165836400.0    116.7      

In [268]:
def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> Generator[dict[Counter]]:
    sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = one_prior + "," + two_prior
                sampling[idx_seq][prev_cs].append(cs)
            one_prior, two_prior = two_prior, cs
    for samp in sampling:
        samp_counter = dict()
        for key, val in samp.items():
            val = Counter(val)
            samp_counter.update({key: val})
        yield samp_counter


In [263]:
sampling = _sampling_cssplits(midsv_sample[:500], mutation_loci)

In [264]:
print(next(sampling))
print(next(sampling))
print(next(sampling))
print(next(sampling))

{}
{}
{'=T,=G': Counter({'=C': 428}), 'N,N': Counter({'N': 53, '=C': 13}), 'N,=G': Counter({'=C': 6})}
{'=G,=C': Counter({'=A': 433}), 'N,N': Counter({'N': 48, '=A': 5}), 'N,=C': Counter({'=A': 13})}


In [252]:
sampling_replaced = sampling.copy()

- このままだと毎回`random.choices`を行うことになり、もったいない？
- おそらく各indexの各変異の数をカウントして、一括してramdom.choicesを作ったほうが良さそう？
- iter()で保存して、next()で出力すれば良い

- ↑に取り組む前にまずは今までの改変が動くかどうかを観察します


In [16]:
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [20]:
%%file tmp_memtest.py

import random
import re
from collections import defaultdict, Counter
from typing import Generator

def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> list[dict[Counter]]:
    sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = one_prior + "," + two_prior
                sampling[idx_seq][prev_cs].append(cs)
            one_prior, two_prior = two_prior, cs
    sampling_cssplits = []
    for samp in sampling:
        samp_counter = dict()
        for key, val in samp.items():
            val = Counter(val)
            samp_counter.update({key: val})
        sampling_cssplits.append(samp_counter)
    return sampling_cssplits

Overwriting tmp_memtest.py


In [21]:
allele = "control"
midsv_sample = midsv_sample_alleles[allele]
midsv_control = midsv_control_alleles[allele]
# Extract mutation loci
mutation_loci = MUTATION_LOCI_ALLELES[allele]


In [23]:

def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> list[dict[Counter]]:
    sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
    for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
        for idx_seq, cs in enumerate(cssplits):
            if idx_seq == len(mutation_loci) - 1:
                break
            if idx_seq == 0:
                one_prior = cs
                continue
            if idx_seq == 1:
                two_prior = cs
                continue
            if cs[0] in mutation_loci[idx_seq] or cs.startswith("=") or cs == "N" or re.search(r"a|c|g|t|n", cs):
                prev_cs = one_prior + "," + two_prior
                sampling[idx_seq][prev_cs].append(cs)
            one_prior, two_prior = two_prior, cs
    sampling_cssplits = []
    for samp in sampling:
        samp_counter = dict()
        for key, val in samp.items():
            val = Counter(val)
            samp_counter.update({key: val})
        sampling_cssplits.append(samp_counter)
    return sampling_cssplits

x = _sampling_cssplits(midsv_sample[:100], mutation_loci)

In [24]:
from tmp_memtest import _sampling_cssplits
%mprun -f _sampling_cssplits _sampling_cssplits(midsv_sample[:100], mutation_loci)




Filename: /mnt/d/Research/DAJIN2/tmp_memtest.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     6    606.8 MiB    606.8 MiB           1   def _sampling_cssplits(midsv_sample: dict[list[str, str]], mutation_loci: dict[str, set[int]]) -> list[dict[Counter]]:
     7    606.8 MiB      0.0 MiB        2848       sampling = [defaultdict(list) for _ in range(len(mutation_loci))]
     8    608.0 MiB      0.0 MiB         303       for cssplits in (cs["CSSPLIT"].split(",") for cs in midsv_sample):
     9    608.0 MiB      0.0 MiB      284500           for idx_seq, cs in enumerate(cssplits):
    10    608.0 MiB      0.0 MiB      284500               if idx_seq == len(mutation_loci) - 1:
    11    608.0 MiB      0.0 MiB         100                   break
    12    608.0 MiB      0.0 MiB      284400               if idx_seq == 0:
    13    608.0 MiB      0.0 MiB         100                   one_prior = cs
    14    608.0 MiB      0.0 MiB         100                   continue
 

In [25]:
!mprof plot

No input file found. 
This program looks for mprofile_*.dat files, generated by the 'mprof run' command.


In [None]:
# # Correct sequence errors
sampling_sample =_sampling_cssplits(midsv_sample, mutation_loci)
sampling_control =_sampling_cssplits(midsv_control, mutation_loci)
midsv_corected_sample = _correct_errors(midsv_sample, mutation_loci, sampling_sample)
midsv_corected_control = _correct_errors(midsv_control, mutation_loci, sampling_control)

In [275]:
print(midsv_sample[3])

{'QNAME': '00328905-1c46-4f17-8816-7881d8d44bb3', 'CSSPLIT': '=T,=G,=C,=A,=T,=T,=G,=A,=A,=G,=C,=A,=G,=T,=T,=C,=A,=C,=C,=A,=A,=A,=A,=T,=A,=A,=C,*AG,=A,=A,=G,=T,=A,=A,=C,-A,=A,=A,=G,=T,=A,=A,=G,=A,=T,=A,=T,=C,=T,=T,=T,=G,=G,=A,=A,=T,=A,*AG,=T,-C,=A,=A,=T,=T,=C,=A,=A,=G,=A,=T,=A,=A,=T,=C,=A,=A,=G,=G,=A,=A,=A,=A,=A,=T,=G,=A,=G,=A,=G,=G,=C,=A,=A,=C,=T,=A,=T,=T,=T,=T,=A,=G,=A,=C,=T,=G,=A,=T,=T,=A,=C,=T,=T,=T,=T,=A,=T,=A,=A,=A,=A,=T,=A,=A,=A,=T,=A,=A,=G,=C,=T,=C,=A,=G,=C,=T,=T,=A,=G,=C,=C,=A,=G,=A,=T,=A,=T,=A,=A,=G,=C,=A,=A,=T,=A,=T,=T,=C,=T,=G,=A,=G,=T,=T,=C,=T,=G,=A,=A,=G,=A,=A,=A,=A,=A,=T,=T,=T,=T,=T,=G,=A,=C,-A,=A,=A,=A,=T,=G,=A,=G,=T,=T,=C,=T,=A,=T,=A,=A,=A,=T,=G,=T,=T,=A,=T,=T,=G,=T,=C,=T,=A,=C,=T,=T,=A,=T,=G,=A,=T,=C,=T,=C,=T,=A,=A,=A,=T,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=T,=G,=T,=A,=T,=T,=C,=A,=G,=A,=A,=T,=C,=T,=A,=G,=A,=T,=G,=T,=T,=T,=C,=A,=T,=G,=A,=C,=C,=T,=T,=T,=A,=T,=T,=C,=A,=T,=A,=A,=G,=A,=G,=A,=T,=G,=A,=T,=G,=T,=A,=T,=T,=C,=T,=T,=G,=A,=T,=A,=C,=T,=A,=C,=T,=T,=C,=T,=C,=A,=T,=T,=T,=G,

In [276]:
print(next(midsv_corected_sample))
print(next(midsv_corected_sample))
print(next(midsv_corected_sample))
print(next(midsv_corected_sample))

{'QNAME': '00328905-1c46-4f17-8816-7881d8d44bb3', 'CSSPLIT': '=T,=G,=C,=A,=T,=T,=G,=A,=A,=G,=C,=A,=G,=T,=T,=C,=A,=C,=C,=A,=A,=A,=A,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=C,=A,=A,=A,=G,=T,=A,=A,=G,=A,=T,=A,=T,=C,=T,=T,=T,=G,=G,=A,=A,=T,=A,=A,=T,=C,=A,=A,=T,=T,=C,=A,=A,=G,=A,=T,=A,=A,=T,=C,=A,=A,=G,=G,=A,=A,=A,=A,=A,=T,=G,=A,=G,=A,=G,=G,=C,=A,=A,=C,=T,=A,=T,=T,=T,=T,=A,=G,=A,=C,=T,=G,=A,=T,=T,=A,=C,=T,=T,=T,=T,=A,=T,=A,=A,=A,=A,=T,=A,=A,=A,=T,=A,=A,=G,=C,=T,=C,=A,=G,=C,=T,=T,=A,=G,=C,=C,=A,=G,=A,=T,=A,=T,=A,=A,=G,=C,=A,=A,=T,=A,=T,=T,=C,=T,=G,=A,=G,=T,=T,=C,=T,=G,=A,=A,=G,=A,=A,=A,=A,=A,=T,=T,=T,=T,=T,=G,=A,=C,=A,=A,=A,=A,=T,=G,=A,=G,=T,=T,=C,=T,=A,=T,=A,=A,=A,=T,=G,=T,=T,=A,=T,=T,=G,=T,=C,=T,=A,=C,=T,=T,=A,=T,=G,=A,=T,=C,=T,=C,=T,=A,=A,=A,=T,=A,=C,=A,=A,=C,=A,=G,=G,=C,=T,=T,=G,=T,=A,=T,=T,=C,=A,=G,=A,=A,=T,=C,=T,=A,=G,=A,=T,=G,=T,=T,=T,=C,=A,=T,=G,=A,=C,=C,=T,=T,=T,=A,=T,=T,=C,=A,=T,=A,=A,=G,=A,=G,=A,=T,=G,=A,=T,=G,=T,=A,=T,=T,=C,=T,=T,=G,=A,=T,=A,=C,=T,=A,=C,=T,=T,=C,=T,=C,=A,=T,=T,=T,=G,=C

# 👉👉👉 いまここ 👈👈👈

# 👌👌👌 まとめ 👌👌👌


- `_transpose_mutation_loci`のバグを修正しました
    - `loci.add({})`が`TypeError: unhashable type: 'dict'`を引き起こしていました
    - こちらの一行は必要がなかったので削除しました
- Generatorの変更は特に行っていませんが、いまのところメモリ的には問題なさそうな気がします…

# 次に取り組むこと

- できるかぎりGeneratorで返すようにします
    - ✅ `calc_midsv`
    - ✅ `extract_mutation_loci`
    - ⬜ `correct_sequence_error`
    - ⬜ `classify`
    - ⬜ `clustaring`
    - ⬜ `consensus`
    - ⬜ `report`

### Lists

+ GUIの見栄え
+ igv.jsの起動
+ VCFによる長鎖挿入・欠失情報の付与
+ Figの作成
+ ⬜ Insertionのなかにある変異を同定する手法を考案する
+ ⬜ Ayabe-taks1のright_loxpがいまいちな理由を考察する
+ ✅ 断端リードの扱いをどうするべきか
+ ✅ `SV`の判定をconsensus callのあとにする
+ ✅ Tyrの動作確認
+ ✅ ayabe-task1のleft/right-loxpの検出
+ ✅ mutation_lociをpreprocessで使用したものに変更する
> + ⬜ `preprocess.correct_sequence_error.replace_atmark`のコードがわかりにくい
    + テストを用意してリファクタリングする